From 773a7002131393ac22c6e7a3720e374f9e6016ca Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 27 Jan 2026 03:21:45 +0000
Subject: [PATCH 1/3] tcp: mark tcp_process_tlp_ack() as unlikely

It is unlikely we have to call tcp_process_tlp_ack().

Signed-off-by: Eric Dumazet
Reviewed-by: Kuniyuki Iwashima
Link: https://patch.msgid.link/20260127032147.3498272-2-edumazet@google.com
Signed-off-by: Jakub Kicinski
---
 net/ipv4/tcp_input.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9e91ddbc6253..f2fafba95705 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4283,7 +4283,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 	tcp_in_ack_event(sk, flag);
 
-	if (tp->tlp_high_seq)
+	if (unlikely(tp->tlp_high_seq))
 		tcp_process_tlp_ack(sk, ack, flag);
 
 	if (tcp_ack_is_dubious(sk, flag)) {
@@ -4333,7 +4333,7 @@ no_queue:
 	 */
 	tcp_ack_probe(sk);
 
-	if (tp->tlp_high_seq)
+	if (unlikely(tp->tlp_high_seq))
 		tcp_process_tlp_ack(sk, ack, flag);
 
 	return 1;

From 629a68865abb40c120e3f6498b26a35c40590ea0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 27 Jan 2026 03:21:46 +0000
Subject: [PATCH 2/3] tcp: move tcp_rack_update_reo_wnd() to tcp_input.c

tcp_rack_update_reo_wnd() is called only once, from tcp_ack().

Move it to tcp_input.c so that it can be inlined by the compiler
to save space and cpu cycles.

$ scripts/bloat-o-meter -t vmlinux.old vmlinux.new
add/remove: 0/2 grow/shrink: 1/0 up/down: 110/-153 (-43)
Function                                     old     new   delta
tcp_ack                                     5631    5741    +110
__pfx_tcp_rack_update_reo_wnd                 16       -     -16
tcp_rack_update_reo_wnd                      137       -    -137
Total: Before=22572723, After=22572680, chg -0.00%

Signed-off-by: Eric Dumazet
Reviewed-by: Kuniyuki Iwashima
Link: https://patch.msgid.link/20260127032147.3498272-3-edumazet@google.com
Signed-off-by: Jakub Kicinski
---
 include/net/tcp.h       |  1 -
 net/ipv4/tcp_input.c    | 43 +++++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_recovery.c | 43 -----------------------------------------
 3 files changed, 43 insertions(+), 44 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index efff433de9a4..a4d9f263ea68 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2517,7 +2517,6 @@ extern bool tcp_rack_mark_lost(struct sock *sk);
 extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
 			     u64 xmit_time);
 extern void tcp_rack_reo_timeout(struct sock *sk);
-extern void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs);
 
 /* tcp_plb.c */
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index f2fafba95705..d504a9a9b6ec 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4149,6 +4149,49 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered,
 	return delivered;
 }
 
+/* Updates the RACK's reo_wnd based on DSACK and no. of recoveries.
+ *
+ * If a DSACK is received that seems like it may have been due to reordering
+ * triggering fast recovery, increment reo_wnd by min_rtt/4 (upper bounded
+ * by srtt), since there is possibility that spurious retransmission was
+ * due to reordering delay longer than reo_wnd.
+ *
+ * Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16)
+ * no. of successful recoveries (accounts for full DSACK-based loss
+ * recovery undo). After that, reset it to default (min_rtt/4).
+ *
+ * At max, reo_wnd is incremented only once per rtt. So that the new
+ * DSACK on which we are reacting, is due to the spurious retx (approx)
+ * after the reo_wnd has been updated last time.
+ *
+ * reo_wnd is tracked in terms of steps (of min_rtt/4), rather than
+ * absolute value to account for change in rtt.
+ */
+static void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if ((READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) &
+	     TCP_RACK_STATIC_REO_WND) ||
+	    !rs->prior_delivered)
+		return;
+
+	/* Disregard DSACK if a rtt has not passed since we adjusted reo_wnd */
+	if (before(rs->prior_delivered, tp->rack.last_delivered))
+		tp->rack.dsack_seen = 0;
+
+	/* Adjust the reo_wnd if update is pending */
+	if (tp->rack.dsack_seen) {
+		tp->rack.reo_wnd_steps = min_t(u32, 0xFF,
+					       tp->rack.reo_wnd_steps + 1);
+		tp->rack.dsack_seen = 0;
+		tp->rack.last_delivered = tp->delivered;
+		tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_THRESH;
+	} else if (!tp->rack.reo_wnd_persist) {
+		tp->rack.reo_wnd_steps = 1;
+	}
+}
+
 /* This routine deals with incoming acks, but not outgoing ones. */
 static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 {
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index c52fd3254b6e..40732b84771e 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -166,49 +166,6 @@ void tcp_rack_reo_timeout(struct sock *sk)
 		tcp_rearm_rto(sk);
 }
 
-/* Updates the RACK's reo_wnd based on DSACK and no. of recoveries.
- *
- * If a DSACK is received that seems like it may have been due to reordering
- * triggering fast recovery, increment reo_wnd by min_rtt/4 (upper bounded
- * by srtt), since there is possibility that spurious retransmission was
- * due to reordering delay longer than reo_wnd.
- *
- * Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16)
- * no. of successful recoveries (accounts for full DSACK-based loss
- * recovery undo). After that, reset it to default (min_rtt/4).
- *
- * At max, reo_wnd is incremented only once per rtt. So that the new
- * DSACK on which we are reacting, is due to the spurious retx (approx)
- * after the reo_wnd has been updated last time.
- *
- * reo_wnd is tracked in terms of steps (of min_rtt/4), rather than
- * absolute value to account for change in rtt.
- */
-void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	if ((READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) &
-	     TCP_RACK_STATIC_REO_WND) ||
-	    !rs->prior_delivered)
-		return;
-
-	/* Disregard DSACK if a rtt has not passed since we adjusted reo_wnd */
-	if (before(rs->prior_delivered, tp->rack.last_delivered))
-		tp->rack.dsack_seen = 0;
-
-	/* Adjust the reo_wnd if update is pending */
-	if (tp->rack.dsack_seen) {
-		tp->rack.reo_wnd_steps = min_t(u32, 0xFF,
-					       tp->rack.reo_wnd_steps + 1);
-		tp->rack.dsack_seen = 0;
-		tp->rack.last_delivered = tp->delivered;
-		tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_THRESH;
-	} else if (!tp->rack.reo_wnd_persist) {
-		tp->rack.reo_wnd_steps = 1;
-	}
-}
-
 /* RFC6582 NewReno recovery for non-SACK connection. It simply retransmits
  * the next unacked packet upon receiving
  * a) three or more DUPACKs to start the fast recovery

From d5fb143dbe8d3050c9abcd390d65928e2a3e646e Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 27 Jan 2026 03:21:47 +0000
Subject: [PATCH 3/3] tcp: move tcp_rack_advance() to tcp_input.c

tcp_rack_advance() is called from tcp_ack() and tcp_sacktag_one().

Moving it to tcp_input.c allows the compiler to inline it and save
both space and cpu cycles in the TCP fast path.
$ scripts/bloat-o-meter -t vmlinux.1 vmlinux.2
add/remove: 0/2 grow/shrink: 1/1 up/down: 98/-132 (-34)
Function                                     old     new   delta
tcp_ack                                     5741    5839     +98
tcp_sacktag_one                              407     395     -12
__pfx_tcp_rack_advance                        16       -     -16
tcp_rack_advance                             104       -    -104
Total: Before=22572680, After=22572646, chg -0.00%

Signed-off-by: Eric Dumazet
Reviewed-by: Kuniyuki Iwashima
Link: https://patch.msgid.link/20260127032147.3498272-4-edumazet@google.com
Signed-off-by: Jakub Kicinski
---
 include/net/tcp.h       |  2 --
 net/ipv4/tcp_input.c    | 32 ++++++++++++++++++++++++++++++++
 net/ipv4/tcp_recovery.c | 32 --------------------------------
 3 files changed, 32 insertions(+), 34 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index a4d9f263ea68..f1cf9e6730c8 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2514,8 +2514,6 @@ void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced);
 extern s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb,
 				u32 reo_wnd);
 extern bool tcp_rack_mark_lost(struct sock *sk);
-extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
-			     u64 xmit_time);
 extern void tcp_rack_reo_timeout(struct sock *sk);
 
 /* tcp_plb.c */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d504a9a9b6ec..a2a872382fc0 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1558,6 +1558,38 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
 	return in_sack;
 }
 
+/* Record the most recently (re)sent time among the (s)acked packets
+ * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from
+ * draft-cheng-tcpm-rack-00.txt
+ */
+static void tcp_rack_advance(struct tcp_sock *tp, u8 sacked,
+			     u32 end_seq, u64 xmit_time)
+{
+	u32 rtt_us;
+
+	rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time);
+	if (rtt_us < tcp_min_rtt(tp) && (sacked & TCPCB_RETRANS)) {
+		/* If the sacked packet was retransmitted, it's ambiguous
+		 * whether the retransmission or the original (or the prior
+		 * retransmission) was sacked.
+		 *
+		 * If the original is lost, there is no ambiguity. Otherwise
+		 * we assume the original can be delayed up to aRTT + min_rtt.
+		 * the aRTT term is bounded by the fast recovery or timeout,
+		 * so it's at least one RTT (i.e., retransmission is at least
+		 * an RTT later).
+		 */
+		return;
+	}
+	tp->rack.advanced = 1;
+	tp->rack.rtt_us = rtt_us;
+	if (tcp_skb_sent_after(xmit_time, tp->rack.mstamp,
+			       end_seq, tp->rack.end_seq)) {
+		tp->rack.mstamp = xmit_time;
+		tp->rack.end_seq = end_seq;
+	}
+}
+
 /* Mark the given newly-SACKed range as such, adjusting counters and hints. */
 static u8 tcp_sacktag_one(struct sock *sk,
 			  struct tcp_sacktag_state *state, u8 sacked,
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 40732b84771e..139646751073 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -111,38 +111,6 @@ bool tcp_rack_mark_lost(struct sock *sk)
 	return !!timeout;
 }
 
-/* Record the most recently (re)sent time among the (s)acked packets
- * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from
- * draft-cheng-tcpm-rack-00.txt
- */
-void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
-		      u64 xmit_time)
-{
-	u32 rtt_us;
-
-	rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time);
-	if (rtt_us < tcp_min_rtt(tp) && (sacked & TCPCB_RETRANS)) {
-		/* If the sacked packet was retransmitted, it's ambiguous
-		 * whether the retransmission or the original (or the prior
-		 * retransmission) was sacked.
-		 *
-		 * If the original is lost, there is no ambiguity. Otherwise
-		 * we assume the original can be delayed up to aRTT + min_rtt.
-		 * the aRTT term is bounded by the fast recovery or timeout,
-		 * so it's at least one RTT (i.e., retransmission is at least
-		 * an RTT later).
-		 */
-		return;
-	}
-	tp->rack.advanced = 1;
-	tp->rack.rtt_us = rtt_us;
-	if (tcp_skb_sent_after(xmit_time, tp->rack.mstamp,
-			       end_seq, tp->rack.end_seq)) {
-		tp->rack.mstamp = xmit_time;
-		tp->rack.end_seq = end_seq;
-	}
-}
-
 /* We have waited long enough to accommodate reordering. Mark the expired
  * packets lost and retransmit them.
  */
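
A brief illustration of the idea behind this series: patch 1 relies on the
kernel's unlikely() annotation, and patches 2 and 3 rely on the compiler
inlining a static function with few callers in the same translation unit
(note the __pfx_ prologue stubs disappearing in the bloat-o-meter output
above). Below is a minimal, standalone userspace sketch of the unlikely()
pattern. The macro definitions mirror include/linux/compiler.h, but
tlp_high_seq and process_tlp_ack() are hypothetical stand-ins for the
kernel's tp->tlp_high_seq and tcp_process_tlp_ack(), not the real
implementation.

/*
 * Standalone sketch of the branch-prediction hint used in patch 1.
 * __builtin_expect() tells gcc/clang which way a branch is expected
 * to go, so the cold path is laid out away from the straight-line
 * fast path. Build with: gcc -O2 -o tlp_hint tlp_hint.c
 */
#include <stdio.h>

#define likely(x)	__builtin_expect(!!(x), 1)
#define unlikely(x)	__builtin_expect(!!(x), 0)

/* Hypothetical stand-in for tp->tlp_high_seq: zero on most ACKs,
 * nonzero only while a tail-loss probe is outstanding.
 */
static unsigned int tlp_high_seq;

/* Hypothetical stand-in for tcp_process_tlp_ack(): the cold path. */
static void process_tlp_ack(void)
{
	puts("processing TLP ACK (cold path)");
}

int main(void)
{
	/* Mirrors the pattern from patch 1: the compiler emits the
	 * no-TLP case as fall-through code, since that is the common
	 * case for incoming ACKs.
	 */
	if (unlikely(tlp_high_seq))
		process_tlp_ack();

	puts("fast path continues");
	return 0;
}

Compiled with -O2, the annotated branch pushes the process_tlp_ack() call
out of the hot instruction stream, which is the layout effect patch 1 is
after.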