There are three places where we initialize sockets: - tcp_output:tcp_connect_init - tcp_minisocks:tcp_openreq_init_rwin - syncookies In the first two we already have a call to `tcp_rwnd_init_bpf` and `dst_metric(RTAX_INITRWND)` which retrieve the bpf/path initrwnd attribute. We use this value to bring `rcv_ssthresh` up, potentially above the traditional 64KiB. With a higher initial `rcv_ssthresh` the receiver will open the receive window more aggressively, which can improve both the throughput and the latency of large-BDP flows. This patch does not cover the syncookies case. Signed-off-by: Marek Majkowski <marek@xxxxxxxxxxxxxx> --- include/linux/tcp.h | 1 + net/ipv4/tcp_minisocks.c | 9 +++++++-- net/ipv4/tcp_output.c | 7 +++++-- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index a9fbe22732c3..c7a8c71536f8 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -164,6 +164,7 @@ struct tcp_request_sock { * FastOpen it's the seq# * after data-in-SYN. */ + u32 rcv_ssthresh; u8 syn_tos; }; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index cb95d88497ae..8e5a3bd9a55b 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -355,11 +355,13 @@ void tcp_openreq_init_rwin(struct request_sock *req, const struct dst_entry *dst) { struct inet_request_sock *ireq = inet_rsk(req); + struct tcp_request_sock *treq = tcp_rsk(req); const struct tcp_sock *tp = tcp_sk(sk_listener); int full_space = tcp_full_space(sk_listener); u32 window_clamp; __u8 rcv_wscale; u32 rcv_wnd; + int adj_mss; int mss; mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); @@ -377,16 +379,19 @@ void tcp_openreq_init_rwin(struct request_sock *req, rcv_wnd = dst_metric(dst, RTAX_INITRWND); else if (full_space < rcv_wnd * mss) full_space = rcv_wnd * mss; + adj_mss = mss - (ireq->tstamp_ok ? 
TCPOLEN_TSTAMP_ALIGNED : 0); + /* tcp_full_space because it is guaranteed to be the first packet */ tcp_select_initial_window(sk_listener, full_space, - mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), + adj_mss, &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, rcv_wnd); ireq->rcv_wscale = rcv_wscale; + treq->rcv_ssthresh = max(tp->rcv_wnd, rcv_wnd * adj_mss); } EXPORT_SYMBOL(tcp_openreq_init_rwin); @@ -502,7 +507,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; newtp->rx_opt.sack_ok = ireq->sack_ok; newtp->window_clamp = req->rsk_window_clamp; - newtp->rcv_ssthresh = req->rsk_rcv_wnd; + newtp->rcv_ssthresh = treq->rcv_ssthresh; newtp->rcv_wnd = req->rsk_rcv_wnd; newtp->rx_opt.wscale_ok = ireq->wscale_ok; if (newtp->rx_opt.wscale_ok) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 78b654ff421b..56f22d5da3a7 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3649,6 +3649,7 @@ static void tcp_connect_init(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); __u8 rcv_wscale; u32 rcv_wnd; + u32 adj_mss; /* We'll fix this up when we get a response from the other end. * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. @@ -3686,8 +3687,10 @@ static void tcp_connect_init(struct sock *sk) if (rcv_wnd == 0) rcv_wnd = dst_metric(dst, RTAX_INITRWND); + adj_mss = tp->advmss - (tp->rx_opt.ts_recent_stamp ? + tp->tcp_header_len - sizeof(struct tcphdr) : 0); tcp_select_initial_window(sk, tcp_full_space(sk), - tp->advmss - (tp->rx_opt.ts_recent_stamp ? 
tp->tcp_header_len - sizeof(struct tcphdr) : 0), + adj_mss, &tp->rcv_wnd, &tp->window_clamp, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling), @@ -3695,7 +3698,7 @@ static void tcp_connect_init(struct sock *sk) rcv_wnd); tp->rx_opt.rcv_wscale = rcv_wscale; - tp->rcv_ssthresh = tp->rcv_wnd; + tp->rcv_ssthresh = max(tp->rcv_wnd, rcv_wnd * adj_mss); sk->sk_err = 0; sock_reset_flag(sk, SOCK_DONE); -- 2.25.1