TCP pingpong threshold is 1 by default. But some applications, like SQL DB, may prefer a higher pingpong threshold to activate delayed acks in quick ack mode for better performance. The pingpong threshold and related code were changed to 3 in the year 2019, and reverted to 1 in the year 2022. There is no single value that fits all applications. Add a net.core.tcp_pingpong_thresh sysctl tunable, so it can be tuned for optimal performance based on the application's needs. Signed-off-by: Haiyang Zhang <haiyangz@xxxxxxxxxxxxx> --- Documentation/admin-guide/sysctl/net.rst | 8 ++++++++ include/net/inet_connection_sock.h | 14 +++++++++++--- net/core/sysctl_net_core.c | 9 +++++++++ net/ipv4/tcp.c | 2 ++ net/ipv4/tcp_output.c | 17 +++++++++++++++-- 5 files changed, 45 insertions(+), 5 deletions(-) diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst index 4877563241f3..16f54be9461f 100644 --- a/Documentation/admin-guide/sysctl/net.rst +++ b/Documentation/admin-guide/sysctl/net.rst @@ -413,6 +413,14 @@ historical importance. Default: 0 +tcp_pingpong_thresh +------------------- + +TCP pingpong threshold is 1 by default, but some applications may need a higher +threshold for optimal performance. + +Default: 1, min: 1, max: 3 + 2. 
/proc/sys/net/unix - Parameters for Unix domain sockets ---------------------------------------------------------- diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index c2b15f7e5516..e84e33ddae49 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -324,11 +324,11 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu); -#define TCP_PINGPONG_THRESH 1 +extern int tcp_pingpong_thresh; static inline void inet_csk_enter_pingpong_mode(struct sock *sk) { - inet_csk(sk)->icsk_ack.pingpong = TCP_PINGPONG_THRESH; + inet_csk(sk)->icsk_ack.pingpong = tcp_pingpong_thresh; } static inline void inet_csk_exit_pingpong_mode(struct sock *sk) @@ -338,7 +338,15 @@ static inline void inet_csk_exit_pingpong_mode(struct sock *sk) static inline bool inet_csk_in_pingpong_mode(struct sock *sk) { - return inet_csk(sk)->icsk_ack.pingpong >= TCP_PINGPONG_THRESH; + return inet_csk(sk)->icsk_ack.pingpong >= tcp_pingpong_thresh; +} + +static inline void inet_csk_inc_pingpong_cnt(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + if (icsk->icsk_ack.pingpong < U8_MAX) + icsk->icsk_ack.pingpong++; } static inline bool inet_csk_has_ulp(struct sock *sk) diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 782273bb93c2..b5253567f2bd 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -653,6 +653,15 @@ static struct ctl_table net_core_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, + { + .procname = "tcp_pingpong_thresh", + .data = &tcp_pingpong_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_THREE, + }, { } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 53b7751b68e1..dcd143193d41 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -308,6 +308,8 @@ 
EXPORT_SYMBOL(tcp_have_smc); struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp; EXPORT_SYMBOL(tcp_sockets_allocated); +int tcp_pingpong_thresh __read_mostly = 1; + /* * TCP splice context */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index cfe128b81a01..576d21621778 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -167,12 +167,25 @@ static void tcp_event_data_sent(struct tcp_sock *tp, if (tcp_packets_in_flight(tp) == 0) tcp_ca_event(sk, CA_EVENT_TX_START); + /* If tcp_pingpong_thresh > 1, and + * this is the first data packet sent in response to the + * previous received data, + * and it is a reply for ato after last received packet, + * increase pingpong count. + */ + if (tcp_pingpong_thresh > 1 && + before(tp->lsndtime, icsk->icsk_ack.lrcvtime) && + (u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) + inet_csk_inc_pingpong_cnt(sk); + tp->lsndtime = now; - /* If it is a reply for ato after last received + /* If tcp_pingpong_thresh == 1, and + * it is a reply for ato after last received * packet, enter pingpong mode. */ - if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) + if (tcp_pingpong_thresh == 1 && + (u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) inet_csk_enter_pingpong_mode(sk); } -- 2.25.1