On Mon, 22 Aug 2022 02:12:20 -0700 Peilin Ye <peilin.ye@xxxxxxxxxxxxx> wrote > > Support Qdisc backpressure for UDP (IPv4 and IPv6) sockets by > implementing the (*backpressure) callback: > > 1. When a shaper Qdisc drops a packet due to TC egress congestion, > halve the effective send buffer [1], then (re)scedule the > backpressure timer. > > [1] sndbuf - overlimits_new == 1/2 * (sndbuf - overlimits_old) > > 2. When the timer expires, double the effective send buffer [2]. If > the socket is still overlimit, reschedule the timer itself. > > [2] sndbuf - overlimits_new == 2 * (sndbuf - overlimits_old) > > In sock_wait_for_wmem() and sock_alloc_send_pskb(), check the size of > effective send buffer instead, so that overlimit sockets send slower. > See sk_sndbuf_avail(). Make sense to me. > > The timer interval is specified by a new per-net sysctl, > sysctl_udp_backpressure_interval. Default is 100 milliseconds, meaning > that an overlimit UDP socket will try to double its effective send > buffer every 100 milliseconds. Use 0 to disable Qdisc backpressure for > UDP sockets. > > Generally, longer interval means lower packet drop rate, but also makes > overlimit sockets slower to recover when TC egress becomes idle (or the > shaper Qdisc gets removed, etc.) > > Test results with TBF + SFQ Qdiscs, 500 Mbits/sec rate limit with 16 > iperf UDP '-b 1G' clients: > > Interval Throughput Drop Rate CPU Usage [3] > 0 (disabled) 480.0 Mb/s 96.50% 68.38% > 10 ms 486.4 Mb/s 9.28% 1.30% > 100 ms 486.4 Mb/s 1.10% 1.11% > 1000 ms 486.4 Mb/s 0.13% 0.81% > > [3] perf-top, __pv_queued_spin_lock_slowpath() > > Signed-off-by: Peilin Ye <peilin.ye@xxxxxxxxxxxxx> > --- > Documentation/networking/ip-sysctl.rst | 11 ++++ > include/linux/udp.h | 3 ++ > include/net/netns/ipv4.h | 1 + > include/net/udp.h | 1 + > net/core/sock.c | 4 +- > net/ipv4/sysctl_net_ipv4.c | 7 +++ > net/ipv4/udp.c | 69 +++++++++++++++++++++++++- > net/ipv6/udp.c | 2 +- > 8 files changed, 94 insertions(+), 4 deletions(-) > > diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst > index 56cd4ea059b2..a0d8e9518fda 100644 > --- a/Documentation/networking/ip-sysctl.rst > +++ b/Documentation/networking/ip-sysctl.rst > @@ -1070,6 +1070,17 @@ udp_rmem_min - INTEGER > udp_wmem_min - INTEGER > UDP does not have tx memory accounting and this tunable has no effect. > > +udp_backpressure_interval - INTEGER > + The time interval (in milliseconds) in which an overlimit UDP socket > + tries to increase its effective send buffer size, used by Qdisc > + backpressure. A longer interval typically results in a lower packet > + drop rate, but also makes it slower for overlimit UDP sockets to > + recover from backpressure when TC egress becomes idle. > + > + 0 to disable Qdisc backpressure for UDP sockets. > + > + Default: 100 > + > RAW variables > ============= > > diff --git a/include/linux/udp.h b/include/linux/udp.h > index 254a2654400f..dd017994738b 100644 > --- a/include/linux/udp.h > +++ b/include/linux/udp.h > @@ -86,6 +86,9 @@ struct udp_sock { > > /* This field is dirtied by udp_recvmsg() */ > int forward_deficit; > + > + /* Qdisc backpressure timer */ > + struct timer_list backpressure_timer; > }; My $0.02 is s/backpressure_timer/backpressure_ts/ > > #define UDP_MAX_SEGMENTS (1 << 6UL) > diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h > index c7320ef356d9..01f72ddf23e0 100644 > --- a/include/net/netns/ipv4.h > +++ b/include/net/netns/ipv4.h > @@ -182,6 +182,7 @@ struct netns_ipv4 { > > int sysctl_udp_wmem_min; > int sysctl_udp_rmem_min; > + int sysctl_udp_backpressure_interval; > > u8 sysctl_fib_notify_on_flag_change; > > diff --git a/include/net/udp.h b/include/net/udp.h > index 5ee88ddf79c3..82018e58659b 100644 > --- a/include/net/udp.h > +++ b/include/net/udp.h > @@ -279,6 +279,7 @@ int udp_init_sock(struct sock *sk); > int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); > int __udp_disconnect(struct sock *sk, int flags); > int udp_disconnect(struct sock *sk, int flags); > +void udp_backpressure(struct sock *sk); > __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait); > struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, > netdev_features_t features, > diff --git a/net/core/sock.c b/net/core/sock.c > index 167d471b176f..cb6ba66f80c8 100644 > --- a/net/core/sock.c > +++ b/net/core/sock.c > @@ -2614,7 +2614,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo) > break; > set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); > prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); > - if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) > + if (refcount_read(&sk->sk_wmem_alloc) < sk_sndbuf_avail(sk)) > break; > if (sk->sk_shutdown & SEND_SHUTDOWN) > break; > @@ -2649,7 +2649,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, > if (sk->sk_shutdown & SEND_SHUTDOWN) > goto failure; > > - if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf)) > + if (sk_wmem_alloc_get(sk) < sk_sndbuf_avail(sk)) > break; > > sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); > diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c > index 5490c285668b..1e509a417b92 100644 > --- a/net/ipv4/sysctl_net_ipv4.c > +++ b/net/ipv4/sysctl_net_ipv4.c > @@ -1337,6 +1337,13 @@ static struct ctl_table ipv4_net_table[] = { > .proc_handler = proc_dointvec_minmax, > .extra1 = SYSCTL_ONE > }, > + { > + .procname = "udp_backpressure_interval", > + .data = &init_net.ipv4.sysctl_udp_backpressure_interval, > + .maxlen = sizeof(int), > + .mode = 0644, > + .proc_handler = proc_dointvec_ms_jiffies, > + }, > { > .procname = "fib_notify_on_flag_change", > .data = &init_net.ipv4.sysctl_fib_notify_on_flag_change, > diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c > index 34eda973bbf1..ff58f638c834 100644 > --- a/net/ipv4/udp.c > +++ b/net/ipv4/udp.c > @@ -110,6 +110,7 @@ > #include <trace/events/skb.h> > #include <net/busy_poll.h> > #include "udp_impl.h" > +#include <net/sock.h> > #include <net/sock_reuseport.h> > #include <net/addrconf.h> > #include <net/udp_tunnel.h> > @@ -1614,10 +1615,73 @@ void udp_destruct_sock(struct sock *sk) > } > EXPORT_SYMBOL_GPL(udp_destruct_sock); > > +static inline int udp_backpressure_interval_get(struct sock *sk) > +{ > + return READ_ONCE(sock_net(sk)->ipv4.sysctl_udp_backpressure_interval); > +} > + > +static inline void udp_reset_backpressure_timer(struct sock *sk, > + unsigned long expires) > +{ > + sk_reset_timer(sk, &udp_sk(sk)->backpressure_timer, expires); > +} > + > +static void udp_backpressure_timer(struct timer_list *t) > +{ > + struct udp_sock *up = from_timer(up, t, backpressure_timer); > + int interval, sndbuf, overlimits; > + struct sock *sk = &up->inet.sk; > + > + interval = udp_backpressure_interval_get(sk); > + if (!interval) { > + /* Qdisc backpressure has been turned off */ > + WRITE_ONCE(sk->sk_overlimits, 0); > + goto out; > + } > + > + sndbuf = READ_ONCE(sk->sk_sndbuf); > + overlimits = READ_ONCE(sk->sk_overlimits); > + > + /* sndbuf - overlimits_new == 2 * (sndbuf - overlimits_old) */ > + overlimits = min_t(int, overlimits, sndbuf - SOCK_MIN_SNDBUF); > + overlimits = max_t(int, (2 * overlimits) - sndbuf, 0); > + WRITE_ONCE(sk->sk_overlimits, overlimits); > + > + if (overlimits > 0) > + udp_reset_backpressure_timer(sk, jiffies + interval); > + > +out: > + sock_put(sk); > +} > + > +void udp_backpressure(struct sock *sk) > +{ > + int interval, sndbuf, overlimits; > + > + interval = udp_backpressure_interval_get(sk); > + if (!interval) /* Qdisc backpressure is off */ > + return; > + > + sndbuf = READ_ONCE(sk->sk_sndbuf); > + overlimits = READ_ONCE(sk->sk_overlimits); > + > + /* sndbuf - overlimits_new == 1/2 * (sndbuf - overlimits_old) */ > + overlimits = min_t(int, overlimits, sndbuf - SOCK_MIN_SNDBUF); > + overlimits += (sndbuf - overlimits) >> 1; > + WRITE_ONCE(sk->sk_overlimits, overlimits); > + > + if (overlimits > 0) > + udp_reset_backpressure_timer(sk, jiffies + interval); > +} > +EXPORT_SYMBOL_GPL(udp_backpressure); > + 2, add sndbuf callback in addition to backpressure. static int udp_sndbuf(struct sock *sk) { int interval = udp_backpressure_interval_get(sk); unsigned long ts = udp_sk(sk)->backpressure_ts; int sndbuf = READ_ONCE(sk->sk_sndbuf); unsigned long now = jiffies; if (!interval || !time_before(now, interval + ts)) return sndbuf; /* sndbuf = (sndbuf * (now - ts)) / interval; */ if (now - ts > (interval / 2)) sndbuf /= 2; else sndbuf /= 4; return max_t(int, sndbuf, SOCK_MIN_SNDBUF); } 1, sk_sndbuf_avail() should be in sock.h as it is. static inline int sk_sndbuf_avail(struct sock *sk) { if (sk->sk_prot->backpressure) return sk->sk_prot->sndbuf(sk); else return READ_ONCE(sk->sk_sndbuf); } 0, add backpressure callback. static void udp_backpressure(struct sock *sk) { if (udp_backpressure_interval_get(sk)) udp_sk(sk)->backpressure_ts = jiffies; } Only for thoughts now. Hillf > int udp_init_sock(struct sock *sk) > { > - skb_queue_head_init(&udp_sk(sk)->reader_queue); > + struct udp_sock *up = udp_sk(sk); > + > + skb_queue_head_init(&up->reader_queue); > sk->sk_destruct = udp_destruct_sock; > + timer_setup(&up->backpressure_timer, udp_backpressure_timer, 0); > return 0; > } > EXPORT_SYMBOL_GPL(udp_init_sock); > @@ -2653,6 +2717,7 @@ void udp_destroy_sock(struct sock *sk) > /* protects from races with udp_abort() */ > sock_set_flag(sk, SOCK_DEAD); > udp_flush_pending_frames(sk); > + sk_stop_timer(sk, &up->backpressure_timer); > unlock_sock_fast(sk, slow); > if (static_branch_unlikely(&udp_encap_needed_key)) { > if (up->encap_type) { > @@ -2946,6 +3011,7 @@ struct proto udp_prot = { > #ifdef CONFIG_BPF_SYSCALL > .psock_update_sk_prot = udp_bpf_update_proto, > #endif > + .backpressure = udp_backpressure, > .memory_allocated = &udp_memory_allocated, > .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc, > > @@ -3268,6 +3334,7 @@ static int __net_init udp_sysctl_init(struct net *net) > { > net->ipv4.sysctl_udp_rmem_min = PAGE_SIZE; > net->ipv4.sysctl_udp_wmem_min = PAGE_SIZE; > + net->ipv4.sysctl_udp_backpressure_interval = msecs_to_jiffies(100); > > #ifdef CONFIG_NET_L3_MASTER_DEV > net->ipv4.sysctl_udp_l3mdev_accept = 0; > diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c > index 16c176e7c69a..106032af6756 100644 > --- a/net/ipv6/udp.c > +++ b/net/ipv6/udp.c > @@ -1735,7 +1735,7 @@ struct proto udpv6_prot = { > #ifdef CONFIG_BPF_SYSCALL > .psock_update_sk_prot = udp_bpf_update_proto, > #endif > - > + .backpressure = udp_backpressure, > .memory_allocated = &udp_memory_allocated, > .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc, > > -- > 2.20.1