From: Björn Töpel <bjorn.topel@xxxxxxxxx>

This change adds a new NAPI mode, called biased busy-polling, which is
an extension of the existing busy-polling mode. The new mode is enabled
at the socket layer: a socket that sets this option "promises" to
busy-poll the NAPI context via a system call. While the mode is
enabled, the NAPI context operates with interrupts disabled. An
internal watchdog monitors that the busy-polling promise is kept; if
the socket fails to busy-poll, or stops doing so, the mode is disabled.
The watchdog timeout is currently 200 ms.

Biased busy-polling follows the same mechanism as the existing
busy-poll: the napi_id is reported to the socket via the skbuff. Later
commits will extend napi_id reporting to XDP, so that this works
correctly with XDP sockets.

Let us walk through a flow of execution:

1. A socket sets the new SO_BIAS_BUSY_POLL socket option to true,
   showing an intent to busy-poll. No data has been received on the
   socket yet, so the napi_id of the socket is still 0 (non-valid). As
   usual for busy-polling, the SO_BUSY_POLL option also has to be
   non-zero for biased busy-polling.

2. Data is received on the socket, changing the napi_id to non-zero.

3. The socket does a system call that has the busy-polling logic wired
   up, e.g. recvfrom() for UDP sockets. The NAPI context is now marked
   as biased busy-poll and the kernel watchdog is armed. If the NAPI
   context is already running, it will try to finish as soon as
   possible and move to busy-polling. If the NAPI context is not
   running, it will execute the NAPI poll function for the
   corresponding napi_id.

4. Go to 3, or wait until the watchdog times out.

Given the nature of busy-polling, this mode only makes sense for
non-blocking sockets.

When a NAPI context is in biased busy-polling mode, it will not allow a
NAPI to be scheduled via the napi_schedule_prep()/napi_scheduleXXX()
combo.
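For illustration only (not part of the patch), a minimal userspace
sketch of the intended usage: a non-blocking UDP socket opts in with
SO_BIAS_BUSY_POLL and keeps the busy-polling promise by calling
recvfrom() in a loop. Port 7777 and the 50 usec SO_BUSY_POLL budget are
arbitrary example values, the fallback #defines assume the asm-generic
values from this patch, and CAP_NET_ADMIN is required as enforced in
sock_setsockopt() below.

  /* Hedged usage sketch, not part of this patch: a non-blocking UDP
   * receiver that opts in to biased busy-polling.
   */
  #include <arpa/inet.h>
  #include <errno.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <sys/socket.h>
  #include <unistd.h>

  #ifndef SO_BUSY_POLL
  #define SO_BUSY_POLL 46        /* existing asm-generic uapi value */
  #endif
  #ifndef SO_BIAS_BUSY_POLL
  #define SO_BIAS_BUSY_POLL 69   /* asm-generic value added by this patch */
  #endif

  int main(void)
  {
          struct sockaddr_in addr = {
                  .sin_family = AF_INET,
                  .sin_addr.s_addr = htonl(INADDR_ANY),
                  .sin_port = htons(7777),        /* example port */
          };
          int busy_poll_usec = 50;        /* SO_BUSY_POLL must be non-zero */
          int one = 1;
          char buf[2048];
          int fd;

          fd = socket(AF_INET, SOCK_DGRAM | SOCK_NONBLOCK, 0);
          if (fd < 0) {
                  perror("socket");
                  return EXIT_FAILURE;
          }

          /* Enable plain busy-polling, then promise to busy-poll.
           * Setting SO_BIAS_BUSY_POLL requires CAP_NET_ADMIN.
           */
          if (setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL, &busy_poll_usec,
                         sizeof(busy_poll_usec)) ||
              setsockopt(fd, SOL_SOCKET, SO_BIAS_BUSY_POLL, &one,
                         sizeof(one))) {
                  perror("setsockopt");
                  return EXIT_FAILURE;
          }

          if (bind(fd, (struct sockaddr *)&addr, sizeof(addr))) {
                  perror("bind");
                  return EXIT_FAILURE;
          }

          /* Keep the promise: busy-poll via recvfrom() in a tight loop.
           * If the loop stops, the per-NAPI watchdog (200 ms) re-enables
           * interrupt-driven processing.
           */
          for (;;) {
                  ssize_t n = recvfrom(fd, buf, sizeof(buf), MSG_DONTWAIT,
                                       NULL, NULL);
                  if (n < 0 && errno != EAGAIN && errno != EWOULDBLOCK) {
                          perror("recvfrom");
                          break;
                  }
                  /* consume n bytes of payload in buf when n >= 0 */
          }

          close(fd);
          return EXIT_SUCCESS;
  }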
Signed-off-by: Björn Töpel <bjorn.topel@xxxxxxxxx>
---
 arch/alpha/include/uapi/asm/socket.h  |  2 +
 arch/mips/include/uapi/asm/socket.h   |  2 +
 arch/parisc/include/uapi/asm/socket.h |  2 +
 arch/sparc/include/uapi/asm/socket.h  |  2 +
 include/linux/netdevice.h             | 33 +++++-----
 include/net/busy_poll.h               | 17 ++++-
 include/net/sock.h                    |  3 +
 include/uapi/asm-generic/socket.h     |  2 +
 net/core/dev.c                        | 89 +++++++++++++++++++++++++--
 net/core/sock.c                       |  9 +++
 10 files changed, 140 insertions(+), 21 deletions(-)

diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index de6c4df61082..0f776668fb09 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -124,6 +124,8 @@
 
 #define SO_DETACH_REUSEPORT_BPF 68
 
+#define SO_BIAS_BUSY_POLL       69
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index d0a9ed2ca2d6..d23984731504 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -135,6 +135,8 @@
 
 #define SO_DETACH_REUSEPORT_BPF 68
 
+#define SO_BIAS_BUSY_POLL       69
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index 10173c32195e..49469713ed2a 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -116,6 +116,8 @@
 
 #define SO_DETACH_REUSEPORT_BPF 0x4042
 
+#define SO_BIAS_BUSY_POLL       0x4043
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index 8029b681fc7c..009aba6f7a54 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -117,6 +117,8 @@
 
 #define SO_DETACH_REUSEPORT_BPF 0x0047
 
+#define SO_BIAS_BUSY_POLL       0x0048
+
 
 #if !defined(__KERNEL__)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 964b494b0e8d..9bdc84d3d6b8 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -344,29 +344,32 @@ struct napi_struct {
        struct list_head        rx_list; /* Pending GRO_NORMAL skbs */
        int                     rx_count; /* length of rx_list */
        struct hrtimer          timer;
+       struct hrtimer          bp_watchdog;
        struct list_head        dev_list;
        struct hlist_node       napi_hash_node;
        unsigned int            napi_id;
 };
 
 enum {
-       NAPI_STATE_SCHED,       /* Poll is scheduled */
-       NAPI_STATE_MISSED,      /* reschedule a napi */
-       NAPI_STATE_DISABLE,     /* Disable pending */
-       NAPI_STATE_NPSVC,       /* Netpoll - don't dequeue from poll_list */
-       NAPI_STATE_LISTED,      /* NAPI added to system lists */
-       NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
-       NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
+       NAPI_STATE_SCHED,               /* Poll is scheduled */
+       NAPI_STATE_MISSED,              /* reschedule a napi */
+       NAPI_STATE_DISABLE,             /* Disable pending */
+       NAPI_STATE_NPSVC,               /* Netpoll - don't dequeue from poll_list */
+       NAPI_STATE_LISTED,              /* NAPI added to system lists */
+       NAPI_STATE_NO_BUSY_POLL,        /* Do not add in napi_hash, no busy polling */
+       NAPI_STATE_IN_BUSY_POLL,        /* sk_busy_loop() owns this NAPI */
+       NAPI_STATE_BIAS_BUSY_POLL,      /* biased busy-polling */
 };
 
 enum {
-       NAPIF_STATE_SCHED        = BIT(NAPI_STATE_SCHED),
-       NAPIF_STATE_MISSED       = BIT(NAPI_STATE_MISSED),
-       NAPIF_STATE_DISABLE      = BIT(NAPI_STATE_DISABLE),
-       NAPIF_STATE_NPSVC        = BIT(NAPI_STATE_NPSVC),
-       NAPIF_STATE_LISTED       = BIT(NAPI_STATE_LISTED),
-       NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
-       NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
+       NAPIF_STATE_SCHED          = BIT(NAPI_STATE_SCHED),
+       NAPIF_STATE_MISSED         = BIT(NAPI_STATE_MISSED),
+       NAPIF_STATE_DISABLE        = BIT(NAPI_STATE_DISABLE),
+       NAPIF_STATE_NPSVC          = BIT(NAPI_STATE_NPSVC),
+       NAPIF_STATE_LISTED         = BIT(NAPI_STATE_LISTED),
+       NAPIF_STATE_NO_BUSY_POLL   = BIT(NAPI_STATE_NO_BUSY_POLL),
+       NAPIF_STATE_IN_BUSY_POLL   = BIT(NAPI_STATE_IN_BUSY_POLL),
+       NAPIF_STATE_BIAS_BUSY_POLL = BIT(NAPI_STATE_BIAS_BUSY_POLL),
 };
 
 enum gro_result {
@@ -555,6 +558,8 @@ static inline bool napi_if_scheduled_mark_missed(struct napi_struct *n)
        return true;
 }
 
+void napi_bias_busy_poll(unsigned int napi_id);
+
 enum netdev_queue_state_t {
        __QUEUE_STATE_DRV_XOFF,
        __QUEUE_STATE_STACK_XOFF,
diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index b001fa91c14e..9738923ed17b 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -23,6 +23,9 @@
  */
 #define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1))
 
+/* Biased busy-poll watchdog timeout in ms */
+#define BIASED_BUSY_POLL_TIMEOUT 200
+
 #ifdef CONFIG_NET_RX_BUSY_POLL
 
 struct napi_struct;
@@ -99,13 +102,25 @@ static inline bool sk_busy_loop_timeout(struct sock *sk,
        return true;
 }
 
+#ifdef CONFIG_NET_RX_BUSY_POLL
+static inline void __sk_bias_busy_poll(struct sock *sk, unsigned int napi_id)
+{
+       if (likely(!READ_ONCE(sk->sk_bias_busy_poll)))
+               return;
+
+       napi_bias_busy_poll(napi_id);
+}
+#endif
+
 static inline void sk_busy_loop(struct sock *sk, int nonblock)
 {
 #ifdef CONFIG_NET_RX_BUSY_POLL
        unsigned int napi_id = READ_ONCE(sk->sk_napi_id);
 
-       if (napi_id >= MIN_NAPI_ID)
+       if (napi_id >= MIN_NAPI_ID) {
+               __sk_bias_busy_poll(sk, napi_id);
                napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk);
+       }
 #endif
 }
 
diff --git a/include/net/sock.h b/include/net/sock.h
index a5c6ae78df77..cf71834fb601 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -479,6 +479,9 @@ struct sock {
        u32                     sk_ack_backlog;
        u32                     sk_max_ack_backlog;
        kuid_t                  sk_uid;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+       u8                      sk_bias_busy_poll;
+#endif
        struct pid              *sk_peer_pid;
        const struct cred       *sk_peer_cred;
        long                    sk_rcvtimeo;
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 77f7c1638eb1..8a2b37ccd9d5 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -119,6 +119,8 @@
 
 #define SO_DETACH_REUSEPORT_BPF 68
 
+#define SO_BIAS_BUSY_POLL       69
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
diff --git a/net/core/dev.c b/net/core/dev.c
index 9499a414d67e..a29e4c4a35f6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6378,6 +6378,9 @@ bool napi_schedule_prep(struct napi_struct *n)
                val = READ_ONCE(n->state);
                if (unlikely(val & NAPIF_STATE_DISABLE))
                        return false;
+               if (unlikely(val & NAPIF_STATE_BIAS_BUSY_POLL))
+                       return false;
+
                new = val | NAPIF_STATE_SCHED;
 
                /* Sets STATE_MISSED bit if STATE_SCHED was already set
@@ -6458,12 +6461,14 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
 
                /* If STATE_MISSED was set, leave STATE_SCHED set,
                 * because we will call napi->poll() one more time.
-                * This C code was suggested by Alexander Duyck to help gcc.
                 */
-               new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
-                                                   NAPIF_STATE_SCHED;
+               if (val & NAPIF_STATE_MISSED && !(val & NAPIF_STATE_BIAS_BUSY_POLL))
+                       new |= NAPIF_STATE_SCHED;
        } while (cmpxchg(&n->state, val, new) != val);
 
+       if (unlikely(val & NAPIF_STATE_BIAS_BUSY_POLL))
+               return false;
+
        if (unlikely(val & NAPIF_STATE_MISSED)) {
                __napi_schedule(n);
                return false;
@@ -6497,6 +6502,20 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
 {
        int rc;
 
+       clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
+
+       local_bh_disable();
+       /* If we're biased towards busy poll, clear the sched flags,
+        * so that we can enter again.
+        */
+       if (READ_ONCE(napi->state) & NAPIF_STATE_BIAS_BUSY_POLL) {
+               netpoll_poll_unlock(have_poll_lock);
+               napi_complete(napi);
+               __kfree_skb_flush();
+               local_bh_enable();
+               return;
+       }
+
        /* Busy polling means there is a high chance device driver hard irq
         * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
         * set in napi_schedule_prep().
@@ -6507,9 +6526,6 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
         * to perform these two clear_bit()
         */
        clear_bit(NAPI_STATE_MISSED, &napi->state);
-       clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
-
-       local_bh_disable();
 
        /* All we really want here is to re-enable device interrupts.
         * Ideally, a new ndo_busy_poll_stop() could avoid another round.
@@ -6569,6 +6585,11 @@ void napi_busy_loop(unsigned int napi_id,
                                goto count;
                        have_poll_lock = netpoll_poll_lock(napi);
                        napi_poll = napi->poll;
+                       if (val & NAPIF_STATE_BIAS_BUSY_POLL) {
+                               hrtimer_start(&napi->bp_watchdog,
+                                             ms_to_ktime(BIASED_BUSY_POLL_TIMEOUT),
+                                             HRTIMER_MODE_REL_PINNED);
+                       }
                }
                work = napi_poll(napi, BUSY_POLL_BUDGET);
                trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
@@ -6652,6 +6673,53 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
        return HRTIMER_NORESTART;
 }
 
+static enum hrtimer_restart napi_biased_busy_poll_watchdog(struct hrtimer *timer)
+{
+       struct napi_struct *napi;
+       unsigned long val, new;
+
+       napi = container_of(timer, struct napi_struct, bp_watchdog);
+
+       do {
+               val = READ_ONCE(napi->state);
+               if (WARN_ON_ONCE(!(val & NAPIF_STATE_BIAS_BUSY_POLL)))
+                       return HRTIMER_NORESTART;
+
+               new = val & ~NAPIF_STATE_BIAS_BUSY_POLL;
+       } while (cmpxchg(&napi->state, val, new) != val);
+
+       if (!napi_disable_pending(napi) &&
+           !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
+               __napi_schedule_irqoff(napi);
+
+       return HRTIMER_NORESTART;
+}
+
+void napi_bias_busy_poll(unsigned int napi_id)
+{
+#ifdef CONFIG_NET_RX_BUSY_POLL
+       struct napi_struct *napi;
+       unsigned long val, new;
+
+       napi = napi_by_id(napi_id);
+       if (!napi)
+               return;
+
+       do {
+               val = READ_ONCE(napi->state);
+               if (val & NAPIF_STATE_BIAS_BUSY_POLL)
+                       return;
+
+               new = val | NAPIF_STATE_BIAS_BUSY_POLL;
+       } while (cmpxchg(&napi->state, val, new) != val);
+
+       hrtimer_start(&napi->bp_watchdog, ms_to_ktime(BIASED_BUSY_POLL_TIMEOUT),
+                     HRTIMER_MODE_REL_PINNED);
+#endif
+}
+EXPORT_SYMBOL(napi_bias_busy_poll);
+
+
 static void init_gro_hash(struct napi_struct *napi)
 {
        int i;
@@ -6673,6 +6741,8 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
        INIT_HLIST_NODE(&napi->napi_hash_node);
        hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
        napi->timer.function = napi_watchdog;
+       hrtimer_init(&napi->bp_watchdog, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+       napi->bp_watchdog.function = napi_biased_busy_poll_watchdog;
        init_gro_hash(napi);
        napi->skb = NULL;
        INIT_LIST_HEAD(&napi->rx_list);
@@ -6704,7 +6774,9 @@ void napi_disable(struct napi_struct *n)
                msleep(1);
 
        hrtimer_cancel(&n->timer);
+       hrtimer_cancel(&n->bp_watchdog);
 
+       clear_bit(NAPI_STATE_BIAS_BUSY_POLL, &n->state);
        clear_bit(NAPI_STATE_DISABLE, &n->state);
 }
 EXPORT_SYMBOL(napi_disable);
@@ -6767,6 +6839,11 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
        if (likely(work < weight))
                goto out_unlock;
 
+       if (unlikely(n->state & NAPIF_STATE_BIAS_BUSY_POLL)) {
+               napi_complete(n);
+               goto out_unlock;
+       }
+
        /* Drivers must not modify the NAPI state if they
         * consume the entire weight. In such cases this code
         * still "owns" the NAPI instance and therefore can
diff --git a/net/core/sock.c b/net/core/sock.c
index 727ea1cc633c..686eb5549b79 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1159,6 +1159,12 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
                                sk->sk_ll_usec = val;
                }
                break;
+       case SO_BIAS_BUSY_POLL:
+               if (valbool && !capable(CAP_NET_ADMIN))
+                       ret = -EPERM;
+               else
+                       sk->sk_bias_busy_poll = valbool;
+               break;
 #endif
 
        case SO_MAX_PACING_RATE:
@@ -1523,6 +1529,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
        case SO_BUSY_POLL:
                v.val = sk->sk_ll_usec;
                break;
+       case SO_BIAS_BUSY_POLL:
+               v.val = sk->sk_bias_busy_poll;
+               break;
 #endif
 
        case SO_MAX_PACING_RATE:
-- 
2.27.0