Sebastian Andrzej Siewior <bigeasy@xxxxxxxxxxxxx> writes: > The XDP redirect process is two-staged: > - bpf_prog_run_xdp() is invoked to run an eBPF program which inspects the > packet and makes decisions. While doing that, the per-CPU variable > bpf_redirect_info is used. > > - Afterwards xdp_do_redirect() is invoked and accesses bpf_redirect_info > and it may also access other per-CPU variables like xskmap_flush_list. > > At the very end of the NAPI callback, xdp_do_flush() is invoked which > does not access bpf_redirect_info but will touch the individual per-CPU > lists. > > The per-CPU variables are only used in the NAPI callback hence disabling > bottom halves is the only protection mechanism. Users from preemptible > context (like cpu_map_kthread_run()) explicitly disable bottom halves > for protection reasons. > Without locking in local_bh_disable() on PREEMPT_RT this data structure > requires explicit locking. > > PREEMPT_RT has forced-threaded interrupts enabled and every > NAPI-callback runs in a thread. If each thread has its own data > structure then locking can be avoided. > > Create a struct bpf_net_context which contains struct bpf_redirect_info. > Define the variable on stack, use bpf_net_ctx_set() to save a pointer to > it. Use the __free() annotation to automatically reset the pointer once > the function returns. > The bpf_net_ctx_set() may nest. For instance a function can be used from > within NET_RX_SOFTIRQ/ net_rx_action which uses bpf_net_ctx_set() and > NET_TX_SOFTIRQ which does not. Therefore only the first invocation > updates the pointer. > Use bpf_net_ctx_get_ri() as a wrapper to retrieve the current struct > bpf_redirect_info. > > On PREEMPT_RT the pointer to bpf_net_context is saved in the task's > task_struct. On non-PREEMPT_RT builds the pointer is saved in a per-CPU > variable (which is always NODE-local memory). 
Using always the > bpf_net_context approach has the advantage that there is almost zero > differences between PREEMPT_RT and non-PREEMPT_RT builds. > > Cc: Alexei Starovoitov <ast@xxxxxxxxxx> > Cc: Andrii Nakryiko <andrii@xxxxxxxxxx> > Cc: Eduard Zingerman <eddyz87@xxxxxxxxx> > Cc: Hao Luo <haoluo@xxxxxxxxxx> > Cc: Jesper Dangaard Brouer <hawk@xxxxxxxxxx> > Cc: Jiri Olsa <jolsa@xxxxxxxxxx> > Cc: John Fastabend <john.fastabend@xxxxxxxxx> > Cc: KP Singh <kpsingh@xxxxxxxxxx> > Cc: Martin KaFai Lau <martin.lau@xxxxxxxxx> > Cc: Song Liu <song@xxxxxxxxxx> > Cc: Stanislav Fomichev <sdf@xxxxxxxxxx> > Cc: Toke Høiland-Jørgensen <toke@xxxxxxxxxx> > Cc: Yonghong Song <yonghong.song@xxxxxxxxx> > Cc: bpf@xxxxxxxxxxxxxxx > Signed-off-by: Sebastian Andrzej Siewior <bigeasy@xxxxxxxxxxxxx> > --- > include/linux/filter.h | 42 ++++++++++++++++++++++++++++++++----- > include/linux/sched.h | 3 +++ > kernel/bpf/cpumap.c | 3 +++ > kernel/fork.c | 1 + > net/bpf/test_run.c | 11 +++++++++- > net/core/dev.c | 19 ++++++++++++++++- > net/core/filter.c | 47 +++++++++++++++++++----------------------- > net/core/lwt_bpf.c | 3 +++ > 8 files changed, 96 insertions(+), 33 deletions(-) > > diff --git a/include/linux/filter.h b/include/linux/filter.h > index d5fea03cb6e61..6db5a68db6ee1 100644 > --- a/include/linux/filter.h > +++ b/include/linux/filter.h > @@ -744,7 +744,39 @@ struct bpf_redirect_info { > struct bpf_nh_params nh; > }; > > -DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); > +struct bpf_net_context { > + struct bpf_redirect_info ri; > +}; > + > +static inline struct bpf_net_context *bpf_net_ctx_set(struct bpf_net_context *bpf_net_ctx) > +{ > + struct task_struct *tsk = current; > + > + if (tsk->bpf_net_context != NULL) > + return NULL; > + tsk->bpf_net_context = bpf_net_ctx; > + return bpf_net_ctx; > +} > + > +static inline void bpf_net_ctx_clear(struct bpf_net_context *bpf_net_ctx) > +{ > + if (bpf_net_ctx) > + current->bpf_net_context = NULL; > +} > + > +static inline 
struct bpf_net_context *bpf_net_ctx_get(void) > +{ > + return current->bpf_net_context; > +} > + > +static inline struct bpf_redirect_info *bpf_net_ctx_get_ri(void) > +{ > + struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get(); > + > + return &bpf_net_ctx->ri; > +} > + > +DEFINE_FREE(bpf_net_ctx_clear, struct bpf_net_context *, bpf_net_ctx_clear(_T)); > > /* flags for bpf_redirect_info kern_flags */ > #define BPF_RI_F_RF_NO_DIRECT BIT(0) /* no napi_direct on return_frame */ > @@ -1021,21 +1053,21 @@ void bpf_clear_redirect_map(struct bpf_map *map); > > static inline bool xdp_return_frame_no_direct(void) > { > - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); > + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); > > return ri->kern_flags & BPF_RI_F_RF_NO_DIRECT; > } > > static inline void xdp_set_return_frame_no_direct(void) > { > - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); > + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); > > ri->kern_flags |= BPF_RI_F_RF_NO_DIRECT; > } > > static inline void xdp_clear_return_frame_no_direct(void) > { > - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); > + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); > > ri->kern_flags &= ~BPF_RI_F_RF_NO_DIRECT; > } > @@ -1591,7 +1623,7 @@ static __always_inline long __bpf_xdp_redirect_map(struct bpf_map *map, u64 inde > u64 flags, const u64 flag_mask, > void *lookup_elem(struct bpf_map *map, u32 key)) > { > - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); > + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); > const u64 action_mask = XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX; > > /* Lower bits of the flags are used as return code on lookup failure */ > diff --git a/include/linux/sched.h b/include/linux/sched.h > index 6779d3b8f2578..cc9be45de6606 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -53,6 +53,7 @@ struct bio_list; > struct blk_plug; > struct bpf_local_storage; > 
struct bpf_run_ctx; > +struct bpf_net_context; > struct capture_control; > struct cfs_rq; > struct fs_struct; > @@ -1504,6 +1505,8 @@ struct task_struct { > /* Used for BPF run context */ > struct bpf_run_ctx *bpf_ctx; > #endif > + /* Used by BPF for per-TASK xdp storage */ > + struct bpf_net_context *bpf_net_context; Okay, so if we are going the route of always putting this in 'current', why not just embed the whole struct bpf_net_context inside task_struct, instead of mucking about with the stack-allocated structures and setting/clearing of pointers? -Toke