On Mon, Jun 12, 2023 at 10:23:03AM -0700, Stanislav Fomichev wrote: > devtx is a lightweight set of hooks before and after packet transmission. > The hook is supposed to work for both skb and xdp paths by exposing > a light-weight packet wrapper via devtx_frame (header portion + frags). > > devtx is implemented as a tracing program which has access to the > XDP-metadata-like kfuncs. The initial set of kfuncs is implemented > in the next patch, but the idea is similar to XDP metadata: > the kfuncs have netdev-specific implementation, but common > interface. Upon loading, the kfuncs are resolved to direct > calls against per-netdev implementation. This can be achieved > by marking devtx-tracing programs as dev-bound (largely > reusing xdp-dev-bound program infrastructure). > > Attachment and detachment is implemented via syscall BPF program > by calling bpf_devtx_sb_attach (attach to tx-submission) > or bpf_devtx_cp_attach (attach to tx completion). Right now, > the attachment does not return a link and doesn't support > multiple programs. I plan to switch to Daniel's bpf_mprog infra > once it's available. > > Cc: netdev@xxxxxxxxxxxxxxx > Signed-off-by: Stanislav Fomichev <sdf@xxxxxxxxxx> ... > --- a/MAINTAINERS > +++ b/MAINTAINERS > @@ -22976,11 +22976,13 @@ L: bpf@xxxxxxxxxxxxxxx > S: Supported > F: drivers/net/ethernet/*/*/*/*/*xdp* > F: drivers/net/ethernet/*/*/*xdp* > +F: include/net/devtx.h > F: include/net/xdp.h > F: include/net/xdp_priv.h > F: include/trace/events/xdp.h > F: kernel/bpf/cpumap.c > F: kernel/bpf/devmap.c > +F: net/core/devtx.c > F: net/core/xdp.c > F: samples/bpf/xdp* > F: tools/testing/selftests/bpf/*/*xdp* Hi Stan, some feedback from my side. > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h > index 08fbd4622ccf..e08e3fd39dfc 100644 > --- a/include/linux/netdevice.h > +++ b/include/linux/netdevice.h > @@ -2238,6 +2238,8 @@ struct net_device { > unsigned int real_num_rx_queues; > > struct bpf_prog __rcu *xdp_prog; > + struct bpf_prog __rcu *devtx_sb; > + struct bpf_prog __rcu *devtx_cp; It would be good to add these new fields to the kernel doc for struct net_device. > unsigned long gro_flush_timeout; > int napi_defer_hard_irqs; > #define GRO_LEGACY_MAX_SIZE 65536u > diff --git a/include/net/devtx.h b/include/net/devtx.h > new file mode 100644 > index 000000000000..7eab66d0ce80 > --- /dev/null > +++ b/include/net/devtx.h > @@ -0,0 +1,76 @@ > +/* SPDX-License-Identifier: GPL-2.0-only */ > +#ifndef __LINUX_NET_DEVTX_H__ > +#define __LINUX_NET_DEVTX_H__ > + > +#include <linux/jump_label.h> > +#include <linux/skbuff.h> > +#include <linux/netdevice.h> > +#include <net/xdp.h> > + > +struct devtx_frame { > + void *data; > + u16 len; > + struct skb_shared_info *sinfo; /* for frags */ > +}; > + > +#ifdef CONFIG_NET > +void devtx_submit(struct net_device *netdev, struct devtx_frame *ctx); > +void devtx_complete(struct net_device *netdev, struct devtx_frame *ctx); > +bool is_devtx_kfunc(u32 kfunc_id); > +void devtx_shutdown(struct net_device *netdev); > + > +static inline void devtx_frame_from_skb(struct devtx_frame *ctx, struct sk_buff *skb) > +{ > + ctx->data = skb->data; > + ctx->len = skb_headlen(skb); > + ctx->sinfo = skb_shinfo(skb); > +} > + > +static inline void devtx_frame_from_xdp(struct devtx_frame *ctx, struct xdp_frame *xdpf) > +{ > + ctx->data = xdpf->data; > + ctx->len = xdpf->len; > + ctx->sinfo = xdp_frame_has_frags(xdpf) ? xdp_get_shared_info_from_frame(xdpf) : NULL; > +} > + > +DECLARE_STATIC_KEY_FALSE(devtx_enabled); > + > +static inline bool devtx_submit_enabled(struct net_device *netdev) > +{ > + return static_branch_unlikely(&devtx_enabled) && > + rcu_access_pointer(netdev->devtx_sb); > +} > + > +static inline bool devtx_complete_enabled(struct net_device *netdev) > +{ > + return static_branch_unlikely(&devtx_enabled) && > + rcu_access_pointer(netdev->devtx_cp); > +} > +#else > +static inline void devtx_submit(struct net_device *netdev, struct devtx_frame *ctx) > +{ > +} > + > +static inline void devtx_complete(struct net_device *netdev, struct devtx_frame *ctx) > +{ > +} > + > +static inline bool is_devtx_kfunc(u32 kfunc_id) > +{ > + return false; > +} > + > +static inline void devtx_shutdown(struct net_device *netdev) > +{ > +} > + > +static inline void devtx_frame_from_skb(struct devtx_frame *ctx, struct sk_buff *skb) > +{ > +} > + > +static inline void devtx_frame_from_xdp(struct devtx_frame *ctx, struct xdp_frame *xdpf) > +{ > +} > +#endif > + > +#endif /* __LINUX_NET_DEVTX_H__ */ > diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c > index 235d81f7e0ed..9cfe96422c80 100644 > --- a/kernel/bpf/offload.c > +++ b/kernel/bpf/offload.c > @@ -25,6 +25,7 @@ > #include <linux/rhashtable.h> > #include <linux/rtnetlink.h> > #include <linux/rwsem.h> > +#include <net/devtx.h> > > /* Protects offdevs, members of bpf_offload_netdev and offload members > * of all progs. > @@ -228,6 +229,7 @@ int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr) > int err; > > if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS && > + attr->prog_type != BPF_PROG_TYPE_TRACING && > attr->prog_type != BPF_PROG_TYPE_XDP) > return -EINVAL; > > @@ -238,6 +240,10 @@ int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr) > attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY) > return -EINVAL; > > + if (attr->prog_type == BPF_PROG_TYPE_TRACING && > + !is_devtx_kfunc(prog->aux->attach_btf_id)) > + return -EINVAL; > + > netdev = dev_get_by_index(current->nsproxy->net_ns, attr->prog_ifindex); > if (!netdev) > return -EINVAL; > diff --git a/net/core/Makefile b/net/core/Makefile > index 8f367813bc68..c1db05ccfac7 100644 > --- a/net/core/Makefile > +++ b/net/core/Makefile > @@ -39,4 +39,5 @@ obj-$(CONFIG_FAILOVER) += failover.o > obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o > obj-$(CONFIG_BPF_SYSCALL) += sock_map.o > obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o > +obj-$(CONFIG_BPF_SYSCALL) += devtx.o > obj-$(CONFIG_OF) += of_net.o > diff --git a/net/core/dev.c b/net/core/dev.c > index 3393c2f3dbe8..ef0e65e68024 100644 > --- a/net/core/dev.c > +++ b/net/core/dev.c > @@ -150,6 +150,7 @@ > #include <linux/pm_runtime.h> > #include <linux/prandom.h> > #include <linux/once_lite.h> > +#include <net/devtx.h> > > #include "dev.h" > #include "net-sysfs.h" > @@ -10875,6 +10876,7 @@ void unregister_netdevice_many_notify(struct list_head *head, > dev_shutdown(dev); > > dev_xdp_uninstall(dev); > + devtx_shutdown(dev); > bpf_dev_bound_netdev_unregister(dev); > > netdev_offload_xstats_disable_all(dev); > diff --git a/net/core/devtx.c b/net/core/devtx.c > new file mode 100644 > index 000000000000..b7cbc26d1c01 > --- /dev/null > +++ b/net/core/devtx.c > @@ -0,0 +1,208 @@ > +// SPDX-License-Identifier: GPL-2.0-only > + > +#include <net/devtx.h> > +#include <linux/filter.h> > + > +DEFINE_STATIC_KEY_FALSE(devtx_enabled); > +EXPORT_SYMBOL_GPL(devtx_enabled); > + > +static void devtx_run(struct net_device *netdev, struct devtx_frame *ctx, struct bpf_prog **pprog) Is an __rcu annotation appropriate for prog here? Also elsewhere in this patch. > +{ > + struct bpf_prog *prog; > + void *real_ctx[1] = {ctx}; > + > + prog = rcu_dereference(*pprog); > + if (likely(prog)) > + bpf_prog_run(prog, real_ctx); > +} > + > +void devtx_submit(struct net_device *netdev, struct devtx_frame *ctx) > +{ > + rcu_read_lock(); > + devtx_run(netdev, ctx, &netdev->devtx_sb); > + rcu_read_unlock(); > +} > +EXPORT_SYMBOL_GPL(devtx_submit); > + > +void devtx_complete(struct net_device *netdev, struct devtx_frame *ctx) > +{ > + rcu_read_lock(); > + devtx_run(netdev, ctx, &netdev->devtx_cp); > + rcu_read_unlock(); > +} > +EXPORT_SYMBOL_GPL(devtx_complete); > + > +/** > + * devtx_sb - Called for every egress netdev packet As this is a kernel doc, it would be good to document the ctx parameter here. > + * > + * Note: this function is never actually called by the kernel and declared > + * only to allow loading an attaching appropriate tracepoints. > + */ > +__weak noinline void devtx_sb(struct devtx_frame *ctx) I guess this is intentional. But gcc complains that this is neither static nor is a forward declaration provided. Likewise for devtx_cp() > +{ > +} > + > +/** > + * devtx_cp - Called upon egress netdev packet completion Likewise, here too. > + * > + * Note: this function is never actually called by the kernel and declared > + * only to allow loading an attaching appropriate tracepoints. > + */ > +__weak noinline void devtx_cp(struct devtx_frame *ctx) > +{ > +} > + > +BTF_SET8_START(bpf_devtx_hook_ids) > +BTF_ID_FLAGS(func, devtx_sb) > +BTF_ID_FLAGS(func, devtx_cp) > +BTF_SET8_END(bpf_devtx_hook_ids) > + > +static const struct btf_kfunc_id_set bpf_devtx_hook_set = { > + .owner = THIS_MODULE, > + .set = &bpf_devtx_hook_ids, > +}; > + > +static DEFINE_MUTEX(devtx_attach_lock); > + > +static int __bpf_devtx_detach(struct net_device *netdev, struct bpf_prog **pprog) > +{ As per my prior comment about *prog and __rcu annotations. I'm genuinely unsure how the usage of **pprog in this function sits with RCU. > + if (!*pprog) > + return -EINVAL; > + bpf_prog_put(*pprog); > + *pprog = NULL; > + > + static_branch_dec(&devtx_enabled); > + return 0; > +} > + > +static int __bpf_devtx_attach(struct net_device *netdev, int prog_fd, > + const char *attach_func_name, struct bpf_prog **pprog) > +{ > + struct bpf_prog *prog; > + int ret = 0; > + > + if (prog_fd < 0) > + return __bpf_devtx_detach(netdev, pprog); > + > + if (*pprog) > + return -EBUSY; > + > + prog = bpf_prog_get(prog_fd); > + if (IS_ERR(prog)) > + return PTR_ERR(prog); > + > + if (prog->type != BPF_PROG_TYPE_TRACING || > + prog->expected_attach_type != BPF_TRACE_FENTRY || > + !bpf_prog_is_dev_bound(prog->aux) || > + !bpf_offload_dev_match(prog, netdev) || > + strcmp(prog->aux->attach_func_name, attach_func_name)) { > + bpf_prog_put(prog); > + return -EINVAL; > + } > + > + *pprog = prog; > + static_branch_inc(&devtx_enabled); > + > + return ret; > +} ...