On Tue, Jun 13, 2023 at 8:08 AM Simon Horman <simon.horman@xxxxxxxxxxxx> wrote: > > On Mon, Jun 12, 2023 at 10:23:03AM -0700, Stanislav Fomichev wrote: > > devtx is a lightweight set of hooks before and after packet transmission. > > The hook is supposed to work for both skb and xdp paths by exposing > > a light-weight packet wrapper via devtx_frame (header portion + frags). > > > > devtx is implemented as a tracing program which has access to the > > XDP-metadata-like kfuncs. The initial set of kfuncs is implemented > > in the next patch, but the idea is similar to XDP metadata: > > the kfuncs have netdev-specific implementation, but common > > interface. Upon loading, the kfuncs are resolved to direct > > calls against per-netdev implementation. This can be achieved > > by marking devtx-tracing programs as dev-bound (largely > > reusing xdp-dev-bound program infrastructure). > > > > Attachment and detachment is implemented via syscall BPF program > > by calling bpf_devtx_sb_attach (attach to tx-submission) > > or bpf_devtx_cp_attach (attach to tx completion). Right now, > > the attachment does not return a link and doesn't support > > multiple programs. I plan to switch to Daniel's bpf_mprog infra > > once it's available. > > > > Cc: netdev@xxxxxxxxxxxxxxx > > Signed-off-by: Stanislav Fomichev <sdf@xxxxxxxxxx> > > ... > > > --- a/MAINTAINERS > > +++ b/MAINTAINERS > > @@ -22976,11 +22976,13 @@ L: bpf@xxxxxxxxxxxxxxx > > S: Supported > > F: drivers/net/ethernet/*/*/*/*/*xdp* > > F: drivers/net/ethernet/*/*/*xdp* > > +F: include/net/devtx.h > > F: include/net/xdp.h > > F: include/net/xdp_priv.h > > F: include/trace/events/xdp.h > > F: kernel/bpf/cpumap.c > > F: kernel/bpf/devmap.c > > +F: net/core/devtx.c > > F: net/core/xdp.c > > F: samples/bpf/xdp* > > F: tools/testing/selftests/bpf/*/*xdp* > > Hi Stan, > > some feedback from my side. > > > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h > > index 08fbd4622ccf..e08e3fd39dfc 100644 > > --- a/include/linux/netdevice.h > > +++ b/include/linux/netdevice.h > > @@ -2238,6 +2238,8 @@ struct net_device { > > unsigned int real_num_rx_queues; > > > > struct bpf_prog __rcu *xdp_prog; > > + struct bpf_prog __rcu *devtx_sb; > > + struct bpf_prog __rcu *devtx_cp; > > It would be good to add these new fields to the kernel doc > for struct net_device. Sure, will do! > > unsigned long gro_flush_timeout; > > int napi_defer_hard_irqs; > > #define GRO_LEGACY_MAX_SIZE 65536u > > diff --git a/include/net/devtx.h b/include/net/devtx.h > > new file mode 100644 > > index 000000000000..7eab66d0ce80 > > --- /dev/null > > +++ b/include/net/devtx.h > > @@ -0,0 +1,76 @@ > > +/* SPDX-License-Identifier: GPL-2.0-only */ > > +#ifndef __LINUX_NET_DEVTX_H__ > > +#define __LINUX_NET_DEVTX_H__ > > + > > +#include <linux/jump_label.h> > > +#include <linux/skbuff.h> > > +#include <linux/netdevice.h> > > +#include <net/xdp.h> > > + > > +struct devtx_frame { > > + void *data; > > + u16 len; > > + struct skb_shared_info *sinfo; /* for frags */ > > +}; > > + > > +#ifdef CONFIG_NET > > +void devtx_submit(struct net_device *netdev, struct devtx_frame *ctx); > > +void devtx_complete(struct net_device *netdev, struct devtx_frame *ctx); > > +bool is_devtx_kfunc(u32 kfunc_id); > > +void devtx_shutdown(struct net_device *netdev); > > + > > +static inline void devtx_frame_from_skb(struct devtx_frame *ctx, struct sk_buff *skb) > > +{ > > + ctx->data = skb->data; > > + ctx->len = skb_headlen(skb); > > + ctx->sinfo = skb_shinfo(skb); > > +} > > + > > +static inline void devtx_frame_from_xdp(struct devtx_frame *ctx, struct xdp_frame *xdpf) > > +{ > > + ctx->data = xdpf->data; > > + ctx->len = xdpf->len; > > + ctx->sinfo = xdp_frame_has_frags(xdpf) ? xdp_get_shared_info_from_frame(xdpf) : NULL; > > +} > > + > > +DECLARE_STATIC_KEY_FALSE(devtx_enabled); > > + > > +static inline bool devtx_submit_enabled(struct net_device *netdev) > > +{ > > + return static_branch_unlikely(&devtx_enabled) && > > + rcu_access_pointer(netdev->devtx_sb); > > +} > > + > > +static inline bool devtx_complete_enabled(struct net_device *netdev) > > +{ > > + return static_branch_unlikely(&devtx_enabled) && > > + rcu_access_pointer(netdev->devtx_cp); > > +} > > +#else > > +static inline void devtx_submit(struct net_device *netdev, struct devtx_frame *ctx) > > +{ > > +} > > + > > +static inline void devtx_complete(struct net_device *netdev, struct devtx_frame *ctx) > > +{ > > +} > > + > > +static inline bool is_devtx_kfunc(u32 kfunc_id) > > +{ > > + return false; > > +} > > + > > +static inline void devtx_shutdown(struct net_device *netdev) > > +{ > > +} > > + > > +static inline void devtx_frame_from_skb(struct devtx_frame *ctx, struct sk_buff *skb) > > +{ > > +} > > + > > +static inline void devtx_frame_from_xdp(struct devtx_frame *ctx, struct xdp_frame *xdpf) > > +{ > > +} > > +#endif > > + > > +#endif /* __LINUX_NET_DEVTX_H__ */ > > diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c > > index 235d81f7e0ed..9cfe96422c80 100644 > > --- a/kernel/bpf/offload.c > > +++ b/kernel/bpf/offload.c > > @@ -25,6 +25,7 @@ > > #include <linux/rhashtable.h> > > #include <linux/rtnetlink.h> > > #include <linux/rwsem.h> > > +#include <net/devtx.h> > > > > /* Protects offdevs, members of bpf_offload_netdev and offload members > > * of all progs. > > @@ -228,6 +229,7 @@ int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr) > > int err; > > > > if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS && > > + attr->prog_type != BPF_PROG_TYPE_TRACING && > > attr->prog_type != BPF_PROG_TYPE_XDP) > > return -EINVAL; > > > > @@ -238,6 +240,10 @@ int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr) > > attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY) > > return -EINVAL; > > > > + if (attr->prog_type == BPF_PROG_TYPE_TRACING && > > + !is_devtx_kfunc(prog->aux->attach_btf_id)) > > + return -EINVAL; > > + > > netdev = dev_get_by_index(current->nsproxy->net_ns, attr->prog_ifindex); > > if (!netdev) > > return -EINVAL; > > diff --git a/net/core/Makefile b/net/core/Makefile > > index 8f367813bc68..c1db05ccfac7 100644 > > --- a/net/core/Makefile > > +++ b/net/core/Makefile > > @@ -39,4 +39,5 @@ obj-$(CONFIG_FAILOVER) += failover.o > > obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o > > obj-$(CONFIG_BPF_SYSCALL) += sock_map.o > > obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o > > +obj-$(CONFIG_BPF_SYSCALL) += devtx.o > > obj-$(CONFIG_OF) += of_net.o > > diff --git a/net/core/dev.c b/net/core/dev.c > > index 3393c2f3dbe8..ef0e65e68024 100644 > > --- a/net/core/dev.c > > +++ b/net/core/dev.c > > @@ -150,6 +150,7 @@ > > #include <linux/pm_runtime.h> > > #include <linux/prandom.h> > > #include <linux/once_lite.h> > > +#include <net/devtx.h> > > > > #include "dev.h" > > #include "net-sysfs.h" > > @@ -10875,6 +10876,7 @@ void unregister_netdevice_many_notify(struct list_head *head, > > dev_shutdown(dev); > > > > dev_xdp_uninstall(dev); > > + devtx_shutdown(dev); > > bpf_dev_bound_netdev_unregister(dev); > > > > netdev_offload_xstats_disable_all(dev); > > diff --git a/net/core/devtx.c b/net/core/devtx.c > > new file mode 100644 > > index 000000000000..b7cbc26d1c01 > > --- /dev/null > > +++ b/net/core/devtx.c > > @@ -0,0 +1,208 @@ > > +// SPDX-License-Identifier: GPL-2.0-only > > + > > +#include <net/devtx.h> > > +#include <linux/filter.h> > > + > > +DEFINE_STATIC_KEY_FALSE(devtx_enabled); > > +EXPORT_SYMBOL_GPL(devtx_enabled); > > + > > +static void devtx_run(struct net_device *netdev, struct devtx_frame *ctx, struct bpf_prog **pprog) > > Is an __rcu annotation appropriate for prog here? > Also elsewhere in this patch. Good point. Maybe I should rcu_dereference it them somewhere on top. Let me try to find the best place.. > > +{ > > + struct bpf_prog *prog; > > + void *real_ctx[1] = {ctx}; > > + > > + prog = rcu_dereference(*pprog); > > + if (likely(prog)) > > + bpf_prog_run(prog, real_ctx); > > +} > > + > > +void devtx_submit(struct net_device *netdev, struct devtx_frame *ctx) > > +{ > > + rcu_read_lock(); > > + devtx_run(netdev, ctx, &netdev->devtx_sb); > > + rcu_read_unlock(); > > +} > > +EXPORT_SYMBOL_GPL(devtx_submit); > > + > > +void devtx_complete(struct net_device *netdev, struct devtx_frame *ctx) > > +{ > > + rcu_read_lock(); > > + devtx_run(netdev, ctx, &netdev->devtx_cp); > > + rcu_read_unlock(); > > +} > > +EXPORT_SYMBOL_GPL(devtx_complete); > > + > > +/** > > + * devtx_sb - Called for every egress netdev packet > > As this is a kernel doc, it would be good to document the ctx parameter here. I didn't really find a convincing way to add a comment, I've had the following which I've removed prio to submission: @ctx devtx_frame context But it doesn't seem like it brings anything useful? Or ok to keep it that way? > > + * > > + * Note: this function is never actually called by the kernel and declared > > + * only to allow loading an attaching appropriate tracepoints. > > + */ > > +__weak noinline void devtx_sb(struct devtx_frame *ctx) > > I guess this is intentional. > But gcc complains that this is neither static nor is a forward > declaration provided. Likewise for devtx_cp() Copy-pasted from hid-bpf; let me see if they have a forward decl somewhere.. > > +{ > > +} > > + > > +/** > > + * devtx_cp - Called upon egress netdev packet completion > > Likewise, here too. > > > + * > > + * Note: this function is never actually called by the kernel and declared > > + * only to allow loading an attaching appropriate tracepoints. > > + */ > > +__weak noinline void devtx_cp(struct devtx_frame *ctx) > > +{ > > +} > > + > > +BTF_SET8_START(bpf_devtx_hook_ids) > > +BTF_ID_FLAGS(func, devtx_sb) > > +BTF_ID_FLAGS(func, devtx_cp) > > +BTF_SET8_END(bpf_devtx_hook_ids) > > + > > +static const struct btf_kfunc_id_set bpf_devtx_hook_set = { > > + .owner = THIS_MODULE, > > + .set = &bpf_devtx_hook_ids, > > +}; > > + > > +static DEFINE_MUTEX(devtx_attach_lock); > > + > > +static int __bpf_devtx_detach(struct net_device *netdev, struct bpf_prog **pprog) > > +{ > > As per my prior comment about *prog and __rcu annotations. > I'm genuinely unsure how the usage of **pprog in this function sits with RCU. Yeah, I'm a bit sloppy, let me figure out a proper way to annotate it. > > + if (!*pprog) > > + return -EINVAL; > > + bpf_prog_put(*pprog); > > + *pprog = NULL; > > + > > + static_branch_dec(&devtx_enabled); > > + return 0; > > +} > > + > > +static int __bpf_devtx_attach(struct net_device *netdev, int prog_fd, > > + const char *attach_func_name, struct bpf_prog **pprog) > > +{ > > + struct bpf_prog *prog; > > + int ret = 0; > > + > > + if (prog_fd < 0) > > + return __bpf_devtx_detach(netdev, pprog); > > + > > + if (*pprog) > > + return -EBUSY; > > + > > + prog = bpf_prog_get(prog_fd); > > + if (IS_ERR(prog)) > > + return PTR_ERR(prog); > > + > > + if (prog->type != BPF_PROG_TYPE_TRACING || > > + prog->expected_attach_type != BPF_TRACE_FENTRY || > > + !bpf_prog_is_dev_bound(prog->aux) || > > + !bpf_offload_dev_match(prog, netdev) || > > + strcmp(prog->aux->attach_func_name, attach_func_name)) { > > + bpf_prog_put(prog); > > + return -EINVAL; > > + } > > + > > + *pprog = prog; > > + static_branch_inc(&devtx_enabled); > > + > > + return ret; > > +} > > ...