Re: [RFC bpf-next 3/7] bpf: implement devtx hook points

Simon Horman <simon.horman@xxxxxxxxxxxx> · Tue, 13 Jun 2023 17:08:06 +0200

On Mon, Jun 12, 2023 at 10:23:03AM -0700, Stanislav Fomichev wrote:
> devtx is a lightweight set of hooks before and after packet transmission.
> The hook is supposed to work for both skb and xdp paths by exposing
> a light-weight packet wrapper via devtx_frame (header portion + frags).
> 
> devtx is implemented as a tracing program which has access to the
> XDP-metadata-like kfuncs. The initial set of kfuncs is implemented
> in the next patch, but the idea is similar to XDP metadata:
> the kfuncs have netdev-specific implementation, but common
> interface. Upon loading, the kfuncs are resolved to direct
> calls against per-netdev implementation. This can be achieved
> by marking devtx-tracing programs as dev-bound (largely
> reusing xdp-dev-bound program infrastructure).
> 
> Attachment and detachment is implemented via syscall BPF program
> by calling bpf_devtx_sb_attach (attach to tx-submission)
> or bpf_devtx_cp_attach (attach to tx completion). Right now,
> the attachment does not return a link and doesn't support
> multiple programs. I plan to switch to Daniel's bpf_mprog infra
> once it's available.
> 
> Cc: netdev@xxxxxxxxxxxxxxx
> Signed-off-by: Stanislav Fomichev <sdf@xxxxxxxxxx>

...

> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -22976,11 +22976,13 @@ L:	bpf@xxxxxxxxxxxxxxx
>  S:	Supported
>  F:	drivers/net/ethernet/*/*/*/*/*xdp*
>  F:	drivers/net/ethernet/*/*/*xdp*
> +F:	include/net/devtx.h
>  F:	include/net/xdp.h
>  F:	include/net/xdp_priv.h
>  F:	include/trace/events/xdp.h
>  F:	kernel/bpf/cpumap.c
>  F:	kernel/bpf/devmap.c
> +F:	net/core/devtx.c
>  F:	net/core/xdp.c
>  F:	samples/bpf/xdp*
>  F:	tools/testing/selftests/bpf/*/*xdp*

Hi Stan,

some feedback from my side.

> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 08fbd4622ccf..e08e3fd39dfc 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -2238,6 +2238,8 @@ struct net_device {
>  	unsigned int		real_num_rx_queues;
>  
>  	struct bpf_prog __rcu	*xdp_prog;
> +	struct bpf_prog	__rcu	*devtx_sb;
> +	struct bpf_prog	__rcu	*devtx_cp;

It would be good to add these new fields to the kernel doc
for struct net_device.

>  	unsigned long		gro_flush_timeout;
>  	int			napi_defer_hard_irqs;
>  #define GRO_LEGACY_MAX_SIZE	65536u
> diff --git a/include/net/devtx.h b/include/net/devtx.h
> new file mode 100644
> index 000000000000..7eab66d0ce80
> --- /dev/null
> +++ b/include/net/devtx.h
> @@ -0,0 +1,76 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +#ifndef __LINUX_NET_DEVTX_H__
> +#define __LINUX_NET_DEVTX_H__
> +
> +#include <linux/jump_label.h>
> +#include <linux/skbuff.h>
> +#include <linux/netdevice.h>
> +#include <net/xdp.h>
> +
> +struct devtx_frame {
> +	void *data;
> +	u16 len;
> +	struct skb_shared_info *sinfo; /* for frags */
> +};
> +
> +#ifdef CONFIG_NET
> +void devtx_submit(struct net_device *netdev, struct devtx_frame *ctx);
> +void devtx_complete(struct net_device *netdev, struct devtx_frame *ctx);
> +bool is_devtx_kfunc(u32 kfunc_id);
> +void devtx_shutdown(struct net_device *netdev);
> +
> +static inline void devtx_frame_from_skb(struct devtx_frame *ctx, struct sk_buff *skb)
> +{
> +	ctx->data = skb->data;
> +	ctx->len = skb_headlen(skb);
> +	ctx->sinfo = skb_shinfo(skb);
> +}
> +
> +static inline void devtx_frame_from_xdp(struct devtx_frame *ctx, struct xdp_frame *xdpf)
> +{
> +	ctx->data = xdpf->data;
> +	ctx->len = xdpf->len;
> +	ctx->sinfo = xdp_frame_has_frags(xdpf) ? xdp_get_shared_info_from_frame(xdpf) : NULL;
> +}
> +
> +DECLARE_STATIC_KEY_FALSE(devtx_enabled);
> +
> +static inline bool devtx_submit_enabled(struct net_device *netdev)
> +{
> +	return static_branch_unlikely(&devtx_enabled) &&
> +	       rcu_access_pointer(netdev->devtx_sb);
> +}
> +
> +static inline bool devtx_complete_enabled(struct net_device *netdev)
> +{
> +	return static_branch_unlikely(&devtx_enabled) &&
> +	       rcu_access_pointer(netdev->devtx_cp);
> +}
> +#else
> +static inline void devtx_submit(struct net_device *netdev, struct devtx_frame *ctx)
> +{
> +}
> +
> +static inline void devtx_complete(struct net_device *netdev, struct devtx_frame *ctx)
> +{
> +}
> +
> +static inline bool is_devtx_kfunc(u32 kfunc_id)
> +{
> +	return false;
> +}
> +
> +static inline void devtx_shutdown(struct net_device *netdev)
> +{
> +}
> +
> +static inline void devtx_frame_from_skb(struct devtx_frame *ctx, struct sk_buff *skb)
> +{
> +}
> +
> +static inline void devtx_frame_from_xdp(struct devtx_frame *ctx, struct xdp_frame *xdpf)
> +{
> +}
> +#endif
> +
> +#endif /* __LINUX_NET_DEVTX_H__ */
> diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
> index 235d81f7e0ed..9cfe96422c80 100644
> --- a/kernel/bpf/offload.c
> +++ b/kernel/bpf/offload.c
> @@ -25,6 +25,7 @@
>  #include <linux/rhashtable.h>
>  #include <linux/rtnetlink.h>
>  #include <linux/rwsem.h>
> +#include <net/devtx.h>
>  
>  /* Protects offdevs, members of bpf_offload_netdev and offload members
>   * of all progs.
> @@ -228,6 +229,7 @@ int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr)
>  	int err;
>  
>  	if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS &&
> +	    attr->prog_type != BPF_PROG_TYPE_TRACING &&
>  	    attr->prog_type != BPF_PROG_TYPE_XDP)
>  		return -EINVAL;
>  
> @@ -238,6 +240,10 @@ int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr)
>  	    attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY)
>  		return -EINVAL;
>  
> +	if (attr->prog_type == BPF_PROG_TYPE_TRACING &&
> +	    !is_devtx_kfunc(prog->aux->attach_btf_id))
> +		return -EINVAL;
> +
>  	netdev = dev_get_by_index(current->nsproxy->net_ns, attr->prog_ifindex);
>  	if (!netdev)
>  		return -EINVAL;
> diff --git a/net/core/Makefile b/net/core/Makefile
> index 8f367813bc68..c1db05ccfac7 100644
> --- a/net/core/Makefile
> +++ b/net/core/Makefile
> @@ -39,4 +39,5 @@ obj-$(CONFIG_FAILOVER) += failover.o
>  obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o
>  obj-$(CONFIG_BPF_SYSCALL) += sock_map.o
>  obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o
> +obj-$(CONFIG_BPF_SYSCALL) += devtx.o
>  obj-$(CONFIG_OF)	+= of_net.o
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 3393c2f3dbe8..ef0e65e68024 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -150,6 +150,7 @@
>  #include <linux/pm_runtime.h>
>  #include <linux/prandom.h>
>  #include <linux/once_lite.h>
> +#include <net/devtx.h>
>  
>  #include "dev.h"
>  #include "net-sysfs.h"
> @@ -10875,6 +10876,7 @@ void unregister_netdevice_many_notify(struct list_head *head,
>  		dev_shutdown(dev);
>  
>  		dev_xdp_uninstall(dev);
> +		devtx_shutdown(dev);
>  		bpf_dev_bound_netdev_unregister(dev);
>  
>  		netdev_offload_xstats_disable_all(dev);
> diff --git a/net/core/devtx.c b/net/core/devtx.c
> new file mode 100644
> index 000000000000..b7cbc26d1c01
> --- /dev/null
> +++ b/net/core/devtx.c
> @@ -0,0 +1,208 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +
> +#include <net/devtx.h>
> +#include <linux/filter.h>
> +
> +DEFINE_STATIC_KEY_FALSE(devtx_enabled);
> +EXPORT_SYMBOL_GPL(devtx_enabled);
> +
> +static void devtx_run(struct net_device *netdev, struct devtx_frame *ctx, struct bpf_prog **pprog)

Is an __rcu annotation appropriate for prog here?
Also elsewhere in this patch.

> +{
> +	struct bpf_prog *prog;
> +	void *real_ctx[1] = {ctx};
> +
> +	prog = rcu_dereference(*pprog);
> +	if (likely(prog))
> +		bpf_prog_run(prog, real_ctx);
> +}
> +
> +void devtx_submit(struct net_device *netdev, struct devtx_frame *ctx)
> +{
> +	rcu_read_lock();
> +	devtx_run(netdev, ctx, &netdev->devtx_sb);
> +	rcu_read_unlock();
> +}
> +EXPORT_SYMBOL_GPL(devtx_submit);
> +
> +void devtx_complete(struct net_device *netdev, struct devtx_frame *ctx)
> +{
> +	rcu_read_lock();
> +	devtx_run(netdev, ctx, &netdev->devtx_cp);
> +	rcu_read_unlock();
> +}
> +EXPORT_SYMBOL_GPL(devtx_complete);
> +
> +/**
> + * devtx_sb - Called for every egress netdev packet

As this is a kernel doc, it would be good to document the ctx parameter here.

> + *
> + * Note: this function is never actually called by the kernel and declared
> + * only to allow loading an attaching appropriate tracepoints.
> + */
> +__weak noinline void devtx_sb(struct devtx_frame *ctx)

I guess this is intentional.
But gcc complains that this is neither static nor is a forward
declaration provided. Likewise for devtx_cp()

> +{
> +}
> +
> +/**
> + * devtx_cp - Called upon egress netdev packet completion

Likewise, here too.

> + *
> + * Note: this function is never actually called by the kernel and declared
> + * only to allow loading an attaching appropriate tracepoints.
> + */
> +__weak noinline void devtx_cp(struct devtx_frame *ctx)
> +{
> +}
> +
> +BTF_SET8_START(bpf_devtx_hook_ids)
> +BTF_ID_FLAGS(func, devtx_sb)
> +BTF_ID_FLAGS(func, devtx_cp)
> +BTF_SET8_END(bpf_devtx_hook_ids)
> +
> +static const struct btf_kfunc_id_set bpf_devtx_hook_set = {
> +	.owner = THIS_MODULE,
> +	.set   = &bpf_devtx_hook_ids,
> +};
> +
> +static DEFINE_MUTEX(devtx_attach_lock);
> +
> +static int __bpf_devtx_detach(struct net_device *netdev, struct bpf_prog **pprog)
> +{

As per my prior comment about *prog and __rcu annotations.
I'm genuinely unsure how the usage of **pprog in this function sits with RCU.

> +	if (!*pprog)
> +		return -EINVAL;
> +	bpf_prog_put(*pprog);
> +	*pprog = NULL;
> +
> +	static_branch_dec(&devtx_enabled);
> +	return 0;
> +}
> +
> +static int __bpf_devtx_attach(struct net_device *netdev, int prog_fd,
> +			      const char *attach_func_name, struct bpf_prog **pprog)
> +{
> +	struct bpf_prog *prog;
> +	int ret = 0;
> +
> +	if (prog_fd < 0)
> +		return __bpf_devtx_detach(netdev, pprog);
> +
> +	if (*pprog)
> +		return -EBUSY;
> +
> +	prog = bpf_prog_get(prog_fd);
> +	if (IS_ERR(prog))
> +		return PTR_ERR(prog);
> +
> +	if (prog->type != BPF_PROG_TYPE_TRACING ||
> +	    prog->expected_attach_type != BPF_TRACE_FENTRY ||
> +	    !bpf_prog_is_dev_bound(prog->aux) ||
> +	    !bpf_offload_dev_match(prog, netdev) ||
> +	    strcmp(prog->aux->attach_func_name, attach_func_name)) {
> +		bpf_prog_put(prog);
> +		return -EINVAL;
> +	}
> +
> +	*pprog = prog;
> +	static_branch_inc(&devtx_enabled);
> +
> +	return ret;
> +}

...