On 7/10/20 4:49 PM, Andrii Nakryiko wrote: > Add bpf_link-based API (bpf_xdp_link) to attach BPF XDP program through > BPF_LINK_CREATE command. > > bpf_xdp_link is mutually exclusive with direct BPF program attachment, > previous BPF program should be detached prior to attempting to create a new > bpf_xdp_link attachment (for a given XDP mode). Once link is attached, it > can't be replaced by other BPF program attachment or link attachment. It will > be detached only when the last BPF link FD is closed. > > bpf_xdp_link will be auto-detached when net_device is shutdown, similarly to > how other BPF links behave (cgroup, flow_dissector). At that point bpf_link > will become defunct, but won't be destroyed until last FD is closed. > > Signed-off-by: Andrii Nakryiko <andriin@xxxxxx> > --- > include/linux/netdevice.h | 6 + > include/uapi/linux/bpf.h | 7 +- > kernel/bpf/syscall.c | 5 + > net/core/dev.c | 385 ++++++++++++++++++++++++++++---------- That's big diff for 1 patch. A fair bit of is refactoring / code movement that can be done in a separate refactoring patch making it cleaer what changes you need for the bpf_link piece. > 4 files changed, 301 insertions(+), 102 deletions(-) > > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h > index d5630e535836..93bcd81d645d 100644 > --- a/include/linux/netdevice.h > +++ b/include/linux/netdevice.h > @@ -886,6 +886,7 @@ struct bpf_prog_offload_ops; > struct netlink_ext_ack; > struct xdp_umem; > struct xdp_dev_bulk_queue; > +struct bpf_xdp_link; > > enum bpf_xdp_mode { > XDP_MODE_SKB = 0, > @@ -896,6 +897,7 @@ enum bpf_xdp_mode { > > struct bpf_xdp_entity { > struct bpf_prog *prog; > + struct bpf_xdp_link *link; > }; > > struct netdev_bpf { > @@ -3824,6 +3826,10 @@ typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); > int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, > int fd, int expected_fd, u32 flags); > u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode); > + > +struct bpf_xdp_link; already stated above. > +int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); > + > int xdp_umem_query(struct net_device *dev, u16 queue_id); > > int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index 548a749aebb3..41eba148217b 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -227,6 +227,7 @@ enum bpf_attach_type { > BPF_CGROUP_INET6_GETSOCKNAME, > BPF_XDP_DEVMAP, > BPF_CGROUP_INET_SOCK_RELEASE, > + BPF_XDP, This really does not add value for the uapi. The link_type uniquely identifies the type and the expected program type. > __MAX_BPF_ATTACH_TYPE > }; > > @@ -239,6 +240,7 @@ enum bpf_link_type { > BPF_LINK_TYPE_CGROUP = 3, > BPF_LINK_TYPE_ITER = 4, > BPF_LINK_TYPE_NETNS = 5, > + BPF_LINK_TYPE_XDP = 6, > > MAX_BPF_LINK_TYPE, > }; > @@ -604,7 +606,10 @@ union bpf_attr { > > struct { /* struct used by BPF_LINK_CREATE command */ > __u32 prog_fd; /* eBPF program to attach */ > - __u32 target_fd; /* object to attach to */ > + union { > + __u32 target_fd; /* object to attach to */ > + __u32 target_ifindex; /* target ifindex */ > + }; > __u32 attach_type; /* attach type */ > __u32 flags; /* extra flags */ > } link_create; > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > index 156f51ffada2..eb4ed4b29418 100644 > --- a/kernel/bpf/syscall.c > +++ b/kernel/bpf/syscall.c > @@ -2817,6 +2817,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) > return BPF_PROG_TYPE_CGROUP_SOCKOPT; > case BPF_TRACE_ITER: > return BPF_PROG_TYPE_TRACING; > + case BPF_XDP: > + return BPF_PROG_TYPE_XDP; > default: > return BPF_PROG_TYPE_UNSPEC; > } > @@ -3890,6 +3892,9 @@ static int link_create(union bpf_attr *attr) > case BPF_PROG_TYPE_FLOW_DISSECTOR: > ret = netns_bpf_link_create(attr, prog); > break; > + case BPF_PROG_TYPE_XDP: > + ret = bpf_xdp_link_attach(attr, prog); > + break; > default: > ret = -EINVAL; > } > diff --git a/net/core/dev.c b/net/core/dev.c > index d3b82b664e2d..84f755a1ec36 100644 > --- a/net/core/dev.c > +++ b/net/core/dev.c > @@ -8713,8 +8713,47 @@ int dev_change_proto_down_generic(struct net_device *dev, bool proto_down) > } > EXPORT_SYMBOL(dev_change_proto_down_generic); > > -static struct bpf_prog *dev_xdp_prog(struct net_device *dev, enum bpf_xdp_mode mode) > +struct bpf_xdp_link { > + struct bpf_link link; > + struct net_device *dev; /* protected by rtnl_lock, no refcnt held */ > + int flags; > +}; > + > +static enum bpf_xdp_mode dev_xdp_mode(u32 flags) > +{ > + if (flags & XDP_FLAGS_HW_MODE) > + return XDP_MODE_HW; > + if (flags & XDP_FLAGS_DRV_MODE) > + return XDP_MODE_DRV; > + return XDP_MODE_SKB; > +} > + > +static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode) > { > + switch (mode) { > + case XDP_MODE_SKB: > + return generic_xdp_install; > + case XDP_MODE_DRV: > + case XDP_MODE_HW: > + return dev->netdev_ops->ndo_bpf; > + default: > + return NULL; > + }; > +} > + > +static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev, > + enum bpf_xdp_mode mode) > +{ > + return dev->xdp_state[mode].link; > +} > + > +static struct bpf_prog *dev_xdp_prog(struct net_device *dev, > + enum bpf_xdp_mode mode) > +{ > + struct bpf_xdp_link *link = dev_xdp_link(dev, mode); > + > + if (link) > + return link->link.prog; > return dev->xdp_state[mode].prog; > } > > @@ -8725,9 +8764,17 @@ u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) > return prog ? prog->aux->id : 0; > } > > +static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode, > + struct bpf_xdp_link *link) > +{ > + dev->xdp_state[mode].link = link; > + dev->xdp_state[mode].prog = NULL; > +} > + > static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode, > struct bpf_prog *prog) > { > + dev->xdp_state[mode].link = NULL; > dev->xdp_state[mode].prog = prog; > } > > @@ -8744,6 +8791,14 @@ static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode, > xdp.flags = flags; > xdp.prog = prog; > > + /* Drivers assume refcnt is already incremented (i.e, prog pointer is > + * "moved" into driver), so they don't increment it on their own, but > + * they do decrement refcnt when program is detached or replaced. > + * Given net_device also owns link/prog, we need to bump refcnt here > + * to prevent drivers from underflowing it. > + */ > + if (prog) > + bpf_prog_inc(prog); Why is this refcnt bump not needed today but is needed for your change? > err = bpf_op(dev, &xdp); > if (err) > return err; and the error path is not decrementing it. > @@ -8756,39 +8811,221 @@ static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode, > > static void dev_xdp_uninstall(struct net_device *dev) > { > - bpf_op_t ndo_bpf; > + struct bpf_xdp_link *link; > + struct bpf_prog *prog; > + enum bpf_xdp_mode mode; > + bpf_op_t bpf_op; > > - /* Remove generic XDP */ > - WARN_ON(dev_xdp_install(dev, XDP_MODE_SKB, generic_xdp_install, > - NULL, 0, NULL)); > - dev_xdp_set_prog(dev, XDP_MODE_SKB, NULL); > + ASSERT_RTNL(); > > - /* Remove from the driver */ > - ndo_bpf = dev->netdev_ops->ndo_bpf; > - if (!ndo_bpf) > - return; > + for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) { > + prog = dev_xdp_prog(dev, mode); > + if (!prog) > + continue; > + > + bpf_op = dev_xdp_bpf_op(dev, mode); > + WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); > + > + /* auto-detach link from net device */ > + link = dev_xdp_link(dev, mode); > + if (link) > + link->dev = NULL; > + else > + bpf_prog_put(prog); > + > + dev_xdp_set_link(dev, mode, NULL); > + } > +} > + > +static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack, > + struct bpf_xdp_link *link, struct bpf_prog *new_prog, > + struct bpf_prog *old_prog, u32 flags) > +{ > + struct bpf_prog *cur_prog; > + enum bpf_xdp_mode mode; > + bpf_op_t bpf_op; > + int err; > + > + ASSERT_RTNL(); > > - if (dev_xdp_prog_id(dev, XDP_MODE_DRV)) { > - WARN_ON(dev_xdp_install(dev, XDP_MODE_DRV, ndo_bpf, > - NULL, 0, NULL)); > - dev_xdp_set_prog(dev, XDP_MODE_DRV, NULL); > + /* link supports only XDP mode flags */ > + if (link && (flags & ~XDP_FLAGS_MODES)) > + return -EINVAL; everyone of the -errno returns needs an extack message explaining why