Hangbin Liu <liuhangbin@xxxxxxxxx> writes:

> This patch adds two flags BPF_F_BROADCAST and BPF_F_EXCLUDE_INGRESS to
> extend xdp_redirect_map for broadcast support.
>
> With BPF_F_BROADCAST the packet will be broadcasted to all the interfaces
> in the map. with BPF_F_EXCLUDE_INGRESS the ingress interface will be
> excluded when do broadcasting.
>
> When getting the devices in dev hash map via dev_map_hash_get_next_key(),
> there is a possibility that we fall back to the first key when a device
> was removed. This will duplicate packets on some interfaces. So just walk
> the whole buckets to avoid this issue. For dev array map, we also walk the
> whole map to find valid interfaces.
>
> Function bpf_clear_redirect_map() was removed in
> commit ee75aef23afe ("bpf, xdp: Restructure redirect actions").
> Add it back as we need to use ri->map again.
>
> Here is the performance result by using 10Gb i40e NIC, do XDP_DROP on
> veth peer, run xdp_redirect_{map, map_multi} in sample/bpf and send pkts
> via pktgen cmd:
> ./pktgen_sample03_burst_single_flow.sh -i eno1 -d $dst_ip -m $dst_mac -t 10 -s 64
>
> There are some drop back as we need to loop the map and get each interface.
>
> Version          | Test                               | Generic | Native
> 5.12 rc4         | redirect_map i40e->i40e            |    1.9M |   9.6M
> 5.12 rc4         | redirect_map i40e->veth            |    1.7M |  11.7M
> 5.12 rc4 + patch | redirect_map i40e->i40e            |    1.9M |   9.3M
> 5.12 rc4 + patch | redirect_map i40e->veth            |    1.7M |  11.4M
> 5.12 rc4 + patch | redirect_map multi i40e->i40e      |    1.9M |   8.9M
> 5.12 rc4 + patch | redirect_map multi i40e->veth      |    1.7M |  10.9M
> 5.12 rc4 + patch | redirect_map multi i40e->mlx4+veth |    1.2M |   3.8M
>
> Signed-off-by: Hangbin Liu <liuhangbin@xxxxxxxxx>

Mostly looking good, but found a memory leak in the error path for
generic XDP :(

See below...

> --- > v5: > a) use xchg() instead of READ/WRITE_ONCE and no need to clear ri->flags > in xdp_do_redirect() > b) Do not use get_next_key() as we may restart looping from the first key > when remove/update a dev in hash map. Just walk the map directly to > get all the devices and ignore the new added/deleted objects. > c) Loop all the array map instead stop at the first hole. > > v4: > a) add a new argument flag_mask to __bpf_xdp_redirect_map() filter out > invalid map. > b) __bpf_xdp_redirect_map() sets the map pointer if the broadcast flag > is set and clears it if the flag isn't set > c) xdp_do_redirect() does the READ_ONCE/WRITE_ONCE on ri->map to check > if we should enqueue multi > > v3: > a) Rebase the code on Björn's "bpf, xdp: Restructure redirect actions". > - Add struct bpf_map *map back to struct bpf_redirect_info as we need > it for multicast. > - Add bpf_clear_redirect_map() back for devmap.c > - Add devmap_lookup_elem() as we need it in general path. > b) remove tmp_key in devmap_get_next_obj() > > v2: Fix flag renaming issue in v1 > --- > include/linux/bpf.h | 20 ++++ > include/linux/filter.h | 18 +++- > include/net/xdp.h | 1 + > include/uapi/linux/bpf.h | 17 +++- > kernel/bpf/cpumap.c | 3 +- > kernel/bpf/devmap.c | 172 ++++++++++++++++++++++++++++++++- > net/core/filter.c | 33 ++++++- > net/core/xdp.c | 29 ++++++ > net/xdp/xskmap.c | 3 +- > tools/include/uapi/linux/bpf.h | 17 +++- > 10 files changed, 299 insertions(+), 14 deletions(-) > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index ff8cd68c01b3..ab6bde1f3b91 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -1496,8 +1496,13 @@ int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, > struct net_device *dev_rx); > int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, > struct net_device *dev_rx); > +int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, > + struct bpf_map *map, bool exclude_ingress); > int 
dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, > struct bpf_prog *xdp_prog); > +int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, > + struct bpf_prog *xdp_prog, struct bpf_map *map, > + bool exclude_ingress); > bool dev_map_can_have_prog(struct bpf_map *map); > > void __cpu_map_flush(void); > @@ -1665,6 +1670,13 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, > return 0; > } > > +static inline > +int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, > + struct bpf_map *map, bool exclude_ingress) > +{ > + return 0; > +} > + > struct sk_buff; > > static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, > @@ -1674,6 +1686,14 @@ static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, > return 0; > } > > +static inline > +int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, > + struct bpf_prog *xdp_prog, struct bpf_map *map, > + bool exclude_ingress) > +{ > + return 0; > +} > + > static inline void __cpu_map_flush(void) > { > } > diff --git a/include/linux/filter.h b/include/linux/filter.h > index 9a09547bc7ba..e4885b42d754 100644 > --- a/include/linux/filter.h > +++ b/include/linux/filter.h > @@ -646,6 +646,7 @@ struct bpf_redirect_info { > u32 flags; > u32 tgt_index; > void *tgt_value; > + struct bpf_map *map; > u32 map_id; > enum bpf_map_type map_type; > u32 kern_flags; > @@ -1464,17 +1465,18 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol, > } > #endif /* IS_ENABLED(CONFIG_IPV6) */ > > -static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifindex, u64 flags, > +static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifindex, > + u64 flags, u64 flag_mask, > void *lookup_elem(struct bpf_map *map, u32 key)) > { > struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); > > /* Lower bits of the flags are used as return code on lookup failure */ > - if 
(unlikely(flags > XDP_TX)) > + if (unlikely(flags & ~(BPF_F_ACTION_MASK | flag_mask))) > return XDP_ABORTED; > > ri->tgt_value = lookup_elem(map, ifindex); > - if (unlikely(!ri->tgt_value)) { > + if (unlikely(!ri->tgt_value) && !(flags & BPF_F_BROADCAST)) { > /* If the lookup fails we want to clear out the state in the > * redirect_info struct completely, so that if an eBPF program > * performs multiple lookups, the last one always takes > @@ -1482,13 +1484,21 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifind > */ > ri->map_id = INT_MAX; /* Valid map id idr range: [1,INT_MAX[ */ > ri->map_type = BPF_MAP_TYPE_UNSPEC; > - return flags; > + return flags & BPF_F_ACTION_MASK; > } > > ri->tgt_index = ifindex; > ri->map_id = map->id; > ri->map_type = map->map_type; > > + if (flags & BPF_F_BROADCAST) { > + WRITE_ONCE(ri->map, map); > + ri->flags = flags; > + } else { > + WRITE_ONCE(ri->map, NULL); > + ri->flags = 0; > + } > + > return XDP_REDIRECT; > } > > diff --git a/include/net/xdp.h b/include/net/xdp.h > index a5bc214a49d9..5533f0ab2afc 100644 > --- a/include/net/xdp.h > +++ b/include/net/xdp.h > @@ -170,6 +170,7 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, > struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, > struct net_device *dev); > int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp); > +struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf); > > static inline > void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp) > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index 85c924bc21b1..b178f5b0d3f4 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -2534,8 +2534,12 @@ union bpf_attr { > * The lower two bits of *flags* are used as the return code if > * the map lookup fails. This is so that the return value can be > * one of the XDP program return codes up to **XDP_TX**, as chosen > - * by the caller. 
Any higher bits in the *flags* argument must be > - * unset. > + * by the caller. The higher bits of *flags* can be set to > + * BPF_F_BROADCAST or BPF_F_EXCLUDE_INGRESS as defined below. > + * > + * With BPF_F_BROADCAST the packet will be broadcasted to all the > + * interfaces in the map. with BPF_F_EXCLUDE_INGRESS the ingress > + * interface will be excluded when do broadcasting. > * > * See also **bpf_redirect**\ (), which only supports redirecting > * to an ifindex, but doesn't require a map to do so. > @@ -5052,6 +5056,15 @@ enum { > BPF_F_BPRM_SECUREEXEC = (1ULL << 0), > }; > > +/* Flags for bpf_redirect_map helper */ > +enum { > + BPF_F_BROADCAST = (1ULL << 3), > + BPF_F_EXCLUDE_INGRESS = (1ULL << 4), > +}; > + > +#define BPF_F_ACTION_MASK (XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX) > +#define BPF_F_REDIR_MASK (BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS) > + > #define __bpf_md_ptr(type, name) \ > union { \ > type name; \ > diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c > index 0cf2791d5099..2c33a7a09783 100644 > --- a/kernel/bpf/cpumap.c > +++ b/kernel/bpf/cpumap.c > @@ -601,7 +601,8 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) > > static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) > { > - return __bpf_xdp_redirect_map(map, ifindex, flags, __cpu_map_lookup_elem); > + return __bpf_xdp_redirect_map(map, ifindex, flags, 0, > + __cpu_map_lookup_elem); > } > > static int cpu_map_btf_id; > diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c > index 3980fb3bfb09..599a96c9d2c0 100644 > --- a/kernel/bpf/devmap.c > +++ b/kernel/bpf/devmap.c > @@ -198,6 +198,7 @@ static void dev_map_free(struct bpf_map *map) > list_del_rcu(&dtab->list); > spin_unlock(&dev_map_lock); > > + bpf_clear_redirect_map(map); > synchronize_rcu(); > > /* Make sure prior __dev_map_entry_free() have completed. 
*/ > @@ -515,6 +516,101 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, > return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog); > } > > +static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_buff *xdp, > + int exclude_ifindex) > +{ > + if (!obj || obj->dev->ifindex == exclude_ifindex || > + !obj->dev->netdev_ops->ndo_xdp_xmit) > + return false; > + > + if (xdp_ok_fwd_dev(obj->dev, xdp->data_end - xdp->data)) > + return false; > + > + return true; > +} > + > +static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj, > + struct net_device *dev_rx, > + struct xdp_frame *xdpf) > +{ > + struct xdp_frame *nxdpf; > + > + nxdpf = xdpf_clone(xdpf); > + if (unlikely(!nxdpf)) { > + xdp_return_frame_rx_napi(xdpf); > + return -ENOMEM; > + } > + > + bq_enqueue(obj->dev, nxdpf, dev_rx, obj->xdp_prog); > + > + return 0; > +} > + > +int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, > + struct bpf_map *map, bool exclude_ingress) > +{ > + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); > + int exclude_ifindex = exclude_ingress ? 
dev_rx->ifindex : 0; > + struct bpf_dtab_netdev *dst, *last_dst = NULL; > + struct hlist_head *head; > + struct hlist_node *next; > + struct xdp_frame *xdpf; > + unsigned int i; > + int err; > + > + xdpf = xdp_convert_buff_to_frame(xdp); > + if (unlikely(!xdpf)) > + return -ENOSPC; > + > + if (map->map_type == BPF_MAP_TYPE_DEVMAP) { > + for (i = 0; i < map->max_entries; i++) { > + dst = READ_ONCE(dtab->netdev_map[i]); > + if (!is_valid_dst(dst, xdp, exclude_ifindex)) > + continue; > + > + /* we only need n-1 clones; last_dst enqueued below */ > + if (!last_dst) { > + last_dst = dst; > + continue; > + } > + > + err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf); > + if (err) > + return err; > + > + last_dst = dst; > + } > + } else { /* BPF_MAP_TYPE_DEVMAP_HASH */ > + for (i = 0; i < dtab->n_buckets; i++) { > + head = dev_map_index_hash(dtab, i); > + hlist_for_each_entry_safe(dst, next, head, index_hlist) { > + if (!is_valid_dst(dst, xdp, exclude_ifindex)) > + continue; > + > + /* we only need n-1 clones; last_dst enqueued below */ > + if (!last_dst) { > + last_dst = dst; > + continue; > + } > + > + err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf); > + if (err) > + return err; > + > + last_dst = dst; > + } > + } > + } > + > + /* consume the last copy of the frame */ > + if (last_dst) > + bq_enqueue(last_dst->dev, xdpf, dev_rx, last_dst->xdp_prog); > + else > + xdp_return_frame_rx_napi(xdpf); /* dtab is empty */ > + > + return 0; > +} > + > int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, > struct bpf_prog *xdp_prog) > { > @@ -529,6 +625,76 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, > return 0; > } > > +int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, > + struct bpf_prog *xdp_prog, struct bpf_map *map, > + bool exclude_ingress) > +{ > + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); > + int exclude_ifindex = exclude_ingress ? 
dev->ifindex : 0;
> +	struct bpf_dtab_netdev *dst, *last_dst = NULL;
> +	struct hlist_head *head;
> +	struct hlist_node *next;
> +	struct sk_buff *nskb;
> +	unsigned int i;
> +	int err;
> +
> +	if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
> +		for (i = 0; i < map->max_entries; i++) {
> +			dst = READ_ONCE(dtab->netdev_map[i]);
> +			if (!dst || dst->dev->ifindex == exclude_ifindex)
> +				continue;
> +
> +			/* we only need n-1 clones; last_dst enqueued below */
> +			if (!last_dst) {
> +				last_dst = dst;
> +				continue;
> +			}
> +
> +			nskb = skb_clone(skb, GFP_ATOMIC);
> +			if (!nskb)
> +				return -ENOMEM;
> +
> +			err = dev_map_generic_redirect(last_dst, nskb, xdp_prog);
> +			if (err)
> +				return err;

In dev_map_enqueue_multi() you're using a helper that makes sure to free
the original frame before returning an error, but here you're
open-coding it, which means that these two error returns will leak the
original skb. Maybe introduce a similar dev_map_redirect_clone() helper
that also frees the skb on error? That would make the two functions more
similar as well (and hopefully make any future consolidation easier).

> +			last_dst = dst;
> +		}
> +	} else { /* BPF_MAP_TYPE_DEVMAP_HASH */
> +		for (i = 0; i < dtab->n_buckets; i++) {
> +			head = dev_map_index_hash(dtab, i);
> +			hlist_for_each_entry_safe(dst, next, head, index_hlist) {
> +				if (!dst || dst->dev->ifindex == exclude_ifindex)
> +					continue;
> +
> +				/* we only need n-1 clones; last_dst enqueued below */
> +				if (!last_dst) {
> +					last_dst = dst;
> +					continue;
> +				}
> +
> +				nskb = skb_clone(skb, GFP_ATOMIC);
> +				if (!nskb)
> +					return -ENOMEM;
> +
> +				err = dev_map_generic_redirect(last_dst, nskb, xdp_prog);
> +				if (err)
> +					return err;

Same here, of course...

-Toke