Re: [PATCHv5 bpf-next 2/4] xdp: extend xdp_redirect_map with broadcast support

Toke Høiland-Jørgensen <toke@xxxxxxxxxx> · Tue, 13 Apr 2021 17:38:51 +0200

Hangbin Liu <liuhangbin@xxxxxxxxx> writes:

> This patch adds two flags BPF_F_BROADCAST and BPF_F_EXCLUDE_INGRESS to
> extend xdp_redirect_map for broadcast support.
>
> With BPF_F_BROADCAST the packet will be broadcast to all the interfaces
> in the map. With BPF_F_EXCLUDE_INGRESS the ingress interface will be
> excluded when broadcasting.
>
> When getting the devices in dev hash map via dev_map_hash_get_next_key(),
> there is a possibility that we fall back to the first key when a device
> was removed. This will duplicate packets on some interfaces. So just walk
> all the buckets to avoid this issue. For dev array map, we also walk the
> whole map to find valid interfaces.
>
> Function bpf_clear_redirect_map() was removed in
> commit ee75aef23afe ("bpf, xdp: Restructure redirect actions").
> Add it back as we need to use ri->map again.
>
> Here are the performance results using a 10Gb i40e NIC, doing XDP_DROP on
> the veth peer, running xdp_redirect_{map, map_multi} from samples/bpf and
> sending packets via the pktgen command:
> ./pktgen_sample03_burst_single_flow.sh -i eno1 -d $dst_ip -m $dst_mac -t 10 -s 64
>
> There is some performance drop, as we need to loop over the map and get each interface.
>
> Version          | Test                                | Generic | Native
> 5.12 rc4         | redirect_map        i40e->i40e      |    1.9M |  9.6M
> 5.12 rc4         | redirect_map        i40e->veth      |    1.7M | 11.7M
> 5.12 rc4 + patch | redirect_map        i40e->i40e      |    1.9M |  9.3M
> 5.12 rc4 + patch | redirect_map        i40e->veth      |    1.7M | 11.4M
> 5.12 rc4 + patch | redirect_map multi  i40e->i40e      |    1.9M |  8.9M
> 5.12 rc4 + patch | redirect_map multi  i40e->veth      |    1.7M | 10.9M
> 5.12 rc4 + patch | redirect_map multi  i40e->mlx4+veth |    1.2M |  3.8M
>
> Signed-off-by: Hangbin Liu <liuhangbin@xxxxxxxxx>

Mostly looking good, but found a memory leak in the error path for generic XDP :(

See below...

> ---
> v5:
> a) use xchg() instead of READ/WRITE_ONCE and no need to clear ri->flags
>    in xdp_do_redirect()
> b) Do not use get_next_key() as we may restart looping from the first key
>    when removing/updating a dev in the hash map. Just walk the map directly
>    to get all the devices and ignore newly added/deleted objects.
> c) Loop over the whole array map instead of stopping at the first hole.
>
> v4:
> a) add a new argument flag_mask to __bpf_xdp_redirect_map() filter out
> invalid map.
> b) __bpf_xdp_redirect_map() sets the map pointer if the broadcast flag
> is set and clears it if the flag isn't set
> c) xdp_do_redirect() does the READ_ONCE/WRITE_ONCE on ri->map to check
> if we should enqueue multi
>
> v3:
> a) Rebase the code on Björn's "bpf, xdp: Restructure redirect actions".
>    - Add struct bpf_map *map back to struct bpf_redirect_info as we need
>      it for multicast.
>    - Add bpf_clear_redirect_map() back for devmap.c
>    - Add devmap_lookup_elem() as we need it in general path.
> b) remove tmp_key in devmap_get_next_obj()
>
> v2: Fix flag renaming issue in v1
> ---
>  include/linux/bpf.h            |  20 ++++
>  include/linux/filter.h         |  18 +++-
>  include/net/xdp.h              |   1 +
>  include/uapi/linux/bpf.h       |  17 +++-
>  kernel/bpf/cpumap.c            |   3 +-
>  kernel/bpf/devmap.c            | 172 ++++++++++++++++++++++++++++++++-
>  net/core/filter.c              |  33 ++++++-
>  net/core/xdp.c                 |  29 ++++++
>  net/xdp/xskmap.c               |   3 +-
>  tools/include/uapi/linux/bpf.h |  17 +++-
>  10 files changed, 299 insertions(+), 14 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index ff8cd68c01b3..ab6bde1f3b91 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1496,8 +1496,13 @@ int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
>  		    struct net_device *dev_rx);
>  int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
>  		    struct net_device *dev_rx);
> +int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
> +			  struct bpf_map *map, bool exclude_ingress);
>  int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
>  			     struct bpf_prog *xdp_prog);
> +int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
> +			   struct bpf_prog *xdp_prog, struct bpf_map *map,
> +			   bool exclude_ingress);
>  bool dev_map_can_have_prog(struct bpf_map *map);
>  
>  void __cpu_map_flush(void);
> @@ -1665,6 +1670,13 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
>  	return 0;
>  }
>  
> +static inline
> +int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
> +			  struct bpf_map *map, bool exclude_ingress)
> +{
> +	return 0;
> +}
> +
>  struct sk_buff;
>  
>  static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst,
> @@ -1674,6 +1686,14 @@ static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst,
>  	return 0;
>  }
>  
> +static inline
> +int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
> +			   struct bpf_prog *xdp_prog, struct bpf_map *map,
> +			   bool exclude_ingress)
> +{
> +	return 0;
> +}
> +
>  static inline void __cpu_map_flush(void)
>  {
>  }
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index 9a09547bc7ba..e4885b42d754 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -646,6 +646,7 @@ struct bpf_redirect_info {
>  	u32 flags;
>  	u32 tgt_index;
>  	void *tgt_value;
> +	struct bpf_map *map;
>  	u32 map_id;
>  	enum bpf_map_type map_type;
>  	u32 kern_flags;
> @@ -1464,17 +1465,18 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol,
>  }
>  #endif /* IS_ENABLED(CONFIG_IPV6) */
>  
> -static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifindex, u64 flags,
> +static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifindex,
> +						  u64 flags, u64 flag_mask,
>  						  void *lookup_elem(struct bpf_map *map, u32 key))
>  {
>  	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
>  
>  	/* Lower bits of the flags are used as return code on lookup failure */
> -	if (unlikely(flags > XDP_TX))
> +	if (unlikely(flags & ~(BPF_F_ACTION_MASK | flag_mask)))
>  		return XDP_ABORTED;
>  
>  	ri->tgt_value = lookup_elem(map, ifindex);
> -	if (unlikely(!ri->tgt_value)) {
> +	if (unlikely(!ri->tgt_value) && !(flags & BPF_F_BROADCAST)) {
>  		/* If the lookup fails we want to clear out the state in the
>  		 * redirect_info struct completely, so that if an eBPF program
>  		 * performs multiple lookups, the last one always takes
> @@ -1482,13 +1484,21 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifind
>  		 */
>  		ri->map_id = INT_MAX; /* Valid map id idr range: [1,INT_MAX[ */
>  		ri->map_type = BPF_MAP_TYPE_UNSPEC;
> -		return flags;
> +		return flags & BPF_F_ACTION_MASK;
>  	}
>  
>  	ri->tgt_index = ifindex;
>  	ri->map_id = map->id;
>  	ri->map_type = map->map_type;
>  
> +	if (flags & BPF_F_BROADCAST) {
> +		WRITE_ONCE(ri->map, map);
> +		ri->flags = flags;
> +	} else {
> +		WRITE_ONCE(ri->map, NULL);
> +		ri->flags = 0;
> +	}
> +
>  	return XDP_REDIRECT;
>  }
>  
> diff --git a/include/net/xdp.h b/include/net/xdp.h
> index a5bc214a49d9..5533f0ab2afc 100644
> --- a/include/net/xdp.h
> +++ b/include/net/xdp.h
> @@ -170,6 +170,7 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
>  struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf,
>  					 struct net_device *dev);
>  int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp);
> +struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf);
>  
>  static inline
>  void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp)
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 85c924bc21b1..b178f5b0d3f4 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -2534,8 +2534,12 @@ union bpf_attr {
>   * 		The lower two bits of *flags* are used as the return code if
>   * 		the map lookup fails. This is so that the return value can be
>   * 		one of the XDP program return codes up to **XDP_TX**, as chosen
> - * 		by the caller. Any higher bits in the *flags* argument must be
> - * 		unset.
> + * 		by the caller. The higher bits of *flags* can be set to
> + * 		BPF_F_BROADCAST or BPF_F_EXCLUDE_INGRESS as defined below.
> + *
> + * 		With BPF_F_BROADCAST the packet will be broadcasted to all the
> + * 		interfaces in the map. with BPF_F_EXCLUDE_INGRESS the ingress
> + * 		interface will be excluded when do broadcasting.
>   *
>   * 		See also **bpf_redirect**\ (), which only supports redirecting
>   * 		to an ifindex, but doesn't require a map to do so.
> @@ -5052,6 +5056,15 @@ enum {
>  	BPF_F_BPRM_SECUREEXEC	= (1ULL << 0),
>  };
>  
> +/* Flags for bpf_redirect_map helper */
> +enum {
> +	BPF_F_BROADCAST		= (1ULL << 3),
> +	BPF_F_EXCLUDE_INGRESS	= (1ULL << 4),
> +};
> +
> +#define BPF_F_ACTION_MASK (XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX)
> +#define BPF_F_REDIR_MASK (BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS)
> +
>  #define __bpf_md_ptr(type, name)	\
>  union {					\
>  	type name;			\
> diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
> index 0cf2791d5099..2c33a7a09783 100644
> --- a/kernel/bpf/cpumap.c
> +++ b/kernel/bpf/cpumap.c
> @@ -601,7 +601,8 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
>  
>  static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
>  {
> -	return __bpf_xdp_redirect_map(map, ifindex, flags, __cpu_map_lookup_elem);
> +	return __bpf_xdp_redirect_map(map, ifindex, flags, 0,
> +				      __cpu_map_lookup_elem);
>  }
>  
>  static int cpu_map_btf_id;
> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> index 3980fb3bfb09..599a96c9d2c0 100644
> --- a/kernel/bpf/devmap.c
> +++ b/kernel/bpf/devmap.c
> @@ -198,6 +198,7 @@ static void dev_map_free(struct bpf_map *map)
>  	list_del_rcu(&dtab->list);
>  	spin_unlock(&dev_map_lock);
>  
> +	bpf_clear_redirect_map(map);
>  	synchronize_rcu();
>  
>  	/* Make sure prior __dev_map_entry_free() have completed. */
> @@ -515,6 +516,101 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
>  	return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog);
>  }
>  
> +static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_buff *xdp,
> +			 int exclude_ifindex)
> +{
> +	if (!obj || obj->dev->ifindex == exclude_ifindex ||
> +	    !obj->dev->netdev_ops->ndo_xdp_xmit)
> +		return false;
> +
> +	if (xdp_ok_fwd_dev(obj->dev, xdp->data_end - xdp->data))
> +		return false;
> +
> +	return true;
> +}
> +
> +static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj,
> +				 struct net_device *dev_rx,
> +				 struct xdp_frame *xdpf)
> +{
> +	struct xdp_frame *nxdpf;
> +
> +	nxdpf = xdpf_clone(xdpf);
> +	if (unlikely(!nxdpf)) {
> +		xdp_return_frame_rx_napi(xdpf);
> +		return -ENOMEM;
> +	}
> +
> +	bq_enqueue(obj->dev, nxdpf, dev_rx, obj->xdp_prog);
> +
> +	return 0;
> +}
> +
> +int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
> +			  struct bpf_map *map, bool exclude_ingress)
> +{
> +	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> +	int exclude_ifindex = exclude_ingress ? dev_rx->ifindex : 0;
> +	struct bpf_dtab_netdev *dst, *last_dst = NULL;
> +	struct hlist_head *head;
> +	struct hlist_node *next;
> +	struct xdp_frame *xdpf;
> +	unsigned int i;
> +	int err;
> +
> +	xdpf = xdp_convert_buff_to_frame(xdp);
> +	if (unlikely(!xdpf))
> +		return -ENOSPC;
> +
> +	if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
> +		for (i = 0; i < map->max_entries; i++) {
> +			dst = READ_ONCE(dtab->netdev_map[i]);
> +			if (!is_valid_dst(dst, xdp, exclude_ifindex))
> +				continue;
> +
> +			/* we only need n-1 clones; last_dst enqueued below */
> +			if (!last_dst) {
> +				last_dst = dst;
> +				continue;
> +			}
> +
> +			err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf);
> +			if (err)
> +				return err;
> +
> +			last_dst = dst;
> +		}
> +	} else { /* BPF_MAP_TYPE_DEVMAP_HASH */
> +		for (i = 0; i < dtab->n_buckets; i++) {
> +			head = dev_map_index_hash(dtab, i);
> +			hlist_for_each_entry_safe(dst, next, head, index_hlist) {
> +				if (!is_valid_dst(dst, xdp, exclude_ifindex))
> +					continue;
> +
> +				/* we only need n-1 clones; last_dst enqueued below */
> +				if (!last_dst) {
> +					last_dst = dst;
> +					continue;
> +				}
> +
> +				err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf);
> +				if (err)
> +					return err;
> +
> +				last_dst = dst;
> +			}
> +		}
> +	}
> +
> +	/* consume the last copy of the frame */
> +	if (last_dst)
> +		bq_enqueue(last_dst->dev, xdpf, dev_rx, last_dst->xdp_prog);
> +	else
> +		xdp_return_frame_rx_napi(xdpf); /* dtab is empty */
> +
> +	return 0;
> +}
> +
>  int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
>  			     struct bpf_prog *xdp_prog)
>  {
> @@ -529,6 +625,76 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
>  	return 0;
>  }
>  
> +int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
> +			   struct bpf_prog *xdp_prog, struct bpf_map *map,
> +			   bool exclude_ingress)
> +{
> +	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> +	int exclude_ifindex = exclude_ingress ? dev->ifindex : 0;
> +	struct bpf_dtab_netdev *dst, *last_dst = NULL;
> +	struct hlist_head *head;
> +	struct hlist_node *next;
> +	struct sk_buff *nskb;
> +	unsigned int i;
> +	int err;
> +
> +	if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
> +		for (i = 0; i < map->max_entries; i++) {
> +			dst = READ_ONCE(dtab->netdev_map[i]);
> +			if (!dst || dst->dev->ifindex == exclude_ifindex)
> +				continue;
> +
> +			/* we only need n-1 clones; last_dst enqueued below */
> +			if (!last_dst) {
> +				last_dst = dst;
> +				continue;
> +			}
> +
> +			nskb = skb_clone(skb, GFP_ATOMIC);
> +			if (!nskb)
> +				return -ENOMEM;
> +
> +			err = dev_map_generic_redirect(last_dst, nskb, xdp_prog);
> +			if (err)
> +				return err;

In dev_map_enqueue_multi() you're using a helper that makes sure to free
the original frame before returning an error, but here you're
open-coding it, which means that these two error returns will leak the
original skb.

Maybe introduce a similar dev_map_redirect_clone() helper that also
frees the skb on error? That would make the two functions more similar
as well (and hopefully make any future consolidation easier).

> +			last_dst = dst;
> +		}
> +	} else { /* BPF_MAP_TYPE_DEVMAP_HASH */
> +		for (i = 0; i < dtab->n_buckets; i++) {
> +			head = dev_map_index_hash(dtab, i);
> +			hlist_for_each_entry_safe(dst, next, head, index_hlist) {
> +				if (!dst || dst->dev->ifindex == exclude_ifindex)
> +					continue;
> +
> +				/* we only need n-1 clones; last_dst enqueued below */
> +				if (!last_dst) {
> +					last_dst = dst;
> +					continue;
> +				}
> +
> +				nskb = skb_clone(skb, GFP_ATOMIC);
> +				if (!nskb)
> +					return -ENOMEM;
> +
> +				err = dev_map_generic_redirect(last_dst, nskb, xdp_prog);
> +				if (err)
> +					return err;

Same here, of course...

-Toke