Re: [PATCH] bpf: devmap: allow for repeated redirect

Toke Høiland-Jørgensen <toke@xxxxxxxxxx> · Tue, 10 Sep 2024 10:34:31 +0200

Florian Kauer <florian.kauer@xxxxxxxxxxxxx> writes:

> Currently, returning XDP_REDIRECT from a xdp/devmap program
> is considered as invalid action and an exception is traced.
>
> While it might seem counterintuitive to redirect in a xdp/devmap
> program (why not just redirect to the correct interface in the
> first program?), we faced several use cases where supporting
> this would be very useful.
>
> Most importantly, they occur when the first redirect is used
> with the BPF_F_BROADCAST flag. Using this together with xdp/devmap
> programs, enables to perform different actions on clones of
> the same incoming frame. In that case, it is often useful
> to redirect either to a different CPU or device AFTER the cloning.
>
> For example:
> - Replicate the frame (for redundancy according to IEEE 802.1CB FRER)
>   and then use the second redirect with a cpumap to select
>   the path-specific egress queue.
>
> - Also, one of the paths might need an encapsulation that
>   exceeds the MTU. So a second redirect can be used back
>   to the ingress interface to send an ICMP FRAG_NEEDED packet.
>
> - For OAM purposes, you might want to send one frame with
>   OAM information back, while the original frame in passed forward.
>
> To enable these use cases, add the XDP_REDIRECT case to
> dev_map_bpf_prog_run. Also, when this is called from inside
> xdp_do_flush, the redirect might add further entries to the
> flush lists that are currently processed. Therefore, loop inside
> xdp_do_flush until no more additional redirects were added.
>
> Signed-off-by: Florian Kauer <florian.kauer@xxxxxxxxxxxxx>

This is an interesting use case! However, your implementation makes it
way to easy to end up in a situation that loops forever, so I think we
should add some protection against that. Some details below:

> ---
>  include/linux/bpf.h        |  4 ++--
>  include/trace/events/xdp.h | 10 ++++++----
>  kernel/bpf/devmap.c        | 37 +++++++++++++++++++++++++++--------
>  net/core/filter.c          | 48 +++++++++++++++++++++++++++-------------------
>  4 files changed, 65 insertions(+), 34 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 3b94ec161e8c..1b57cbabf199 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -2498,7 +2498,7 @@ struct sk_buff;
>  struct bpf_dtab_netdev;
>  struct bpf_cpu_map_entry;
>  
> -void __dev_flush(struct list_head *flush_list);
> +void __dev_flush(struct list_head *flush_list, int *redirects);
>  int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
>  		    struct net_device *dev_rx);
>  int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf,
> @@ -2740,7 +2740,7 @@ static inline struct bpf_token *bpf_token_get_from_fd(u32 ufd)
>  	return ERR_PTR(-EOPNOTSUPP);
>  }
>  
> -static inline void __dev_flush(struct list_head *flush_list)
> +static inline void __dev_flush(struct list_head *flush_list, int *redirects)
>  {
>  }
>  
> diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
> index a7e5452b5d21..fba2c457e727 100644
> --- a/include/trace/events/xdp.h
> +++ b/include/trace/events/xdp.h
> @@ -269,9 +269,9 @@ TRACE_EVENT(xdp_devmap_xmit,
>  
>  	TP_PROTO(const struct net_device *from_dev,
>  		 const struct net_device *to_dev,
> -		 int sent, int drops, int err),
> +		 int sent, int drops, int redirects, int err),
>  
> -	TP_ARGS(from_dev, to_dev, sent, drops, err),
> +	TP_ARGS(from_dev, to_dev, sent, drops, redirects, err),
>  
>  	TP_STRUCT__entry(
>  		__field(int, from_ifindex)
> @@ -279,6 +279,7 @@ TRACE_EVENT(xdp_devmap_xmit,
>  		__field(int, to_ifindex)
>  		__field(int, drops)
>  		__field(int, sent)
> +		__field(int, redirects)
>  		__field(int, err)
>  	),
>  
> @@ -288,16 +289,17 @@ TRACE_EVENT(xdp_devmap_xmit,
>  		__entry->to_ifindex	= to_dev->ifindex;
>  		__entry->drops		= drops;
>  		__entry->sent		= sent;
> +		__entry->redirects	= redirects;
>  		__entry->err		= err;
>  	),
>  
>  	TP_printk("ndo_xdp_xmit"
>  		  " from_ifindex=%d to_ifindex=%d action=%s"
> -		  " sent=%d drops=%d"
> +		  " sent=%d drops=%d redirects=%d"
>  		  " err=%d",
>  		  __entry->from_ifindex, __entry->to_ifindex,
>  		  __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
> -		  __entry->sent, __entry->drops,
> +		  __entry->sent, __entry->drops, __entry->redirects,
>  		  __entry->err)
>  );
>  
> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> index 7878be18e9d2..89bdec49ea40 100644
> --- a/kernel/bpf/devmap.c
> +++ b/kernel/bpf/devmap.c
> @@ -334,7 +334,8 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
>  static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
>  				struct xdp_frame **frames, int n,
>  				struct net_device *tx_dev,
> -				struct net_device *rx_dev)
> +				struct net_device *rx_dev,
> +				int *redirects)
>  {
>  	struct xdp_txq_info txq = { .dev = tx_dev };
>  	struct xdp_rxq_info rxq = { .dev = rx_dev };
> @@ -359,6 +360,13 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
>  			else
>  				frames[nframes++] = xdpf;
>  			break;
> +		case XDP_REDIRECT:
> +			err = xdp_do_redirect(rx_dev, &xdp, xdp_prog);
> +			if (unlikely(err))
> +				xdp_return_frame_rx_napi(xdpf);
> +			else
> +				*redirects += 1;
> +			break;

It's a bit subtle, but dev_map_bpf_prog_run() also filters the list of
frames in the queue in-place (the frames[nframes++] = xdpf; line above),
which only works under the assumption that the array in bq->q is not
modified while this loop is being run. But now you're adding a call in
the middle that may result in the packet being put back on the same
queue in the middle, which means that this assumption no longer holds.

So you need to clear the bq->q queue first for this to work.
Specifically, at the start of bq_xmit_all(), you'll need to first copy
all the packet pointer onto an on-stack array, then run the rest of the
function on that array. There's already an initial loop that goes
through all the frames, so you can just do it there.

So the loop at the start of bq_xmit_all() goes from the current:

	for (i = 0; i < cnt; i++) {
		struct xdp_frame *xdpf = bq->q[i];

		prefetch(xdpf);
	}

to something like:

        struct xdp_frame *frames[DEV_MAP_BULK_SIZE];

	for (i = 0; i < cnt; i++) {
		struct xdp_frame *xdpf = bq->q[i];

		prefetch(xdpf);
                frames[i] = xdpf;
	}

        bq->count = 0; /* bq is now empty, use the 'frames' and 'cnt'
                          stack variables for the rest of the function */

>  		default:
>  			bpf_warn_invalid_xdp_action(NULL, xdp_prog, act);
>  			fallthrough;
> @@ -373,7 +381,7 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
>  	return nframes; /* sent frames count */
>  }
>  
> -static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
> +static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags, int *redirects)
>  {
>  	struct net_device *dev = bq->dev;
>  	unsigned int cnt = bq->count;
> @@ -390,8 +398,10 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
>  		prefetch(xdpf);
>  	}
>  
> +	int new_redirects = 0;
>  	if (bq->xdp_prog) {
> -		to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev, bq->dev_rx);
> +		to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev, bq->dev_rx,
> +				&new_redirects);
>  		if (!to_send)
>  			goto out;
>  	}
> @@ -413,19 +423,21 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
>  
>  out:
>  	bq->count = 0;
> -	trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent, err);
> +	*redirects += new_redirects;
> +	trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent - new_redirects,
> +			new_redirects, err);
>  }
>  
>  /* __dev_flush is called from xdp_do_flush() which _must_ be signalled from the
>   * driver before returning from its napi->poll() routine. See the comment above
>   * xdp_do_flush() in filter.c.
>   */
> -void __dev_flush(struct list_head *flush_list)
> +void __dev_flush(struct list_head *flush_list, int *redirects)
>  {
>  	struct xdp_dev_bulk_queue *bq, *tmp;
>  
>  	list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
> -		bq_xmit_all(bq, XDP_XMIT_FLUSH);
> +		bq_xmit_all(bq, XDP_XMIT_FLUSH, redirects);
>  		bq->dev_rx = NULL;
>  		bq->xdp_prog = NULL;
>  		__list_del_clearprev(&bq->flush_node);
> @@ -458,8 +470,17 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
>  {
>  	struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
>  
> -	if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
> -		bq_xmit_all(bq, 0);
> +	if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) {
> +		int redirects = 0;
> +
> +		bq_xmit_all(bq, 0, &redirects);
> +
> +		/* according to comment above xdp_do_flush() in
> +		 * filter.c, xdp_do_flush is always called at
> +		 * the end of the NAPI anyway, so no need to act
> +		 * on the redirects here
> +		 */

While it's true that it will be called again in NAPI, the purpose of
calling bq_xmit_all() here is to make room space for the packet on the
bulk queue that we're about to enqueue, and if bq_xmit_all() can just
put the packet back on the queue, there is no guarantee that this will
succeed. So you will have to handle that case here.

Since there's also a potential infinite recursion issue in the
do_flush() functions below, I think it may be better to handle this
looping issue inside bq_xmit_all().

I.e., structure the code so that bq_xmit_all() guarantees that when it
returns it has actually done its job; that is, that bq->q is empty.

Given the above "move all frames out of bq->q at the start" change, this
is not all that hard. Simply add a check after the out: label (in
bq_xmit_all()) to check if bq->count is actually 0, and if it isn't,
start over from the beginning of that function. This also makes it
straight forward to add a recursion limit; after looping a set number of
times (say, XMIT_RECURSION_LIMIT), simply turn XDP_REDIRECT into drops.

There will need to be some additional protection against looping forever
in __dev_flush(), to handle the case where a packet is looped between
two interfaces. This one is a bit trickier, but a similar recursion
counter could be used, I think.

In any case, this needs extensive selftests, including ones with devmap
programs that loop packets (both redirect from a->a, and from
a->b->a->b) to make sure the limits work correctly.

> +	}
>  
>  	/* Ingress dev_rx will be the same for all xdp_frame's in
>  	 * bulk_queue, because bq stored per-CPU and must be flushed
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 8569cd2482ee..b33fc0b1444a 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -4287,14 +4287,18 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
>  void xdp_do_flush(void)
>  {
>  	struct list_head *lh_map, *lh_dev, *lh_xsk;
> +	int redirect;
>  
> -	bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
> -	if (lh_dev)
> -		__dev_flush(lh_dev);
> -	if (lh_map)
> -		__cpu_map_flush(lh_map);
> -	if (lh_xsk)
> -		__xsk_map_flush(lh_xsk);
> +	do {
> +		redirect = 0;
> +		bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
> +		if (lh_dev)
> +			__dev_flush(lh_dev, &redirect);
> +		if (lh_map)
> +			__cpu_map_flush(lh_map);
> +		if (lh_xsk)
> +			__xsk_map_flush(lh_xsk);
> +	} while (redirect > 0);
>  }
>  EXPORT_SYMBOL_GPL(xdp_do_flush);
>  
> @@ -4303,20 +4307,24 @@ void xdp_do_check_flushed(struct napi_struct *napi)
>  {
>  	struct list_head *lh_map, *lh_dev, *lh_xsk;
>  	bool missed = false;
> +	int redirect;
>  
> -	bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
> -	if (lh_dev) {
> -		__dev_flush(lh_dev);
> -		missed = true;
> -	}
> -	if (lh_map) {
> -		__cpu_map_flush(lh_map);
> -		missed = true;
> -	}
> -	if (lh_xsk) {
> -		__xsk_map_flush(lh_xsk);
> -		missed = true;
> -	}
> +	do {
> +		redirect = 0;
> +		bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
> +		if (lh_dev) {
> +			__dev_flush(lh_dev, &redirect);
> +			missed = true;
> +		}
> +		if (lh_map) {
> +			__cpu_map_flush(lh_map);
> +			missed = true;
> +		}
> +		if (lh_xsk) {
> +			__xsk_map_flush(lh_xsk);
> +			missed = true;
> +		}
> +	} while (redirect > 0);

With the change suggested above (so that bq_xmit_all() guarantees the
flush is completely done), this looping is not needed anymore. However,
it becomes important in which *order* the flushing is done
(__dev_flush() should always happen first), so adding a comment to note
this would be good.

-Toke