Currently, returning XDP_REDIRECT from a xdp/devmap program is considered as invalid action and an exception is traced. While it might seem counterintuitive to redirect in a xdp/devmap program (why not just redirect to the correct interface in the first program?), we faced several use cases where supporting this would be very useful. Most importantly, they occur when the first redirect is used with the BPF_F_BROADCAST flag. Using this together with xdp/devmap programs, enables to perform different actions on clones of the same incoming frame. In that case, it is often useful to redirect either to a different CPU or device AFTER the cloning. For example: - Replicate the frame (for redundancy according to IEEE 802.1CB FRER) and then use the second redirect with a cpumap to select the path-specific egress queue. - Also, one of the paths might need an encapsulation that exceeds the MTU. So a second redirect can be used back to the ingress interface to send an ICMP FRAG_NEEDED packet. - For OAM purposes, you might want to send one frame with OAM information back, while the original frame in passed forward. To enable these use cases, add the XDP_REDIRECT case to dev_map_bpf_prog_run. Also, when this is called from inside xdp_do_flush, the redirect might add further entries to the flush lists that are currently processed. Therefore, loop inside xdp_do_flush until no more additional redirects were added. Signed-off-by: Florian Kauer <florian.kauer@xxxxxxxxxxxxx> --- include/linux/bpf.h | 4 ++-- include/trace/events/xdp.h | 10 ++++++---- kernel/bpf/devmap.c | 37 +++++++++++++++++++++++++++-------- net/core/filter.c | 48 +++++++++++++++++++++++++++------------------- 4 files changed, 65 insertions(+), 34 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3b94ec161e8c..1b57cbabf199 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2498,7 +2498,7 @@ struct sk_buff; struct bpf_dtab_netdev; struct bpf_cpu_map_entry; -void __dev_flush(struct list_head *flush_list); +void __dev_flush(struct list_head *flush_list, int *redirects); int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx); int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, @@ -2740,7 +2740,7 @@ static inline struct bpf_token *bpf_token_get_from_fd(u32 ufd) return ERR_PTR(-EOPNOTSUPP); } -static inline void __dev_flush(struct list_head *flush_list) +static inline void __dev_flush(struct list_head *flush_list, int *redirects) { } diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index a7e5452b5d21..fba2c457e727 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -269,9 +269,9 @@ TRACE_EVENT(xdp_devmap_xmit, TP_PROTO(const struct net_device *from_dev, const struct net_device *to_dev, - int sent, int drops, int err), + int sent, int drops, int redirects, int err), - TP_ARGS(from_dev, to_dev, sent, drops, err), + TP_ARGS(from_dev, to_dev, sent, drops, redirects, err), TP_STRUCT__entry( __field(int, from_ifindex) @@ -279,6 +279,7 @@ TRACE_EVENT(xdp_devmap_xmit, __field(int, to_ifindex) __field(int, drops) __field(int, sent) + __field(int, redirects) __field(int, err) ), @@ -288,16 +289,17 @@ TRACE_EVENT(xdp_devmap_xmit, __entry->to_ifindex = to_dev->ifindex; __entry->drops = drops; __entry->sent = sent; + __entry->redirects = redirects; __entry->err = err; ), TP_printk("ndo_xdp_xmit" " from_ifindex=%d to_ifindex=%d action=%s" - " sent=%d drops=%d" + " sent=%d drops=%d redirects=%d" " err=%d", __entry->from_ifindex, __entry->to_ifindex, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), - __entry->sent, __entry->drops, + __entry->sent, __entry->drops, __entry->redirects, __entry->err) ); diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 7878be18e9d2..89bdec49ea40 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -334,7 +334,8 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, struct xdp_frame **frames, int n, struct net_device *tx_dev, - struct net_device *rx_dev) + struct net_device *rx_dev, + int *redirects) { struct xdp_txq_info txq = { .dev = tx_dev }; struct xdp_rxq_info rxq = { .dev = rx_dev }; @@ -359,6 +360,13 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, else frames[nframes++] = xdpf; break; + case XDP_REDIRECT: + err = xdp_do_redirect(rx_dev, &xdp, xdp_prog); + if (unlikely(err)) + xdp_return_frame_rx_napi(xdpf); + else + *redirects += 1; + break; default: bpf_warn_invalid_xdp_action(NULL, xdp_prog, act); fallthrough; @@ -373,7 +381,7 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, return nframes; /* sent frames count */ } -static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) +static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags, int *redirects) { struct net_device *dev = bq->dev; unsigned int cnt = bq->count; @@ -390,8 +398,10 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) prefetch(xdpf); } + int new_redirects = 0; if (bq->xdp_prog) { - to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev, bq->dev_rx); + to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev, bq->dev_rx, + &new_redirects); if (!to_send) goto out; } @@ -413,19 +423,21 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) out: bq->count = 0; - trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent, err); + *redirects += new_redirects; + trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent - new_redirects, + new_redirects, err); } /* __dev_flush is called from xdp_do_flush() which _must_ be signalled from the * driver before returning from its napi->poll() routine. See the comment above * xdp_do_flush() in filter.c. */ -void __dev_flush(struct list_head *flush_list) +void __dev_flush(struct list_head *flush_list, int *redirects) { struct xdp_dev_bulk_queue *bq, *tmp; list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { - bq_xmit_all(bq, XDP_XMIT_FLUSH); + bq_xmit_all(bq, XDP_XMIT_FLUSH, redirects); bq->dev_rx = NULL; bq->xdp_prog = NULL; __list_del_clearprev(&bq->flush_node); @@ -458,8 +470,17 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, { struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq); - if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) - bq_xmit_all(bq, 0); + if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) { + int redirects = 0; + + bq_xmit_all(bq, 0, &redirects); + + /* according to comment above xdp_do_flush() in + * filter.c, xdp_do_flush is always called at + * the end of the NAPI anyway, so no need to act + * on the redirects here + */ + } /* Ingress dev_rx will be the same for all xdp_frame's in * bulk_queue, because bq stored per-CPU and must be flushed diff --git a/net/core/filter.c b/net/core/filter.c index 8569cd2482ee..b33fc0b1444a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4287,14 +4287,18 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { void xdp_do_flush(void) { struct list_head *lh_map, *lh_dev, *lh_xsk; + int redirect; - bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk); - if (lh_dev) - __dev_flush(lh_dev); - if (lh_map) - __cpu_map_flush(lh_map); - if (lh_xsk) - __xsk_map_flush(lh_xsk); + do { + redirect = 0; + bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk); + if (lh_dev) + __dev_flush(lh_dev, &redirect); + if (lh_map) + __cpu_map_flush(lh_map); + if (lh_xsk) + __xsk_map_flush(lh_xsk); + } while (redirect > 0); } EXPORT_SYMBOL_GPL(xdp_do_flush); @@ -4303,20 +4307,24 @@ void xdp_do_check_flushed(struct napi_struct *napi) { struct list_head *lh_map, *lh_dev, *lh_xsk; bool missed = false; + int redirect; - bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk); - if (lh_dev) { - __dev_flush(lh_dev); - missed = true; - } - if (lh_map) { - __cpu_map_flush(lh_map); - missed = true; - } - if (lh_xsk) { - __xsk_map_flush(lh_xsk); - missed = true; - } + do { + redirect = 0; + bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk); + if (lh_dev) { + __dev_flush(lh_dev, &redirect); + missed = true; + } + if (lh_map) { + __cpu_map_flush(lh_map); + missed = true; + } + if (lh_xsk) { + __xsk_map_flush(lh_xsk); + missed = true; + } + } while (redirect > 0); WARN_ONCE(missed, "Missing xdp_do_flush() invocation after NAPI by %ps\n", napi->poll); --- base-commit: 8e69c96df771ab469cec278edb47009351de4da6 change-id: 20240909-devel-koalo-fix-redirect-684639694951 prerequisite-patch-id: 6928ae7741727e3b2ab4a8c4256b06a861040a01 Best regards, -- Florian Kauer <florian.kauer@xxxxxxxxxxxxx>