In xsk mode, users cannot use AF_PACKET (e.g. tcpdump) to observe the current rx/tx data packets, even though this ability is very important in many cases. This patch therefore allows AF_PACKET to obtain xsk packets. By default, AF_PACKET relies on ptype_base/ptype_all in dev.c to obtain data packets. xsk is not suited to calling these callbacks, because doing so may also send the packet to other protocol stacks. Instead, this patch lets AF_PACKET obtain the data packets from xsk separately, through a dedicated list of handlers (xsk_pt). Signed-off-by: Xuan Zhuo <xuanzhuo@xxxxxxxxxxxxxxxxx> --- include/net/xdp_sock.h | 15 +++++ net/packet/af_packet.c | 35 +++++++++-- net/packet/internal.h | 7 +++ net/xdp/Makefile | 2 +- net/xdp/xsk.c | 9 +++ net/xdp/xsk_packet.c | 129 +++++++++++++++++++++++++++++++++++++++++ net/xdp/xsk_packet.h | 44 ++++++++++++++ 7 files changed, 234 insertions(+), 7 deletions(-) create mode 100644 net/xdp/xsk_packet.c create mode 100644 net/xdp/xsk_packet.h diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 9c0722c6d7ac..b0acf0293132 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -17,6 +17,11 @@ struct net_device; struct xsk_queue; struct xdp_buff; +struct xsk_packet { + struct list_head list; + struct packet_type *pt; +}; + struct xdp_umem { void *addrs; u64 size; @@ -79,6 +84,8 @@ struct xdp_sock { int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp); void __xsk_map_flush(void); +void xsk_add_pack(struct xsk_packet *xpt); +void __xsk_remove_pack(struct xsk_packet *xpt); #else @@ -96,6 +103,14 @@ static inline void __xsk_map_flush(void) { } +void xsk_add_pack(struct xsk_packet *xpt) +{ +} + +void __xsk_remove_pack(struct xsk_packet *xpt) +{ +} + #endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_H */ diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 597d798ac0a5..2720b51d13a6 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -303,10 +303,14 @@ static void
__register_prot_hook(struct sock *sk) struct packet_sock *po = pkt_sk(sk); if (!po->running) { - if (po->fanout) + if (po->fanout) { __fanout_link(sk, po); - else + } else { dev_add_pack(&po->prot_hook); +#ifdef CONFIG_XDP_SOCKETS + xsk_add_pack(&po->xsk_pt); +#endif + } sock_hold(sk); po->running = 1; @@ -333,10 +337,14 @@ static void __unregister_prot_hook(struct sock *sk, bool sync) po->running = 0; - if (po->fanout) + if (po->fanout) { __fanout_unlink(sk, po); - else + } else { __dev_remove_pack(&po->prot_hook); +#ifdef CONFIG_XDP_SOCKETS + __xsk_remove_pack(&po->xsk_pt); +#endif + } __sock_put(sk); @@ -1483,8 +1491,12 @@ static void __fanout_link(struct sock *sk, struct packet_sock *po) rcu_assign_pointer(f->arr[f->num_members], sk); smp_wmb(); f->num_members++; - if (f->num_members == 1) + if (f->num_members == 1) { dev_add_pack(&f->prot_hook); +#ifdef CONFIG_XDP_SOCKETS + xsk_add_pack(&f->xsk_pt); +#endif + } spin_unlock(&f->lock); } @@ -1504,8 +1516,12 @@ static void __fanout_unlink(struct sock *sk, struct packet_sock *po) rcu_dereference_protected(f->arr[f->num_members - 1], lockdep_is_held(&f->lock))); f->num_members--; - if (f->num_members == 0) + if (f->num_members == 0) { __dev_remove_pack(&f->prot_hook); +#ifdef CONFIG_XDP_SOCKETS + __xsk_remove_pack(&po->xsk_pt); +#endif + } spin_unlock(&f->lock); } @@ -1737,6 +1753,10 @@ static int fanout_add(struct sock *sk, struct fanout_args *args) match->prot_hook.af_packet_priv = match; match->prot_hook.id_match = match_fanout_group; match->max_num_members = args->max_num_members; +#ifdef CONFIG_XDP_SOCKETS + match->xsk_pt.pt = &match->prot_hook; +#endif + list_add(&match->list, &fanout_list); } err = -EINVAL; @@ -3315,6 +3335,9 @@ static int packet_create(struct net *net, struct socket *sock, int protocol, po->prot_hook.func = packet_rcv_spkt; po->prot_hook.af_packet_priv = sk; +#ifdef CONFIG_XDP_SOCKETS + po->xsk_pt.pt = &po->prot_hook; +#endif if (proto) { po->prot_hook.type = proto; diff --git 
a/net/packet/internal.h b/net/packet/internal.h index 48af35b1aed2..d224b926588a 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -3,6 +3,7 @@ #define __PACKET_INTERNAL_H__ #include <linux/refcount.h> +#include <net/xdp_sock.h> struct packet_mclist { struct packet_mclist *next; @@ -94,6 +95,9 @@ struct packet_fanout { spinlock_t lock; refcount_t sk_ref; struct packet_type prot_hook ____cacheline_aligned_in_smp; +#ifdef CONFIG_XDP_SOCKETS + struct xsk_packet xsk_pt; +#endif struct sock __rcu *arr[]; }; @@ -136,6 +140,9 @@ struct packet_sock { struct net_device __rcu *cached_dev; int (*xmit)(struct sk_buff *skb); struct packet_type prot_hook ____cacheline_aligned_in_smp; +#ifdef CONFIG_XDP_SOCKETS + struct xsk_packet xsk_pt; +#endif atomic_t tp_drops ____cacheline_aligned_in_smp; }; diff --git a/net/xdp/Makefile b/net/xdp/Makefile index 30cdc4315f42..bcac0591879b 100644 --- a/net/xdp/Makefile +++ b/net/xdp/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o xskmap.o +obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o xskmap.o xsk_packet.o obj-$(CONFIG_XDP_SOCKETS) += xsk_buff_pool.o obj-$(CONFIG_XDP_SOCKETS_DIAG) += xsk_diag.o diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index cd62d4ba87a9..fc97e7f9e4cb 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -28,6 +28,7 @@ #include "xsk_queue.h" #include "xdp_umem.h" +#include "xsk_packet.h" #include "xsk.h" #define TX_BATCH_SIZE 32 @@ -156,6 +157,7 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) int err; addr = xp_get_handle(xskb); + xsk_rx_packet_deliver(xs, addr, len); err = xskq_prod_reserve_desc(xs->rx, addr, len); if (err) { xs->rx_queue_full++; @@ -347,6 +349,8 @@ bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) if (xskq_prod_reserve_addr(pool->cq, desc->addr)) goto out; + xsk_tx_zc_packet_deliver(xs, desc); + xskq_cons_release(xs->tx); rcu_read_unlock(); return true; 
@@ -576,6 +580,8 @@ static int xsk_generic_xmit(struct sock *sk) } spin_unlock_irqrestore(&xs->pool->cq_lock, flags); + xsk_tx_packet_deliver(xs, &desc, skb); + err = __dev_direct_xmit(skb, xs->queue_id); if (err == NETDEV_TX_BUSY) { /* Tell user-space to retry the send */ @@ -1467,6 +1473,9 @@ static int __init xsk_init(void) for_each_possible_cpu(cpu) INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu)); + + INIT_LIST_HEAD(&xsk_pt); + return 0; out_pernet: diff --git a/net/xdp/xsk_packet.c b/net/xdp/xsk_packet.c new file mode 100644 index 000000000000..41005f214d6d --- /dev/null +++ b/net/xdp/xsk_packet.c @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0 +/* XDP sockets packet api + * + * Author: Xuan Zhuo <xuanzhuo.dxf@xxxxxxxxxxxxxxxxx> + */ + +#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> +#include "xsk.h" +#include "xsk_packet.h" + +struct list_head xsk_pt __read_mostly; +static DEFINE_SPINLOCK(pt_lock); + +static struct sk_buff *xsk_pt_alloc_skb(struct xdp_sock *xs, + struct xdp_desc *desc) +{ + struct sk_buff *skb; + void *buffer; + int err; + + skb = alloc_skb(desc->len, GFP_ATOMIC); + if (!skb) + return NULL; + + skb_put(skb, desc->len); + + buffer = xsk_buff_raw_get_data(xs->pool, desc->addr); + err = skb_store_bits(skb, 0, buffer, desc->len); + if (unlikely(err)) { + kfree_skb(skb); + return NULL; + } + + return skb; +} + +static struct sk_buff *xsk_pt_get_skb(struct xdp_sock *xs, + struct xdp_desc *desc, + struct sk_buff *skb, + bool rx) +{ + struct net_device *dev = xs->dev; + + /* We must copy the data, because skb may exist for a long time + * on AF_PACKET. If the buffer of the xsk is used by skb, the + * release of xsk and the reuse of the buffer will be affected. 
+ */ + if (!skb || (dev->priv_flags & IFF_TX_SKB_NO_LINEAR)) + skb = xsk_pt_alloc_skb(xs, desc); + else + skb = skb_clone(skb, GFP_ATOMIC); + + if (!skb) + return NULL; + + skb->protocol = eth_type_trans(skb, dev); + skb_reset_network_header(skb); + skb->transport_header = skb->network_header; + __net_timestamp(skb); + + if (!rx) + skb->pkt_type = PACKET_OUTGOING; + + return skb; +} + +void __xsk_pt_deliver(struct xdp_sock *xs, struct sk_buff *skb, + struct xdp_desc *desc, bool rx) +{ + struct packet_type *pt_prev = NULL; + struct packet_type *ptype; + struct xsk_packet *xpt; + + rcu_read_lock(); + list_for_each_entry_rcu(xpt, &xsk_pt, list) { + ptype = xpt->pt; + + if (!rx && ptype->ignore_outgoing) + continue; + + if (pt_prev) { + refcount_inc(&skb->users); + pt_prev->func(skb, skb->dev, pt_prev, skb->dev); + pt_prev = ptype; + continue; + } + + skb = xsk_pt_get_skb(xs, desc, skb, rx); + if (unlikely(!skb)) + goto out_unlock; + + pt_prev = ptype; + } + + if (pt_prev) + pt_prev->func(skb, skb->dev, pt_prev, skb->dev); + +out_unlock: + rcu_read_unlock(); +} + +void xsk_add_pack(struct xsk_packet *xpt) +{ + if (xpt->pt->type != htons(ETH_P_ALL)) + return; + + spin_lock(&pt_lock); + list_add_rcu(&xpt->list, &xsk_pt); + spin_unlock(&pt_lock); +} + +void __xsk_remove_pack(struct xsk_packet *xpt) +{ + struct xsk_packet *xpt1; + + spin_lock(&pt_lock); + + list_for_each_entry(xpt1, &xsk_pt, list) { + if (xpt1 == xpt) { + list_del_rcu(&xpt1->list); + goto out; + } + } + + pr_warn("xsk_remove_pack: %p not found\n", xpt); +out: + spin_unlock(&pt_lock); +} diff --git a/net/xdp/xsk_packet.h b/net/xdp/xsk_packet.h new file mode 100644 index 000000000000..55d30fa8828b --- /dev/null +++ b/net/xdp/xsk_packet.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __XSK_PACKET_H__ +#define __XSK_PACKET_H__ +extern struct list_head xsk_pt __read_mostly; + +void __xsk_pt_deliver(struct xdp_sock *xs, struct sk_buff *skb, + struct xdp_desc *desc, bool rx); + +static 
inline void xsk_tx_packet_deliver(struct xdp_sock *xs, + struct xdp_desc *desc, + struct sk_buff *skb) +{ + if (likely(list_empty(&xsk_pt))) + return; + + local_bh_disable(); + __xsk_pt_deliver(xs, skb, desc, false); + local_bh_enable(); +} + +static inline void xsk_tx_zc_packet_deliver(struct xdp_sock *xs, + struct xdp_desc *desc) +{ + if (likely(list_empty(&xsk_pt))) + return; + + __xsk_pt_deliver(xs, NULL, desc, false); +} + +static inline void xsk_rx_packet_deliver(struct xdp_sock *xs, u64 addr, u32 len) +{ + struct xdp_desc desc; + + if (likely(list_empty(&xsk_pt))) + return; + + desc.addr = addr; + desc.len = len; + + __xsk_pt_deliver(xs, NULL, &desc, true); +} + +#endif /* __XSK_PACKET_H__ */ -- 2.31.0