On Sun, Mar 19, 2023 at 08:56:54PM +0100, Kal Conley wrote:
> Add core AF_XDP support for chunk sizes larger than PAGE_SIZE. This
> enables sending/receiving jumbo ethernet frames up to the theoretical
> maximum of 64 KiB. For chunk sizes > PAGE_SIZE, the UMEM is required
> to consist of HugeTLB VMAs (and be hugepage aligned). Initially, only
> XDP_COPY mode is usable pending future driver work.
>
> For consistency, check for HugeTLB pages during UMEM registration. This
> implies that hugepages are required for XDP_COPY mode despite DMA not
> being used. This restriction is desirable since it ensures user software
> can take advantage of future driver support.
>
> Even in HugeTLB mode, continue to do page accounting using order-0
> (4 KiB) pages. This minimizes the size of this change and reduces the
> risk of impacting driver code. Taking full advantage of hugepages for
> accounting should improve XDP performance in the general case.
>
> No significant change in RX/TX performance was observed with this patch.
> A few data points are reproduced below:
>
> Machine : Dell PowerEdge R940
> CPU     : Intel(R) Xeon(R) Platinum 8168 CPU @ 2.70GHz
> NIC     : MT27700 Family [ConnectX-4]

Can you tell us a bit more about your environment setup? From what I can
tell, mlx4 does not support XDP multi-buffer, so were you testing this
feature with the XDP program attached in generic mode? Were you running
xdpsock -S or xdpsock -c, or something else entirely? And what MTU was
configured on the NIC?

>
> +-----+------------+-------------+---------------+
> |     | frame size | packet size | rxdrop (Mpps) |
> +-----+------------+-------------+---------------+
> | old |       4000 |         320 |          15.7 |
> | new |       4000 |         320 |          15.8 |
> +-----+------------+-------------+---------------+
> | old |       4096 |         320 |          16.4 |
> | new |       4096 |         320 |          16.3 |
> +-----+------------+-------------+---------------+
> | new |       9000 |         320 |           6.3 |
> | new |      10240 |        9000 |           0.4 |
> +-----+------------+-------------+---------------+
>
> Signed-off-by: Kal Conley <kal.conley@xxxxxxxxxxx>
> ---
>  include/net/xdp_sock.h      |  1 +
>  include/net/xdp_sock_drv.h  |  6 +++++
>  include/net/xsk_buff_pool.h |  4 +++-
>  net/xdp/xdp_umem.c          | 46 +++++++++++++++++++++++++++++--------
>  net/xdp/xsk.c               |  3 +++
>  net/xdp/xsk_buff_pool.c     | 16 +++++++++----
>  6 files changed, 61 insertions(+), 15 deletions(-)
>
> diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
> index 3057e1a4a11c..e562ac3f5295 100644
> --- a/include/net/xdp_sock.h
> +++ b/include/net/xdp_sock.h
> @@ -28,6 +28,7 @@ struct xdp_umem {
>  	struct user_struct *user;
>  	refcount_t users;
>  	u8 flags;
> +	bool hugetlb;
>  	bool zc;
>  	struct page **pgs;
>  	int id;
> diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
> index 9c0d860609ba..eb630d17f994 100644
> --- a/include/net/xdp_sock_drv.h
> +++ b/include/net/xdp_sock_drv.h
> @@ -12,6 +12,12 @@
>  #define XDP_UMEM_MIN_CHUNK_SHIFT 11
>  #define XDP_UMEM_MIN_CHUNK_SIZE (1 << XDP_UMEM_MIN_CHUNK_SHIFT)
>
> +/* Allow chunk sizes up to the maximum size of an ethernet frame (64 KiB).
> + * Larger chunks are not guaranteed to fit in a single SKB.
> + */
> +#define XDP_UMEM_MAX_CHUNK_SHIFT 16
> +#define XDP_UMEM_MAX_CHUNK_SIZE (1 << XDP_UMEM_MAX_CHUNK_SHIFT)
> +
>  #ifdef CONFIG_XDP_SOCKETS
>
>  void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries);
> diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
> index 3e952e569418..69e3970da092 100644
> --- a/include/net/xsk_buff_pool.h
> +++ b/include/net/xsk_buff_pool.h
> @@ -78,6 +78,7 @@ struct xsk_buff_pool {
>  	u8 cached_need_wakeup;
>  	bool uses_need_wakeup;
>  	bool dma_need_sync;
> +	bool hugetlb;
>  	bool unaligned;
>  	void *addrs;
>  	/* Mutual exclusion of the completion ring in the SKB mode. Two cases to protect:
> @@ -175,7 +176,8 @@ static inline void xp_dma_sync_for_device(struct xsk_buff_pool *pool,
>  static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool,
>  						 u64 addr, u32 len)
>  {
> -	bool cross_pg = (addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE;
> +	bool cross_pg = pool->hugetlb ? (addr & (HPAGE_SIZE - 1)) + len > HPAGE_SIZE :
> +					(addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE;
>
>  	if (likely(!cross_pg))
>  		return false;
> diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
> index 02207e852d79..c96eefb9f5ae 100644
> --- a/net/xdp/xdp_umem.c
> +++ b/net/xdp/xdp_umem.c
> @@ -10,6 +10,7 @@
>  #include <linux/uaccess.h>
>  #include <linux/slab.h>
>  #include <linux/bpf.h>
> +#include <linux/hugetlb_inline.h>
>  #include <linux/mm.h>
>  #include <linux/netdevice.h>
>  #include <linux/rtnetlink.h>
> @@ -19,6 +20,9 @@
>  #include "xdp_umem.h"
>  #include "xsk_queue.h"
>
> +_Static_assert(XDP_UMEM_MIN_CHUNK_SIZE <= PAGE_SIZE);
> +_Static_assert(XDP_UMEM_MAX_CHUNK_SIZE <= HPAGE_SIZE);
> +
>  static DEFINE_IDA(umem_ida);
>
>  static void xdp_umem_unpin_pages(struct xdp_umem *umem)
> @@ -91,7 +95,26 @@ void xdp_put_umem(struct xdp_umem *umem, bool defer_cleanup)
>  	}
>  }
>
> -static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address)
> +/* Returns true if the UMEM contains HugeTLB pages exclusively, false otherwise.
> + *
> + * The mmap_lock must be held by the caller.
> + */
> +static bool xdp_umem_is_hugetlb(struct xdp_umem *umem, unsigned long address)
> +{
> +	unsigned long end = address + umem->npgs * PAGE_SIZE;
> +	struct vm_area_struct *vma;
> +	struct vma_iterator vmi;
> +
> +	vma_iter_init(&vmi, current->mm, address);
> +	for_each_vma_range(vmi, vma, end) {
> +		if (!is_vm_hugetlb_page(vma))
> +			return false;
> +	}
> +
> +	return true;
> +}
> +
> +static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address, bool hugetlb)
>  {
>  	unsigned int gup_flags = FOLL_WRITE;
>  	long npgs;
> @@ -102,8 +125,17 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address)
>  		return -ENOMEM;
>
>  	mmap_read_lock(current->mm);
> +
> +	umem->hugetlb = IS_ALIGNED(address, HPAGE_SIZE) && xdp_umem_is_hugetlb(umem, address);
> +	if (hugetlb && !umem->hugetlb) {
> +		mmap_read_unlock(current->mm);
> +		err = -EINVAL;
> +		goto out_pgs;
> +	}
> +
>  	npgs = pin_user_pages(address, umem->npgs,
>  			      gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL);
> +
>  	mmap_read_unlock(current->mm);
>
>  	if (npgs != umem->npgs) {
> @@ -152,20 +184,14 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
>  {
>  	bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG;
>  	u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
> +	bool hugetlb = chunk_size > PAGE_SIZE;
>  	u64 addr = mr->addr, size = mr->len;
>  	u32 chunks_rem, npgs_rem;
>  	u64 chunks, npgs;
>  	int err;
>
> -	if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
> -		/* Strictly speaking we could support this, if:
> -		 * - huge pages, or*
> -		 * - using an IOMMU, or
> -		 * - making sure the memory area is consecutive
> -		 * but for now, we simply say "computer says no".
> -		 */
> +	if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > XDP_UMEM_MAX_CHUNK_SIZE)
>  		return -EINVAL;
> -	}
>
>  	if (mr->flags & ~XDP_UMEM_UNALIGNED_CHUNK_FLAG)
>  		return -EINVAL;
> @@ -215,7 +241,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
>  	if (err)
>  		return err;
>
> -	err = xdp_umem_pin_pages(umem, (unsigned long)addr);
> +	err = xdp_umem_pin_pages(umem, (unsigned long)addr, hugetlb);
>  	if (err)
>  		goto out_account;
>
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 2ac58b282b5e..3899a2d235bb 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -421,6 +421,9 @@ static void xsk_destruct_skb(struct sk_buff *skb)
>  	sock_wfree(skb);
>  }
>
> +/* Chunks must fit in the SKB `frags` array. */
> +_Static_assert(XDP_UMEM_MAX_CHUNK_SIZE / PAGE_SIZE <= MAX_SKB_FRAGS);
> +
>  static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
>  					      struct xdp_desc *desc)
>  {
> diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
> index b2df1e0f8153..777e8a38a232 100644
> --- a/net/xdp/xsk_buff_pool.c
> +++ b/net/xdp/xsk_buff_pool.c
> @@ -80,6 +80,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
>  	pool->headroom = umem->headroom;
>  	pool->chunk_size = umem->chunk_size;
>  	pool->chunk_shift = ffs(umem->chunk_size) - 1;
> +	pool->hugetlb = umem->hugetlb;
>  	pool->unaligned = unaligned;
>  	pool->frame_len = umem->chunk_size - umem->headroom -
>  			  XDP_PACKET_HEADROOM;
> @@ -369,16 +370,23 @@ void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs)
>  }
>  EXPORT_SYMBOL(xp_dma_unmap);
>
> -static void xp_check_dma_contiguity(struct xsk_dma_map *dma_map)
> +/* HugeTLB pools consider contiguity at hugepage granularity only. Hence, all
> + * order-0 pages within a hugepage have the same contiguity value.
> + */
> +static void xp_check_dma_contiguity(struct xsk_dma_map *dma_map, bool hugetlb)
>  {
> +	u32 page_size = hugetlb ? HPAGE_SIZE : PAGE_SIZE;
> +	u32 n = page_size >> PAGE_SHIFT;
>  	u32 i;
>
> -	for (i = 0; i < dma_map->dma_pages_cnt - 1; i++) {
> -		if (dma_map->dma_pages[i] + PAGE_SIZE == dma_map->dma_pages[i + 1])
> +	for (i = 0; i + n < dma_map->dma_pages_cnt; i++) {
> +		if (dma_map->dma_pages[i] + page_size == dma_map->dma_pages[i + n])
>  			dma_map->dma_pages[i] |= XSK_NEXT_PG_CONTIG_MASK;
>  		else
>  			dma_map->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK;
>  	}
> +	for (; i < dma_map->dma_pages_cnt; i++)
> +		dma_map->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK;
>  }
>
>  static int xp_init_dma_info(struct xsk_buff_pool *pool, struct xsk_dma_map *dma_map)
> @@ -441,7 +449,7 @@ int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
>  	}
>
>  	if (pool->unaligned)
> -		xp_check_dma_contiguity(dma_map);
> +		xp_check_dma_contiguity(dma_map, pool->hugetlb);
>
>  	err = xp_init_dma_info(pool, dma_map);
>  	if (err) {
> --
> 2.39.2
>
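
Also, to make sure I understand the intended userspace flow: below is a rough,
untested sketch (mine, not taken from your series) of how I would expect an
application to register a UMEM with chunk_size > PAGE_SIZE after this patch,
i.e. an anonymous MAP_HUGETLB mapping plus the raw XDP_UMEM_REG setsockopt.
The 16 MiB UMEM length and 16 KiB chunk size are only illustrative, hugepages
are assumed to be reserved on the system, and the fill/completion ring setup
and bind() that a real application needs are omitted. Is that the setup you
have in mind?

/* Rough sketch only: minimal UMEM registration with chunk_size > PAGE_SIZE.
 * Assumes 2 MiB hugepages are available; skips ring setup and bind().
 */
#define _GNU_SOURCE
#include <linux/if_xdp.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/socket.h>

#ifndef AF_XDP
#define AF_XDP 44
#endif
#ifndef SOL_XDP
#define SOL_XDP 283
#endif

int main(void)
{
	const unsigned long long umem_len = 16ULL << 20; /* 16 MiB, hugepage multiple */
	struct xdp_umem_reg mr;
	void *umem_area;
	int fd;

	/* HugeTLB-backed and hugepage-aligned, as required for big chunks. */
	umem_area = mmap(NULL, umem_len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (umem_area == MAP_FAILED) {
		perror("mmap(MAP_HUGETLB)");
		return 1;
	}

	fd = socket(AF_XDP, SOCK_RAW, 0);
	if (fd < 0) {
		perror("socket(AF_XDP)");
		return 1;
	}

	memset(&mr, 0, sizeof(mr));
	mr.addr = (unsigned long long)(unsigned long)umem_area;
	mr.len = umem_len;
	mr.chunk_size = 16384;	/* > PAGE_SIZE: rejected with EINVAL before this patch */
	mr.headroom = 0;

	if (setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr))) {
		perror("setsockopt(XDP_UMEM_REG)");
		return 1;
	}

	printf("registered UMEM with %u byte chunks\n", mr.chunk_size);
	return 0;
}

If that is roughly right, it might be worth stating explicitly in the commit
message (or Documentation/networking/af_xdp.rst) that plain page-backed memory
is now rejected with -EINVAL for chunk sizes above PAGE_SIZE, so users know to
allocate the UMEM from hugepages.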