On Tue, Dec 17, 2024 at 4:38 PM David Wei <dw@xxxxxxxxxxx> wrote:
>
> From: Pavel Begunkov <asml.silence@xxxxxxxxx>
>
> Setup DMA mappings for the area into which we intend to receive data
> later on. We know the device we want to attach to even before we get a
> page pool and can pre-map in advance. All net_iov are synchronised for
> device when allocated, see page_pool_mp_return_in_cache().
>
> Signed-off-by: Pavel Begunkov <asml.silence@xxxxxxxxx>
> Signed-off-by: David Wei <dw@xxxxxxxxxxx>
> ---
>  include/uapi/linux/netdev.h |   1 +
>  io_uring/zcrx.c             | 320 ++++++++++++++++++++++++++++++++++++
>  io_uring/zcrx.h             |   4 +
>  3 files changed, 325 insertions(+)
>
> diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
> index e4be227d3ad6..13d810a28ed6 100644
> --- a/include/uapi/linux/netdev.h
> +++ b/include/uapi/linux/netdev.h
> @@ -94,6 +94,7 @@ enum {
>          NETDEV_A_PAGE_POOL_INFLIGHT_MEM,
>          NETDEV_A_PAGE_POOL_DETACH_TIME,
>          NETDEV_A_PAGE_POOL_DMABUF,
> +        NETDEV_A_PAGE_POOL_IO_URING,
>
>          __NETDEV_A_PAGE_POOL_MAX,
>          NETDEV_A_PAGE_POOL_MAX = (__NETDEV_A_PAGE_POOL_MAX - 1)
> diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
> index e6cca6747148..42098bc1a60f 100644
> --- a/io_uring/zcrx.c
> +++ b/io_uring/zcrx.c
> @@ -1,11 +1,18 @@
>  // SPDX-License-Identifier: GPL-2.0
>  #include <linux/kernel.h>
>  #include <linux/errno.h>
> +#include <linux/dma-map-ops.h>
>  #include <linux/mm.h>
> +#include <linux/nospec.h>
>  #include <linux/io_uring.h>
>  #include <linux/netdevice.h>
>  #include <linux/rtnetlink.h>
>
> +#include <net/page_pool/helpers.h>
> +#include <net/netlink.h>
> +
> +#include <trace/events/page_pool.h>
> +
>  #include <uapi/linux/io_uring.h>
>
>  #include "io_uring.h"
> @@ -14,8 +21,92 @@
>  #include "zcrx.h"
>  #include "rsrc.h"
>
> +#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)
> +
> +static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
> +                                 struct io_zcrx_area *area, int nr_mapped)
> +{
> +        struct device *dev = ifq->dev->dev.parent;
> +        int i;
> +
> +        for (i = 0; i < nr_mapped; i++) {
> +                struct net_iov *niov = &area->nia.niovs[i];
> +                dma_addr_t dma;
> +
> +                dma = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov));
> +                dma_unmap_page_attrs(dev, dma, PAGE_SIZE, DMA_FROM_DEVICE,
> +                                     IO_DMA_ATTR);
> +                page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), 0);
> +        }
> +}
> +
> +static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
> +{
> +        if (area->is_mapped)
> +                __io_zcrx_unmap_area(ifq, area, area->nia.num_niovs);
> +}
> +
> +static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
> +{
> +        struct device *dev = ifq->dev->dev.parent;
> +        int i;
> +
> +        if (!dev)
> +                return -EINVAL;
> +
> +        for (i = 0; i < area->nia.num_niovs; i++) {
> +                struct net_iov *niov = &area->nia.niovs[i];
> +                dma_addr_t dma;
> +
> +                dma = dma_map_page_attrs(dev, area->pages[i], 0, PAGE_SIZE,
> +                                         DMA_FROM_DEVICE, IO_DMA_ATTR);
> +                if (dma_mapping_error(dev, dma))
> +                        break;
> +                if (page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), dma)) {
> +                        dma_unmap_page_attrs(dev, dma, PAGE_SIZE,
> +                                             DMA_FROM_DEVICE, IO_DMA_ATTR);
> +                        break;
> +                }
> +        }
> +
> +        if (i != area->nia.num_niovs) {
> +                __io_zcrx_unmap_area(ifq, area, i);
> +                return -EINVAL;
> +        }
> +
> +        area->is_mapped = true;
> +        return 0;
> +}
> +
>  #define IO_RQ_MAX_ENTRIES               32768
>
> +__maybe_unused
> +static const struct memory_provider_ops io_uring_pp_zc_ops;
> +
> +static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
> +{
> +        struct net_iov_area *owner = net_iov_owner(niov);
> +
> +        return container_of(owner, struct io_zcrx_area, nia);
> +}
> +
> +static inline atomic_t *io_get_user_counter(struct net_iov *niov)
> +{
> +        struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
> +
> +        return &area->user_refs[net_iov_idx(niov)];
> +}
> +
> +static bool io_zcrx_put_niov_uref(struct net_iov *niov)
> +{
> +        atomic_t *uref = io_get_user_counter(niov);
> +
> +        if (unlikely(!atomic_read(uref)))
> +                return false;
> +        atomic_dec(uref);
> +        return true;
> +}
> +
>  static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
>                                   struct io_uring_zcrx_ifq_reg *reg,
>                                   struct io_uring_region_desc *rd)
> @@ -49,8 +140,11 @@ static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
>
>  static void io_zcrx_free_area(struct io_zcrx_area *area)
>  {
> +        io_zcrx_unmap_area(area->ifq, area);
> +
>          kvfree(area->freelist);
>          kvfree(area->nia.niovs);
> +        kvfree(area->user_refs);
>          if (area->pages) {
>                  unpin_user_pages(area->pages, area->nia.num_niovs);
>                  kvfree(area->pages);
> @@ -106,6 +200,19 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
>          for (i = 0; i < nr_pages; i++)
>                  area->freelist[i] = i;
>
> +        area->user_refs = kvmalloc_array(nr_pages, sizeof(area->user_refs[0]),
> +                                         GFP_KERNEL | __GFP_ZERO);
> +        if (!area->user_refs)
> +                goto err;
> +
> +        for (i = 0; i < nr_pages; i++) {
> +                struct net_iov *niov = &area->nia.niovs[i];
> +
> +                niov->owner = &area->nia;
> +                area->freelist[i] = i;
> +                atomic_set(&area->user_refs[i], 0);
> +        }
> +
>          area->free_count = nr_pages;
>          area->ifq = ifq;
>          /* we're only supporting one area per ifq for now */
> @@ -130,6 +237,7 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
>
>          ifq->if_rxq = -1;
>          ifq->ctx = ctx;
> +        spin_lock_init(&ifq->rq_lock);
>          return ifq;
>  }
>
> @@ -205,6 +313,10 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
>          if (!ifq->dev)
>                  goto err;
>
> +        ret = io_zcrx_map_area(ifq, ifq->area);
> +        if (ret)
> +                goto err;
> +
>          reg.offsets.rqes = sizeof(struct io_uring);
>          reg.offsets.head = offsetof(struct io_uring, head);
>          reg.offsets.tail = offsetof(struct io_uring, tail);
> @@ -238,7 +350,215 @@ void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
>          io_zcrx_ifq_free(ifq);
>  }
>
> +static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area)
> +{
> +        unsigned niov_idx;
> +
> +        lockdep_assert_held(&area->freelist_lock);
> +
> +        niov_idx = area->freelist[--area->free_count];
> +        return &area->nia.niovs[niov_idx];
> +}
> +
> +static void io_zcrx_return_niov_freelist(struct net_iov *niov)
> +{
> +        struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
> +
> +        spin_lock_bh(&area->freelist_lock);
> +        area->freelist[area->free_count++] = net_iov_idx(niov);
> +        spin_unlock_bh(&area->freelist_lock);
> +}
> +
> +static void io_zcrx_return_niov(struct net_iov *niov)
> +{
> +        netmem_ref netmem = net_iov_to_netmem(niov);
> +
> +        page_pool_put_unrefed_netmem(niov->pp, netmem, -1, false);
> +}
> +
> +static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
> +{
> +        struct io_zcrx_area *area = ifq->area;
> +        int i;
> +
> +        if (!area)
> +                return;
> +
> +        /* Reclaim back all buffers given to the user space. */
> +        for (i = 0; i < area->nia.num_niovs; i++) {
> +                struct net_iov *niov = &area->nia.niovs[i];
> +                int nr;
> +
> +                if (!atomic_read(io_get_user_counter(niov)))
> +                        continue;
> +                nr = atomic_xchg(io_get_user_counter(niov), 0);
> +                if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr))
> +                        io_zcrx_return_niov(niov);
> +        }
> +}
> +
>  void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
>  {
>          lockdep_assert_held(&ctx->uring_lock);
> +
> +        if (ctx->ifq)
> +                io_zcrx_scrub(ctx->ifq);
> +}
> +
> +static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq)
> +{
> +        u32 entries;
> +
> +        entries = smp_load_acquire(&ifq->rq_ring->tail) - ifq->cached_rq_head;
> +        return min(entries, ifq->rq_entries);
> +}
> +
> +static struct io_uring_zcrx_rqe *io_zcrx_get_rqe(struct io_zcrx_ifq *ifq,
> +                                                 unsigned mask)
> +{
> +        unsigned int idx = ifq->cached_rq_head++ & mask;
> +
> +        return &ifq->rqes[idx];
> +}
> +
> +static void io_zcrx_ring_refill(struct page_pool *pp,
> +                                struct io_zcrx_ifq *ifq)
> +{
> +        unsigned int mask = ifq->rq_entries - 1;
> +        unsigned int entries;
> +        netmem_ref netmem;
> +
> +        spin_lock_bh(&ifq->rq_lock);
> +
> +        entries = io_zcrx_rqring_entries(ifq);
> +        entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL - pp->alloc.count);
> +        if (unlikely(!entries)) {
> +                spin_unlock_bh(&ifq->rq_lock);
> +                return;
> +        }
> +
> +        do {
> +                struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(ifq, mask);
> +                struct io_zcrx_area *area;
> +                struct net_iov *niov;
> +                unsigned niov_idx, area_idx;
> +
> +                area_idx = rqe->off >> IORING_ZCRX_AREA_SHIFT;
> +                niov_idx = (rqe->off & ~IORING_ZCRX_AREA_MASK) >> PAGE_SHIFT;
> +
> +                if (unlikely(rqe->__pad || area_idx))
> +                        continue;
> +                area = ifq->area;
> +
> +                if (unlikely(niov_idx >= area->nia.num_niovs))
> +                        continue;
> +                niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs);
> +
> +                niov = &area->nia.niovs[niov_idx];
> +                if (!io_zcrx_put_niov_uref(niov))
> +                        continue;

I have a suspicion that the uref is now redundant in this series, although
I'm not 100% sure. You seem to acquire a uref and a pp_ref in tandem in
io_zcrx_recv_frag and drop both in tandem in this function, which makes me
think the uref may be redundant now. io_zcrx_copy_chunk acquires a uref but
not a pp_ref; if copy_chunk could do a page_pool_ref_netmem() instead of
taking a uref, you might be able to do without urefs entirely (rough sketch
below). I have not looked at the copy fallback code closely, though.

> +
> +                netmem = net_iov_to_netmem(niov);
> +                if (page_pool_unref_netmem(netmem, 1) != 0)
> +                        continue;
> +
> +                if (unlikely(niov->pp != pp)) {
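
To make the uref suggestion above a bit more concrete, the shape I had in
mind is roughly the following. This is an untested sketch with made-up
helper names, and io_zcrx_copy_chunk itself isn't in this patch, so I may
well be missing something about the copy path:

static void io_zcrx_get_copy_ref(struct net_iov *niov)
{
        /* copy fallback pins the buffer with a pp ref instead of a uref */
        page_pool_ref_netmem(net_iov_to_netmem(niov));
}

static void io_zcrx_put_copy_ref(struct net_iov *niov)
{
        /* last pp ref gone -> hand the buffer back to the page pool */
        if (!page_pool_unref_netmem(net_iov_to_netmem(niov), 1))
                io_zcrx_return_niov(niov);
}

i.e. the copy path would take and drop plain pp refs, the refill loop here
would only ever deal with pp refs, and the separate uref counters could go
away. Just an idea, feel free to ignore if the copy fallback needs the
per-user accounting for some other reason.
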