From: David Wei <davidhwei@xxxxxxxx> [TODO: REVIEW COMMIT MESSAGE] This patch adds two objects: * Zero copy buffer representation, holding a page, its mapped dma_addr, and a refcount for lifetime management. * Zero copy pool, spiritually similar to page pool, that holds ZC bufs and hands them out to net devices. Pool regions are registered with io_uring using the registered buffer API, with a 1:1 mapping between a pool region and one of the iovecs passed to io_uring_register_buffers. The registered buffer API does the heavy lifting for us: it pins the pages and chunks them into bvecs held in a struct io_mapped_ubuf. For now, as there is only one pool region per ifq, there is no separate API for adding/removing regions yet; the region is mapped implicitly during ifq registration. Signed-off-by: David Wei <dw@xxxxxxxxxxx> --- include/linux/io_uring/net.h | 8 +++ io_uring/zc_rx.c | 135 ++++++++++++++++++++++++++++++++++- io_uring/zc_rx.h | 15 ++++ 3 files changed, 157 insertions(+), 1 deletion(-) diff --git a/include/linux/io_uring/net.h b/include/linux/io_uring/net.h index b58f39fed4d5..d994d26116d0 100644 --- a/include/linux/io_uring/net.h +++ b/include/linux/io_uring/net.h @@ -2,8 +2,16 @@ #ifndef _LINUX_IO_URING_NET_H #define _LINUX_IO_URING_NET_H +#include <net/page_pool/types.h> + struct io_uring_cmd; +struct io_zc_rx_buf { + struct page_pool_iov ppiov; + struct page *page; + dma_addr_t dma; +}; + #if defined(CONFIG_IO_URING) int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags); diff --git a/io_uring/zc_rx.c b/io_uring/zc_rx.c index 06e2c54d3f3d..1e656b481725 100644 --- a/io_uring/zc_rx.c +++ b/io_uring/zc_rx.c @@ -5,6 +5,7 @@ #include <linux/mm.h> #include <linux/io_uring.h> #include <linux/netdevice.h> +#include <linux/nospec.h> #include <uapi/linux/io_uring.h> @@ -15,6 +16,11 @@ typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); +static inline struct device *netdev2dev(struct net_device *dev) +{ + return dev->dev.parent; +} + static int __io_queue_mgmt(struct net_device *dev, 
struct io_zc_rx_ifq *ifq, u16 queue_id) { @@ -67,6 +73,129 @@ static void io_free_rbuf_ring(struct io_zc_rx_ifq *ifq) folio_put(virt_to_folio(ifq->ring)); } +static int io_zc_rx_init_buf(struct device *dev, struct page *page, u16 pool_id, + u32 pgid, struct io_zc_rx_buf *buf) +{ + dma_addr_t addr = 0; + + /* Skip dma setup for devices that don't do any DMA transfers */ + if (dev) { + addr = dma_map_page_attrs(dev, page, 0, PAGE_SIZE, + DMA_BIDIRECTIONAL, + DMA_ATTR_SKIP_CPU_SYNC); + if (dma_mapping_error(dev, addr)) + return -ENOMEM; + } + + buf->dma = addr; + buf->page = page; + refcount_set(&buf->ppiov.refcount, 0); + buf->ppiov.owner = NULL; + buf->ppiov.pp = NULL; + get_page(page); + return 0; +} + +static void io_zc_rx_free_buf(struct device *dev, struct io_zc_rx_buf *buf) +{ + struct page *page = buf->page; + + if (dev) + dma_unmap_page_attrs(dev, buf->dma, PAGE_SIZE, + DMA_BIDIRECTIONAL, + DMA_ATTR_SKIP_CPU_SYNC); + put_page(page); +} + +static int io_zc_rx_map_pool(struct io_zc_rx_pool *pool, + struct io_mapped_ubuf *imu, + struct device *dev) +{ + struct io_zc_rx_buf *buf; + struct page *page; + int i, ret; + + for (i = 0; i < imu->nr_bvecs; i++) { + page = imu->bvec[i].bv_page; + buf = &pool->bufs[i]; + ret = io_zc_rx_init_buf(dev, page, pool->pool_id, i, buf); + if (ret) + goto err; + + pool->freelist[i] = i; + } + + pool->free_count = imu->nr_bvecs; + return 0; +err: + while (i--) { + buf = &pool->bufs[i]; + io_zc_rx_free_buf(dev, buf); + } + return ret; +} + +static int io_zc_rx_create_pool(struct io_ring_ctx *ctx, + struct io_zc_rx_ifq *ifq, + u16 id) +{ + struct device *dev = netdev2dev(ifq->dev); + struct io_mapped_ubuf *imu; + struct io_zc_rx_pool *pool; + int nr_pages; + int ret; + + if (ifq->pool) + return -EFAULT; + + if (unlikely(id >= ctx->nr_user_bufs)) + return -EFAULT; + id = array_index_nospec(id, ctx->nr_user_bufs); + imu = ctx->user_bufs[id]; + if (imu->ubuf & ~PAGE_MASK || imu->ubuf_end & ~PAGE_MASK) + return -EFAULT; + + ret = -ENOMEM; 
+ nr_pages = imu->nr_bvecs; + pool = kvmalloc(struct_size(pool, freelist, nr_pages), GFP_KERNEL); + if (!pool) + goto err; + + pool->bufs = kvmalloc_array(nr_pages, sizeof(*pool->bufs), GFP_KERNEL); + if (!pool->bufs) + goto err_buf; + + ret = io_zc_rx_map_pool(pool, imu, dev); + if (ret) + goto err_map; + + pool->ifq = ifq; + pool->pool_id = id; + pool->nr_bufs = nr_pages; + spin_lock_init(&pool->freelist_lock); + ifq->pool = pool; + return 0; +err_map: + kvfree(pool->bufs); +err_buf: + kvfree(pool); +err: + return ret; +} + +static void io_zc_rx_destroy_pool(struct io_zc_rx_pool *pool) +{ + struct device *dev = netdev2dev(pool->ifq->dev); + struct io_zc_rx_buf *buf; + + for (int i = 0; i < pool->nr_bufs; i++) { + buf = &pool->bufs[i]; + io_zc_rx_free_buf(dev, buf); + } + kvfree(pool->bufs); + kvfree(pool); +} + static struct io_zc_rx_ifq *io_zc_rx_ifq_alloc(struct io_ring_ctx *ctx) { struct io_zc_rx_ifq *ifq; @@ -105,6 +234,8 @@ static void io_zc_rx_ifq_free(struct io_zc_rx_ifq *ifq) { io_shutdown_ifq(ifq); + if (ifq->pool) + io_zc_rx_destroy_pool(ifq->pool); if (ifq->dev) dev_put(ifq->dev); io_free_rbuf_ring(ifq); @@ -141,7 +272,9 @@ int io_register_zc_rx_ifq(struct io_ring_ctx *ctx, if (!ifq->dev) goto err; - /* TODO: map zc region and initialise zc pool */ + ret = io_zc_rx_create_pool(ctx, ifq, reg.region_id); + if (ret) + goto err; ifq->rq_entries = reg.rq_entries; ifq->cq_entries = reg.cq_entries; diff --git a/io_uring/zc_rx.h b/io_uring/zc_rx.h index 9257dda77e92..af1d865525d2 100644 --- a/io_uring/zc_rx.h +++ b/io_uring/zc_rx.h @@ -3,15 +3,30 @@ #define IOU_ZC_RX_H #include <linux/io_uring_types.h> +#include <linux/io_uring/net.h> #include <linux/skbuff.h> #define IO_ZC_MAX_IFQ_SOCKETS 16 #define IO_ZC_IFQ_IDX_OFFSET 16 #define IO_ZC_IFQ_IDX_MASK ((1U << IO_ZC_IFQ_IDX_OFFSET) - 1) +struct io_zc_rx_pool { + struct io_zc_rx_ifq *ifq; + struct io_zc_rx_buf *bufs; + u32 nr_bufs; + u16 pool_id; + + /* freelist */ + spinlock_t freelist_lock; + u32 free_count; + 
u32 freelist[]; +}; + struct io_zc_rx_ifq { struct io_ring_ctx *ctx; struct net_device *dev; + struct io_zc_rx_pool *pool; + struct io_rbuf_ring *ring; struct io_uring_rbuf_rqe *rqes; struct io_uring_rbuf_cqe *cqes; -- 2.39.3