From: David Wei <davidhwei@xxxxxxxx> This patch introduces a new object in io_uring called an interface queue (ifq) which contains: * A pool region allocated by userspace and registered w/ io_uring where Rx data is transferred to via DMA. * A net device and one specific Rx queue in it that will be configured for ZC Rx. * A new shared ringbuf w/ userspace called a refill ring. When userspace is done with bufs with Rx packet payloads, it writes entries into this ring to tell the kernel that bufs can be re-used by the NIC again. Each entry in the refill ring is a struct io_uring_rbuf_rqe. On the completion side, the main CQ ring is used to notify userspace of recv()'d packets. Big CQEs (32 bytes) are required to support this, as the upper 16 bytes are used by ZC Rx to store a feature specific struct io_uring_rbuf_cqe. Add two new struct types: 1. io_uring_rbuf_rqe - entry in refill ring 2. io_uring_rbuf_cqe - entry in upper 16 bytes of a big CQE For now, each io_uring instance has a single ifq, and each ifq has a single pool region associated with one Rx queue. Add a new opcode and functions to setup and tear down an ifq. Size and offsets of the shared refill ring are returned to userspace for it to mmap in the registration struct io_uring_zc_rx_ifq_reg, similar to the main SQ/CQ rings. Signed-off-by: David Wei <dw@xxxxxxxxxxx> --- include/linux/io_uring_types.h | 4 ++ include/uapi/linux/io_uring.h | 40 ++++++++++++ io_uring/Makefile | 3 +- io_uring/io_uring.c | 7 +++ io_uring/register.c | 7 +++ io_uring/zc_rx.c | 109 +++++++++++++++++++++++++++++++++ io_uring/zc_rx.h | 35 +++++++++++ 7 files changed, 204 insertions(+), 1 deletion(-) create mode 100644 io_uring/zc_rx.c create mode 100644 io_uring/zc_rx.h diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 500772189fee..27e750a02ea5 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -39,6 +39,8 @@ enum io_uring_cmd_flags { IO_URING_F_COMPAT = (1 << 12), }; +struct io_zc_rx_ifq; + struct io_wq_work_node { struct io_wq_work_node *next; }; @@ -385,6 +387,8 @@ struct io_ring_ctx { struct io_rsrc_data *file_data; struct io_rsrc_data *buf_data; + struct io_zc_rx_ifq *ifq; + /* protected by ->uring_lock */ struct list_head rsrc_ref_list; struct io_alloc_cache rsrc_node_cache; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 7bd10201a02b..7b643fe420c5 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -575,6 +575,9 @@ enum { IORING_REGISTER_NAPI = 27, IORING_UNREGISTER_NAPI = 28, + /* register a network interface queue for zerocopy */ + IORING_REGISTER_ZC_RX_IFQ = 29, + /* this goes last */ IORING_REGISTER_LAST, @@ -782,6 +785,43 @@ enum { SOCKET_URING_OP_SETSOCKOPT, }; +struct io_uring_rbuf_rqe { + __u32 off; + __u32 len; + __u16 region; + __u8 __pad[6]; +}; + +struct io_uring_rbuf_cqe { + __u32 off; + __u32 len; + __u16 region; + __u8 __pad[6]; +}; + +struct io_rbuf_rqring_offsets { + __u32 head; + __u32 tail; + __u32 rqes; + __u8 __pad[4]; +}; + +/* + * Argument for IORING_REGISTER_ZC_RX_IFQ + */ +struct io_uring_zc_rx_ifq_reg { + __u32 if_idx; + /* hw rx descriptor ring id */ + __u32 if_rxq_id; + __u32 region_id; + __u32 rq_entries; + __u32 flags; + __u16 cpu; + + __u32 mmap_sz; + struct io_rbuf_rqring_offsets rq_off; +}; + #ifdef __cplusplus } #endif diff --git a/io_uring/Makefile b/io_uring/Makefile index 2e1d4e03799c..bb47231c611b 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -8,7 +8,8 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ statx.o net.o msg_ring.o timeout.o \ sqpoll.o fdinfo.o tctx.o poll.o \ cancel.o kbuf.o rsrc.o rw.o opdef.o \ - notif.o waitid.o register.o truncate.o + notif.o waitid.o register.o truncate.o \ + zc_rx.o obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index e44c2ef271b9..5614c47cecd9 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -95,6 +95,7 @@ #include "waitid.h" #include "futex.h" #include "napi.h" +#include "zc_rx.h" #include "timeout.h" #include "poll.h" @@ -2861,6 +2862,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) return; mutex_lock(&ctx->uring_lock); + io_unregister_zc_rx_ifqs(ctx); if (ctx->buf_data) __io_sqe_buffers_unregister(ctx); if (ctx->file_data) @@ -3032,6 +3034,11 @@ static __cold void io_ring_exit_work(struct work_struct *work) io_cqring_overflow_kill(ctx); mutex_unlock(&ctx->uring_lock); } + if (ctx->ifq) { + mutex_lock(&ctx->uring_lock); + io_shutdown_zc_rx_ifqs(ctx); + mutex_unlock(&ctx->uring_lock); + } if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) io_move_task_work_from_local(ctx); diff --git a/io_uring/register.c b/io_uring/register.c index 99c37775f974..760f0b6a051c 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -27,6 +27,7 @@ #include "cancel.h" #include "kbuf.h" #include "napi.h" +#include "zc_rx.h" #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) @@ -563,6 +564,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_unregister_napi(ctx, arg); break; + case IORING_REGISTER_ZC_RX_IFQ: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_register_zc_rx_ifq(ctx, arg); + break; default: ret = -EINVAL; break; diff --git a/io_uring/zc_rx.c b/io_uring/zc_rx.c new file mode 100644 index 000000000000..e6c33f94c086 --- /dev/null +++ b/io_uring/zc_rx.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0 +#if defined(CONFIG_PAGE_POOL) +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/mm.h> +#include <linux/io_uring.h> + +#include <uapi/linux/io_uring.h> + +#include "io_uring.h" +#include "kbuf.h" +#include "zc_rx.h" + +static int io_allocate_rbuf_ring(struct io_zc_rx_ifq *ifq, + struct io_uring_zc_rx_ifq_reg *reg) +{ + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; + size_t off, rq_size; + void *ptr; + + off = sizeof(struct io_uring); + rq_size = reg->rq_entries * sizeof(struct io_uring_rbuf_rqe); + ptr = (void *) __get_free_pages(gfp, get_order(off + rq_size)); + if (!ptr) + return -ENOMEM; + ifq->rq_ring = (struct io_uring *)ptr; + ifq->rqes = (struct io_uring_rbuf_rqe *)((char *)ptr + off); + return 0; +} + +static void io_free_rbuf_ring(struct io_zc_rx_ifq *ifq) +{ + if (ifq->rq_ring) + folio_put(virt_to_folio(ifq->rq_ring)); +} + +static struct io_zc_rx_ifq *io_zc_rx_ifq_alloc(struct io_ring_ctx *ctx) +{ + struct io_zc_rx_ifq *ifq; + + ifq = kzalloc(sizeof(*ifq), GFP_KERNEL); + if (!ifq) + return NULL; + + ifq->if_rxq_id = -1; + ifq->ctx = ctx; + return ifq; +} + +static void io_zc_rx_ifq_free(struct io_zc_rx_ifq *ifq) +{ + io_free_rbuf_ring(ifq); + kfree(ifq); +} + +int io_register_zc_rx_ifq(struct io_ring_ctx *ctx, + struct io_uring_zc_rx_ifq_reg __user *arg) +{ + struct io_uring_zc_rx_ifq_reg reg; + struct io_zc_rx_ifq *ifq; + int ret; + + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN && + ctx->flags & IORING_SETUP_CQE32)) + return -EINVAL; + if (copy_from_user(®, arg, sizeof(reg))) + return -EFAULT; + if (ctx->ifq) + return -EBUSY; + if (reg.if_rxq_id == -1) + return -EINVAL; + + ifq = io_zc_rx_ifq_alloc(ctx); + if (!ifq) + return -ENOMEM; + + ret = io_allocate_rbuf_ring(ifq, ®); + if (ret) + goto err; + + ifq->rq_entries = reg.rq_entries; + ifq->if_rxq_id = reg.if_rxq_id; + ctx->ifq = ifq; + + return 0; +err: + io_zc_rx_ifq_free(ifq); + return ret; +} + +void io_unregister_zc_rx_ifqs(struct io_ring_ctx *ctx) +{ + struct io_zc_rx_ifq *ifq = ctx->ifq; + + lockdep_assert_held(&ctx->uring_lock); + + if (!ifq) + return; + + ctx->ifq = NULL; + io_zc_rx_ifq_free(ifq); +} + +void io_shutdown_zc_rx_ifqs(struct io_ring_ctx *ctx) +{ + lockdep_assert_held(&ctx->uring_lock); +} + +#endif diff --git a/io_uring/zc_rx.h b/io_uring/zc_rx.h new file mode 100644 index 000000000000..35b019b275e0 --- /dev/null +++ b/io_uring/zc_rx.h @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IOU_ZC_RX_H +#define IOU_ZC_RX_H + +struct io_zc_rx_ifq { + struct io_ring_ctx *ctx; + struct net_device *dev; + struct io_uring *rq_ring; + struct io_uring_rbuf_rqe *rqes; + u32 rq_entries; + + /* hw rx descriptor ring id */ + u32 if_rxq_id; +}; + +#if defined(CONFIG_PAGE_POOL) +int io_register_zc_rx_ifq(struct io_ring_ctx *ctx, + struct io_uring_zc_rx_ifq_reg __user *arg); +void io_unregister_zc_rx_ifqs(struct io_ring_ctx *ctx); +void io_shutdown_zc_rx_ifqs(struct io_ring_ctx *ctx); +#else +static inline int io_register_zc_rx_ifq(struct io_ring_ctx *ctx, + struct io_uring_zc_rx_ifq_reg __user *arg) +{ + return -EOPNOTSUPP; +} +static inline void io_unregister_zc_rx_ifqs(struct io_ring_ctx *ctx) +{ +} +static inline void io_shutdown_zc_rx_ifqs(struct io_ring_ctx *ctx) +{ +} +#endif + +#endif -- 2.43.0