Hi, > -----Original Message----- > From: David Wei <dw@xxxxxxxxxxx> > Sent: Friday, January 17, 2025 7:17 AM > To: io-uring@xxxxxxxxxxxxxxx; netdev@xxxxxxxxxxxxxxx > Cc: Jens Axboe <axboe@xxxxxxxxx>; Pavel Begunkov <asml.silence@xxxxxxxxx>; > Jakub Kicinski <kuba@xxxxxxxxxx>; Paolo Abeni <pabeni@xxxxxxxxxx>; David S. > Miller <davem@xxxxxxxxxxxxx>; Eric Dumazet <edumazet@xxxxxxxxxx>; > Jesper Dangaard Brouer <hawk@xxxxxxxxxx>; David Ahern > <dsahern@xxxxxxxxxx>; Mina Almasry <almasrymina@xxxxxxxxxx>; Stanislav > Fomichev <stfomichev@xxxxxxxxx>; Joe Damato <jdamato@xxxxxxxxxx>; > Pedro Tammela <pctammela@xxxxxxxxxxxx> > Subject: [PATCH net-next v11 12/21] io_uring/zcrx: add io_zcrx_area > > Add io_zcrx_area that represents a region of userspace memory that is used for > zero copy. During ifq registration, userspace passes in the uaddr and len of > userspace memory, which is then pinned by the kernel. > Each net_iov is mapped to one of these pages. > > The freelist is a spinlock protected list that keeps track of all the net_iovs/pages > that aren't used. > > For now, there is only one area per ifq and area registration happens implicitly > as part of ifq registration. There is no API for adding/removing areas yet. The > struct for area registration is there for future extensibility once we support > multiple areas and TCP devmem. > > Reviewed-by: Jens Axboe <axboe@xxxxxxxxx> > Signed-off-by: Pavel Begunkov <asml.silence@xxxxxxxxx> > Signed-off-by: David Wei <dw@xxxxxxxxxxx> > --- > include/uapi/linux/io_uring.h | 9 ++++ > io_uring/rsrc.c | 2 +- > io_uring/rsrc.h | 1 + > io_uring/zcrx.c | 89 > ++++++++++++++++++++++++++++++++++- > io_uring/zcrx.h | 16 +++++++ > 5 files changed, 114 insertions(+), 3 deletions(-) > > diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index > 3af8b7a19824..e251f28507ce 100644 > --- a/include/uapi/linux/io_uring.h > +++ b/include/uapi/linux/io_uring.h > @@ -980,6 +980,15 @@ struct io_uring_zcrx_offsets { > __u64 __resv[2]; > }; > > +struct io_uring_zcrx_area_reg { > + __u64 addr; > + __u64 len; > + __u64 rq_area_token; > + __u32 flags; > + __u32 __resv1; > + __u64 __resv2[2]; > +}; > + > /* > * Argument for IORING_REGISTER_ZCRX_IFQ > */ > diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index f2ff108485c8..d0f11b5aec0d > 100644 > --- a/io_uring/rsrc.c > +++ b/io_uring/rsrc.c > @@ -77,7 +77,7 @@ static int io_account_mem(struct io_ring_ctx *ctx, > unsigned long nr_pages) > return 0; > } > > -static int io_buffer_validate(struct iovec *iov) > +int io_buffer_validate(struct iovec *iov) > { > unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); > > diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index c8b093584461..0ae54ddeb1fd > 100644 > --- a/io_uring/rsrc.h > +++ b/io_uring/rsrc.h > @@ -66,6 +66,7 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void > __user *arg, > unsigned size, unsigned type); > int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, > unsigned int size, unsigned int type); > +int io_buffer_validate(struct iovec *iov); > > bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, > struct io_imu_folio_data *data); diff --git > a/io_uring/zcrx.c b/io_uring/zcrx.c index f3ace7e8264d..04883a3ae80c 100644 > --- a/io_uring/zcrx.c > +++ b/io_uring/zcrx.c > @@ -10,6 +10,7 @@ > #include "kbuf.h" > #include "memmap.h" > #include "zcrx.h" > +#include "rsrc.h" > > #define IO_RQ_MAX_ENTRIES 32768 > > @@ -44,6 +45,79 @@ static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) > ifq->rqes = NULL; > } > > +static void io_zcrx_free_area(struct io_zcrx_area *area) { > + kvfree(area->freelist); > + kvfree(area->nia.niovs); > + if (area->pages) { > + unpin_user_pages(area->pages, area->nia.num_niovs); > + kvfree(area->pages); > + } > + kfree(area); > +} > + > +static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, > + struct io_zcrx_area **res, > + struct io_uring_zcrx_area_reg *area_reg) { > + struct io_zcrx_area *area; > + int i, ret, nr_pages; > + struct iovec iov; > + > + if (area_reg->flags || area_reg->rq_area_token) > + return -EINVAL; > + if (area_reg->__resv1 || area_reg->__resv2[0] || area_reg->__resv2[1]) > + return -EINVAL; > + if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK) > + return -EINVAL; > + > + iov.iov_base = u64_to_user_ptr(area_reg->addr); > + iov.iov_len = area_reg->len; > + ret = io_buffer_validate(&iov); > + if (ret) > + return ret; > + > + ret = -ENOMEM; > + area = kzalloc(sizeof(*area), GFP_KERNEL); > + if (!area) > + goto err; > + > + area->pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len, > + &nr_pages); > + if (IS_ERR(area->pages)) { > + ret = PTR_ERR(area->pages); > + area->pages = NULL; > + goto err; > + } > + area->nia.num_niovs = nr_pages; > + > + area->nia.niovs = kvmalloc_array(nr_pages, sizeof(area->nia.niovs[0]), > + GFP_KERNEL | __GFP_ZERO); > + if (!area->nia.niovs) > + goto err; > + > + area->freelist = kvmalloc_array(nr_pages, sizeof(area->freelist[0]), > + GFP_KERNEL | __GFP_ZERO); > + if (!area->freelist) > + goto err; > + > + for (i = 0; i < nr_pages; i++) > + area->freelist[i] = i; This is redundant as patch 14 will reinitialize it. > + > + area->free_count = nr_pages; > + area->ifq = ifq; > + /* we're only supporting one area per ifq for now */ > + area->area_id = 0; > + area_reg->rq_area_token = (u64)area->area_id << > IORING_ZCRX_AREA_SHIFT; > + spin_lock_init(&area->freelist_lock); > + *res = area; > + return 0; > +err: > + if (area) > + io_zcrx_free_area(area); > + return ret; > +} > + > static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx) { > struct io_zcrx_ifq *ifq; > @@ -59,6 +133,9 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct > io_ring_ctx *ctx) > > static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) { > + if (ifq->area) > + io_zcrx_free_area(ifq->area); > + > io_free_rbuf_ring(ifq); > kfree(ifq); > } > @@ -66,6 +143,7 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) int > io_register_zcrx_ifq(struct io_ring_ctx *ctx, > struct io_uring_zcrx_ifq_reg __user *arg) { > + struct io_uring_zcrx_area_reg area; > struct io_uring_zcrx_ifq_reg reg; > struct io_uring_region_desc rd; > struct io_zcrx_ifq *ifq; > @@ -99,7 +177,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, > } > reg.rq_entries = roundup_pow_of_two(reg.rq_entries); > > - if (!reg.area_ptr) > + if (copy_from_user(&area, u64_to_user_ptr(reg.area_ptr), > +sizeof(area))) > return -EFAULT; > > ifq = io_zcrx_ifq_alloc(ctx); > @@ -110,6 +188,10 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, > if (ret) > goto err; > > + ret = io_zcrx_create_area(ifq, &ifq->area, &area); > + if (ret) > + goto err; > + > ifq->rq_entries = reg.rq_entries; > ifq->if_rxq = reg.if_rxq; > > @@ -122,7 +204,10 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, > ret = -EFAULT; > goto err; > } > - > + if (copy_to_user(u64_to_user_ptr(reg.area_ptr), &area, sizeof(area))) { > + ret = -EFAULT; > + goto err; > + } > ctx->ifq = ifq; > return 0; > err: > diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index > 58e4ab6c6083..53fd94b65b38 100644 > --- a/io_uring/zcrx.h > +++ b/io_uring/zcrx.h > @@ -3,9 +3,25 @@ > #define IOU_ZC_RX_H > > #include <linux/io_uring_types.h> > +#include <net/page_pool/types.h> > + > +struct io_zcrx_area { > + struct net_iov_area nia; > + struct io_zcrx_ifq *ifq; > + > + u16 area_id; > + struct page **pages; > + > + /* freelist */ > + spinlock_t freelist_lock ____cacheline_aligned_in_smp; > + u32 free_count; > + u32 *freelist; > +}; > > struct io_zcrx_ifq { > struct io_ring_ctx *ctx; > + struct io_zcrx_area *area; > + > struct io_uring *rq_ring; > struct io_uring_zcrx_rqe *rqes; > u32 rq_entries; > -- > 2.43.5 > > -- Li Zetao