On Fri, Feb 14, 2025 at 6:22 PM Caleb Sander Mateos <csander@xxxxxxxxxxxxxxx> wrote: > > On Fri, Feb 14, 2025 at 7:46 AM Keith Busch <kbusch@xxxxxxxx> wrote: > > > > From: Keith Busch <kbusch@xxxxxxxxxx> > > > > Frequent alloc/free cycles on these is pretty costly. Use an io cache to > > more efficiently reuse these buffers. > > > > Signed-off-by: Keith Busch <kbusch@xxxxxxxxxx> > > --- > > include/linux/io_uring_types.h | 18 +++--- > > io_uring/filetable.c | 2 +- > > io_uring/rsrc.c | 114 +++++++++++++++++++++++++-------- > > io_uring/rsrc.h | 2 +- > > 4 files changed, 99 insertions(+), 37 deletions(-) > > > > diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h > > index d8d717cce427f..ebaaa1c7e210f 100644 > > --- a/include/linux/io_uring_types.h > > +++ b/include/linux/io_uring_types.h > > @@ -67,8 +67,18 @@ struct io_file_table { > > unsigned int alloc_hint; > > }; > > > > +struct io_alloc_cache { > > + void **entries; > > + unsigned int nr_cached; > > + unsigned int max_cached; > > + size_t elem_size; > > + unsigned int init_clear; > > +}; > > + > > struct io_buf_table { > > struct io_rsrc_data data; > > + struct io_alloc_cache node_cache; > > + struct io_alloc_cache imu_cache; > > }; > > > > struct io_hash_bucket { > > @@ -222,14 +232,6 @@ struct io_submit_state { > > struct blk_plug plug; > > }; > > > > -struct io_alloc_cache { > > - void **entries; > > - unsigned int nr_cached; > > - unsigned int max_cached; > > - unsigned int elem_size; > > - unsigned int init_clear; > > -}; > > - > > struct io_ring_ctx { > > /* const or read-mostly hot data */ > > struct { > > diff --git a/io_uring/filetable.c b/io_uring/filetable.c > > index dd8eeec97acf6..a21660e3145ab 100644 > > --- a/io_uring/filetable.c > > +++ b/io_uring/filetable.c > > @@ -68,7 +68,7 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, > > if (slot_index >= ctx->file_table.data.nr) > > return -EINVAL; > > > > - node = io_rsrc_node_alloc(IORING_RSRC_FILE); > > + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); > > if (!node) > > return -ENOMEM; > > > > diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c > > index fd7a1b04db8b7..26ff9b5851d94 100644 > > --- a/io_uring/rsrc.c > > +++ b/io_uring/rsrc.c > > @@ -32,6 +32,8 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, > > #define IORING_MAX_FIXED_FILES (1U << 20) > > #define IORING_MAX_REG_BUFFERS (1U << 14) > > > > +#define IO_CACHED_BVECS_SEGS 32 > > + > > int __io_account_mem(struct user_struct *user, unsigned long nr_pages) > > { > > unsigned long page_limit, cur_pages, new_pages; > > @@ -122,19 +124,33 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node) > > kvfree(imu); > > } > > > > -struct io_rsrc_node *io_rsrc_node_alloc(int type) > > + > > +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type) > > nit: extra blank line added here > > > { > > struct io_rsrc_node *node; > > > > - node = kzalloc(sizeof(*node), GFP_KERNEL); > > + if (type == IORING_RSRC_FILE) > > + node = kmalloc(sizeof(*node), GFP_KERNEL); > > + else > > + node = io_cache_alloc(&ctx->buf_table.node_cache, GFP_KERNEL); > > if (node) { > > node->type = type; > > node->refs = 1; > > + node->tag = 0; > > + node->file_ptr = 0; > > } > > return node; > > } > > > > -__cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data) > > +static __cold void __io_rsrc_data_free(struct io_rsrc_data *data) > > +{ > > + kvfree(data->nodes); > > + data->nodes = NULL; > > + data->nr = 0; > > +} > > + > > +__cold void io_rsrc_data_free(struct io_ring_ctx *ctx, > > + struct io_rsrc_data *data) > > { > > if (!data->nr) > > return; > > @@ -142,9 +158,7 @@ __cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data > > if (data->nodes[data->nr]) > > io_put_rsrc_node(ctx, data->nodes[data->nr]); > > } > > - kvfree(data->nodes); > > - data->nodes = NULL; > > - data->nr = 0; > > + __io_rsrc_data_free(data); > > } > > > > __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr) > > @@ -158,6 +172,33 @@ __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr) > > return -ENOMEM; > > } > > > > +static __cold int io_rsrc_buffer_alloc(struct io_buf_table *table, unsigned nr) > > +{ > > + const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec, > > + IO_CACHED_BVECS_SEGS); > > + int ret; > > + > > + ret = io_rsrc_data_alloc(&table->data, nr); > > + if (ret) > > + return ret; > > + > > + ret = io_alloc_cache_init(&table->node_cache, nr, > > + sizeof(struct io_rsrc_node), 0); > > + if (ret) > > + goto out_1; > > + > > + ret = io_alloc_cache_init(&table->imu_cache, nr, imu_cache_size, 0); > > + if (ret) > > + goto out_2; > > io_alloc_cache_init() returns bool. Probably these cases should return > -ENOMEM instead of 1? > > > + > > + return 0; > > +out_2: > > + io_alloc_cache_free(&table->node_cache, kfree); > > +out_1: > > + __io_rsrc_data_free(&table->data); > > + return ret; > > +} > > + > > static int __io_sqe_files_update(struct io_ring_ctx *ctx, > > struct io_uring_rsrc_update2 *up, > > unsigned nr_args) > > @@ -207,7 +248,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, > > err = -EBADF; > > break; > > } > > - node = io_rsrc_node_alloc(IORING_RSRC_FILE); > > + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); > > if (!node) { > > err = -ENOMEM; > > fput(file); > > @@ -269,7 +310,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, > > } > > node->tag = tag; > > } > > - i = array_index_nospec(up->offset + done, ctx->buf_table.nr); > > + i = array_index_nospec(up->offset + done, ctx->buf_table.data.nr); > > Looks like this change belongs in the prior patch "io_uring: add > abstraction for buf_table rsrc data"? > > > io_reset_rsrc_node(ctx, &ctx->buf_table.data, i); > > ctx->buf_table.data.nodes[i] = node; > > if (ctx->compat) > > @@ -459,6 +500,8 @@ void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) > > case IORING_RSRC_BUFFER: > > if (node->buf) > > io_buffer_unmap(ctx, node); > > + if (io_alloc_cache_put(&ctx->buf_table.node_cache, node)) > > + return; > > break; > > default: > > WARN_ON_ONCE(1); > > @@ -527,7 +570,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, > > goto fail; > > } > > ret = -ENOMEM; > > - node = io_rsrc_node_alloc(IORING_RSRC_FILE); > > + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); > > if (!node) { > > fput(file); > > goto fail; > > @@ -547,11 +590,19 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, > > return ret; > > } > > > > +static void io_rsrc_buffer_free(struct io_ring_ctx *ctx, > > + struct io_buf_table *table) > > +{ > > + io_rsrc_data_free(ctx, &table->data); > > + io_alloc_cache_free(&table->node_cache, kfree); > > + io_alloc_cache_free(&table->imu_cache, kfree); > > +} > > + > > int io_sqe_buffers_unregister(struct io_ring_ctx *ctx) > > { > > if (!ctx->buf_table.data.nr) > > return -ENXIO; > > - io_rsrc_data_free(ctx, &ctx->buf_table.data); > > + io_rsrc_buffer_free(ctx, &ctx->buf_table); > > return 0; > > } > > > > @@ -716,6 +767,15 @@ bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, > > return true; > > } > > > > +static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx, > > + int nr_bvecs) > > +{ > > + if (nr_bvecs <= IO_CACHED_BVECS_SEGS) > > + return io_cache_alloc(&ctx->buf_table.imu_cache, GFP_KERNEL); > > If there is no entry available in the cache, this will heap-allocate > one with enough space for all IO_CACHED_BVECS_SEGS bvecs. Consider > using io_alloc_cache_get() instead of io_cache_alloc(), so the > heap-allocated fallback uses the minimal size. > > Also, where are these allocations returned to the imu_cache? Looks > like kvfree(imu) in io_buffer_unmap() and io_sqe_buffer_register() > needs to try io_alloc_cache_put() first. Another issue I see is that io_alloc_cache elements are allocated with kmalloc(), so they can't be freed with kvfree(). When the imu is freed, we could check nr_bvecs <= IO_CACHED_BVECS_SEGS to tell whether to call io_alloc_cache_put() (with a fallback to kfree()) or kvfree(). > > Best, > Caleb > > > + return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs), > > + GFP_KERNEL); > > +} > > + > > static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, > > struct iovec *iov, > > struct page **last_hpage) > > @@ -732,7 +792,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, > > if (!iov->iov_base) > > return NULL; > > > > - node = io_rsrc_node_alloc(IORING_RSRC_BUFFER); > > + node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); > > if (!node) > > return ERR_PTR(-ENOMEM); > > node->buf = NULL; > > @@ -752,7 +812,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, > > coalesced = io_coalesce_buffer(&pages, &nr_pages, &data); > > } > > > > - imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); > > + imu = io_alloc_imu(ctx, nr_pages); > > if (!imu) > > goto done; > > > > @@ -800,9 +860,9 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, > > unsigned int nr_args, u64 __user *tags) > > { > > struct page *last_hpage = NULL; > > - struct io_rsrc_data data; > > struct iovec fast_iov, *iov = &fast_iov; > > const struct iovec __user *uvec; > > + struct io_buf_table table; > > int i, ret; > > > > BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); > > @@ -811,13 +871,14 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, > > return -EBUSY; > > if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) > > return -EINVAL; > > - ret = io_rsrc_data_alloc(&data, nr_args); > > + ret = io_rsrc_buffer_alloc(&table, nr_args); > > if (ret) > > return ret; > > > > if (!arg) > > memset(iov, 0, sizeof(*iov)); > > > > + ctx->buf_table = table; > > for (i = 0; i < nr_args; i++) { > > struct io_rsrc_node *node; > > u64 tag = 0; > > @@ -857,10 +918,8 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, > > } > > node->tag = tag; > > } > > - data.nodes[i] = node; > > + table.data.nodes[i] = node; > > } > > - > > - ctx->buf_table.data = data; > > Is it necessary to move this assignment? I found the existing location > easier to reason about, since the assignment of ctx->buf_table > represents a transfer of ownership from the local variable. > > > if (ret) > > io_sqe_buffers_unregister(ctx); > > return ret; > > @@ -891,14 +950,15 @@ int io_buffer_register_bvec(struct io_ring_ctx *ctx, struct request *rq, > > goto unlock; > > } > > > > - node = io_rsrc_node_alloc(IORING_RSRC_BUFFER); > > + node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); > > if (!node) { > > ret = -ENOMEM; > > goto unlock; > > } > > > > nr_bvecs = blk_rq_nr_phys_segments(rq); > > - imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL); > > + > > + imu = io_alloc_imu(ctx, nr_bvecs); > > if (!imu) { > > kfree(node); > > ret = -ENOMEM; > > @@ -1028,7 +1088,7 @@ static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2) > > static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx, > > struct io_uring_clone_buffers *arg) > > { > > - struct io_rsrc_data data; > > + struct io_buf_table table; > > int i, ret, off, nr; > > unsigned int nbufs; > > > > @@ -1059,7 +1119,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx > > if (check_add_overflow(arg->nr, arg->dst_off, &nbufs)) > > return -EOVERFLOW; > > > > - ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.data.nr)); > > + ret = io_rsrc_buffer_alloc(&table, max(nbufs, ctx->buf_table.data.nr)); > > if (ret) > > return ret; > > > > @@ -1068,7 +1128,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx > > struct io_rsrc_node *src_node = ctx->buf_table.data.nodes[i]; > > > > if (src_node) { > > - data.nodes[i] = src_node; > > + table.data.nodes[i] = src_node; > > src_node->refs++; > > } > > } > > @@ -1098,7 +1158,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx > > if (!src_node) { > > dst_node = NULL; > > } else { > > - dst_node = io_rsrc_node_alloc(IORING_RSRC_BUFFER); > > + dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); > > if (!dst_node) { > > ret = -ENOMEM; > > goto out_free; > > @@ -1107,12 +1167,12 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx > > refcount_inc(&src_node->buf->refs); > > dst_node->buf = src_node->buf; > > } > > - data.nodes[off++] = dst_node; > > + table.data.nodes[off++] = dst_node; > > i++; > > } > > > > /* > > - * If asked for replace, put the old table. data->nodes[] holds both > > + * If asked for replace, put the old table. table.data->nodes[] holds both > > * old and new nodes at this point. > > */ > > if (arg->flags & IORING_REGISTER_DST_REPLACE) > > @@ -1125,10 +1185,10 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx > > * entry). > > */ > > WARN_ON_ONCE(ctx->buf_table.data.nr); > > - ctx->buf_table.data = data; > > + ctx->buf_table = table; > > return 0; > > out_free: > > - io_rsrc_data_free(ctx, &data); > > + io_rsrc_buffer_free(ctx, &table); > > return ret; > > } > > > > diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h > > index 2e8d1862caefc..c5bdac558a2b4 100644 > > --- a/io_uring/rsrc.h > > +++ b/io_uring/rsrc.h > > @@ -47,7 +47,7 @@ struct io_imu_folio_data { > > unsigned int nr_folios; > > }; > > > > -struct io_rsrc_node *io_rsrc_node_alloc(int type); > > +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type); > > void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node); > > void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data); > > int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr); > > -- > > 2.43.5 > > > >