On Tue, Feb 18, 2025 at 2:43 PM Keith Busch <kbusch@xxxxxxxx> wrote:
>
> From: Keith Busch <kbusch@xxxxxxxxxx>
>
> Frequent alloc/free cycles on these are pretty costly. Use an io cache to
> more efficiently reuse these buffers.
>
> Signed-off-by: Keith Busch <kbusch@xxxxxxxxxx>
> ---
>  include/linux/io_uring_types.h |  18 ++---
>  io_uring/filetable.c           |   2 +-
>  io_uring/rsrc.c                | 120 +++++++++++++++++++++++++--------
>  io_uring/rsrc.h                |   2 +-
>  4 files changed, 104 insertions(+), 38 deletions(-)
>
> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index 810d1dccd27b1..bbfb651506522 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -69,8 +69,18 @@ struct io_file_table {
>          unsigned int alloc_hint;
>  };
>
> +struct io_alloc_cache {
> +        void **entries;
> +        unsigned int nr_cached;
> +        unsigned int max_cached;
> +        size_t elem_size;

Is growing this field from unsigned to size_t really necessary? It
probably doesn't make sense to be caching allocations > 4 GB.

> +        unsigned int init_clear;
> +};
> +
>  struct io_buf_table {
>          struct io_rsrc_data data;
> +        struct io_alloc_cache node_cache;
> +        struct io_alloc_cache imu_cache;
>  };
>
>  struct io_hash_bucket {
> @@ -224,14 +234,6 @@ struct io_submit_state {
>          struct blk_plug plug;
>  };
>
> -struct io_alloc_cache {
> -        void **entries;
> -        unsigned int nr_cached;
> -        unsigned int max_cached;
> -        unsigned int elem_size;
> -        unsigned int init_clear;
> -};
> -
>  struct io_ring_ctx {
>          /* const or read-mostly hot data */
>          struct {
> diff --git a/io_uring/filetable.c b/io_uring/filetable.c
> index dd8eeec97acf6..a21660e3145ab 100644
> --- a/io_uring/filetable.c
> +++ b/io_uring/filetable.c
> @@ -68,7 +68,7 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
>          if (slot_index >= ctx->file_table.data.nr)
>                  return -EINVAL;
>
> -        node = io_rsrc_node_alloc(IORING_RSRC_FILE);
> +        node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
>          if (!node)
>                  return -ENOMEM;
>
> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
> index 261b5535f46c6..d5cac3a234316 100644
> --- a/io_uring/rsrc.c
> +++ b/io_uring/rsrc.c
> @@ -32,6 +32,8 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
>  #define IORING_MAX_FIXED_FILES  (1U << 20)
>  #define IORING_MAX_REG_BUFFERS  (1U << 14)
>
> +#define IO_CACHED_BVECS_SEGS    32
> +
>  int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
>  {
>          unsigned long page_limit, cur_pages, new_pages;
> @@ -101,6 +103,22 @@ int io_buffer_validate(struct iovec *iov)
>          return 0;
>  }
>
> +static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
> +                                           int nr_bvecs)

Pass nr_bvecs as unsigned to avoid sign-extension in struct_size_t()?
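
i.e. something like this (untested, just illustrating the signature
change; the body is as in your patch):

        static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
                                                   unsigned int nr_bvecs)
        {
                /* small registrations come from the imu cache */
                if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
                        return io_cache_alloc(&ctx->buf_table.imu_cache, GFP_KERNEL);
                /* unsigned count, so struct_size_t() sees no sign-extension */
                return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs),
                                GFP_KERNEL);
        }
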
> +{
> +        if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
> +                return io_cache_alloc(&ctx->buf_table.imu_cache, GFP_KERNEL);
> +        return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs),
> +                        GFP_KERNEL);
> +}
> +
> +static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
> +{
> +        if (imu->nr_bvecs > IO_CACHED_BVECS_SEGS ||
> +            !io_alloc_cache_put(&ctx->buf_table.imu_cache, imu))
> +                kvfree(imu);
> +}
> +
>  static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
>  {
>          struct io_mapped_ubuf *imu = node->buf;
> @@ -119,22 +137,35 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
>                  io_unaccount_mem(ctx, imu->acct_pages);
>          }
>
> -        kvfree(imu);
> +        io_free_imu(ctx, imu);
>  }
>
> -struct io_rsrc_node *io_rsrc_node_alloc(int type)
> +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
>  {
>          struct io_rsrc_node *node;
>
> -        node = kzalloc(sizeof(*node), GFP_KERNEL);
> +        if (type == IORING_RSRC_FILE)
> +                node = kmalloc(sizeof(*node), GFP_KERNEL);
> +        else
> +                node = io_cache_alloc(&ctx->buf_table.node_cache, GFP_KERNEL);
>          if (node) {
>                  node->type = type;
>                  node->refs = 1;
> +                node->tag = 0;
> +                node->file_ptr = 0;
>          }
>          return node;
>  }
>
> -__cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data)
> +static __cold void __io_rsrc_data_free(struct io_rsrc_data *data)
> +{
> +        kvfree(data->nodes);
> +        data->nodes = NULL;
> +        data->nr = 0;
> +}
> +
> +__cold void io_rsrc_data_free(struct io_ring_ctx *ctx,
> +                              struct io_rsrc_data *data)
>  {
>          if (!data->nr)
>                  return;
> @@ -142,9 +173,7 @@ __cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data)
>                  if (data->nodes[data->nr])
>                          io_put_rsrc_node(ctx, data->nodes[data->nr]);
>          }
> -        kvfree(data->nodes);
> -        data->nodes = NULL;
> -        data->nr = 0;
> +        __io_rsrc_data_free(data);
>  }
>
>  __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
> @@ -158,6 +187,31 @@ __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
>          return -ENOMEM;
>  }
>
> +static __cold int io_rsrc_buffer_alloc(struct io_buf_table *table, unsigned nr)
> +{
> +        const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec,
> +                                                 IO_CACHED_BVECS_SEGS);
> +        const int node_size = sizeof(struct io_rsrc_node);
> +        int ret;
> +
> +        ret = io_rsrc_data_alloc(&table->data, nr);
> +        if (ret)
> +                return ret;
> +
> +        if (io_alloc_cache_init(&table->node_cache, nr, node_size, 0))
> +                goto free_data;
> +
> +        if (io_alloc_cache_init(&table->imu_cache, nr, imu_cache_size, 0))
> +                goto free_cache;
> +
> +        return 0;
> +free_cache:
> +        io_alloc_cache_free(&table->node_cache, kfree);
> +free_data:
> +        __io_rsrc_data_free(&table->data);
> +        return -ENOMEM;
> +}
> +
>  static int __io_sqe_files_update(struct io_ring_ctx *ctx,
>                                   struct io_uring_rsrc_update2 *up,
>                                   unsigned nr_args)
> @@ -207,7 +261,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
>                          err = -EBADF;
>                          break;
>                  }
> -                node = io_rsrc_node_alloc(IORING_RSRC_FILE);
> +                node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
>                  if (!node) {
>                          err = -ENOMEM;
>                          fput(file);
> @@ -459,6 +513,8 @@ void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
>          case IORING_RSRC_BUFFER:
>                  if (node->buf)
>                          io_buffer_unmap(ctx, node);
> +                if (io_alloc_cache_put(&ctx->buf_table.node_cache, node))
> +                        return;
>                  break;
>          default:
>                  WARN_ON_ONCE(1);
> @@ -527,7 +583,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
>                          goto fail;
>                  }
>                  ret = -ENOMEM;
> -                node = io_rsrc_node_alloc(IORING_RSRC_FILE);
> +                node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
>                  if (!node) {
>                          fput(file);
>                          goto fail;
> @@ -547,11 +603,19 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
>          return ret;
>  }
>
> +static void io_rsrc_buffer_free(struct io_ring_ctx *ctx,
> +                                struct io_buf_table *table)
> +{
> +        io_rsrc_data_free(ctx, &table->data);
> +        io_alloc_cache_free(&table->node_cache, kfree);
> +        io_alloc_cache_free(&table->imu_cache, kfree);
> +}
> +
>  int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
>  {
>          if (!ctx->buf_table.data.nr)
>                  return -ENXIO;
> -        io_rsrc_data_free(ctx, &ctx->buf_table.data);
> +        io_rsrc_buffer_free(ctx, &ctx->buf_table);
>          return 0;
>  }
>
> @@ -732,7 +796,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
>          if (!iov->iov_base)
>                  return NULL;
>
> -        node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
> +        node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
>          if (!node)
>                  return ERR_PTR(-ENOMEM);
>          node->buf = NULL;
> @@ -752,7 +816,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
>                  coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
>          }
>
> -        imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
> +        imu = io_alloc_imu(ctx, nr_pages);
>          if (!imu)
>                  goto done;
>
> @@ -789,7 +853,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
>          }
>  done:
>          if (ret) {
> -                kvfree(imu);
> +                io_free_imu(ctx, imu);
>                  if (node)
>                          io_put_rsrc_node(ctx, node);
>                  node = ERR_PTR(ret);
> @@ -802,9 +866,9 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
>                             unsigned int nr_args, u64 __user *tags)
>  {
>          struct page *last_hpage = NULL;
> -        struct io_rsrc_data data;
>          struct iovec fast_iov, *iov = &fast_iov;
>          const struct iovec __user *uvec;
> +        struct io_buf_table table;
>          int i, ret;
>
>          BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
> @@ -813,13 +877,14 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
>                  return -EBUSY;
>          if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
>                  return -EINVAL;
> -        ret = io_rsrc_data_alloc(&data, nr_args);
> +        ret = io_rsrc_buffer_alloc(&table, nr_args);
>          if (ret)
>                  return ret;
>
>          if (!arg)
>                  memset(iov, 0, sizeof(*iov));
>
> +        ctx->buf_table = table;
>          for (i = 0; i < nr_args; i++) {
>                  struct io_rsrc_node *node;
>                  u64 tag = 0;
> @@ -859,10 +924,8 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
>                          }
>                          node->tag = tag;
>                  }
> -                data.nodes[i] = node;
> +                table.data.nodes[i] = node;
>          }
> -
> -        ctx->buf_table.data = data;

Still don't see the need to move this assignment. Is there a reason you
prefer setting ctx->buf_table before initializing its nodes? I find the
existing code easier to follow, where the table is moved to
ctx->buf_table after filling it in. It's also consistent with
io_clone_buffers().
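
For illustration, the shape I have in mind (untested sketch; it glosses
over the node/imu caches living in the local table until the final
move):

        for (i = 0; i < nr_args; i++) {
                struct io_rsrc_node *node;

                /* ... per-buffer setup exactly as in your patch ... */

                table.data.nodes[i] = node;
        }

        /* publish the filled-in table in one step, as the old code did */
        ctx->buf_table = table;
        if (ret)
                io_sqe_buffers_unregister(ctx);
        return ret;
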
>          if (ret)
>                  io_sqe_buffers_unregister(ctx);
>          return ret;
> @@ -887,14 +950,15 @@ int io_buffer_register_bvec(struct io_ring_ctx *ctx, struct request *rq,
>                  goto unlock;
>          }
>
> -        node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
> +        node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
>          if (!node) {
>                  ret = -ENOMEM;
>                  goto unlock;
>          }
>
>          nr_bvecs = blk_rq_nr_phys_segments(rq);
> -        imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL);
> +
> +        imu = io_alloc_imu(ctx, nr_bvecs);

nit: probably don't need to add a blank line here

Best,
Caleb

>          if (!imu) {
>                  kfree(node);
>                  ret = -ENOMEM;
> @@ -1031,7 +1095,7 @@ static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
>  static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
>                              struct io_uring_clone_buffers *arg)
>  {
> -        struct io_rsrc_data data;
> +        struct io_buf_table table;
>          int i, ret, off, nr;
>          unsigned int nbufs;
>
> @@ -1062,7 +1126,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
>          if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
>                  return -EOVERFLOW;
>
> -        ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.data.nr));
> +        ret = io_rsrc_buffer_alloc(&table, max(nbufs, ctx->buf_table.data.nr));
>          if (ret)
>                  return ret;
>
> @@ -1071,7 +1135,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
>                  struct io_rsrc_node *src_node = ctx->buf_table.data.nodes[i];
>
>                  if (src_node) {
> -                        data.nodes[i] = src_node;
> +                        table.data.nodes[i] = src_node;
>                          src_node->refs++;
>                  }
>          }
> @@ -1101,7 +1165,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
>                  if (!src_node) {
>                          dst_node = NULL;
>                  } else {
> -                        dst_node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
> +                        dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
>                          if (!dst_node) {
>                                  ret = -ENOMEM;
>                                  goto out_free;
> @@ -1110,12 +1174,12 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
>                          refcount_inc(&src_node->buf->refs);
>                          dst_node->buf = src_node->buf;
>                  }
> -                data.nodes[off++] = dst_node;
> +                table.data.nodes[off++] = dst_node;
>                  i++;
>          }
>
>          /*
> -         * If asked for replace, put the old table. data->nodes[] holds both
> +         * If asked for replace, put the old table. table.data->nodes[] holds both
>           * old and new nodes at this point.
>           */
>          if (arg->flags & IORING_REGISTER_DST_REPLACE)
> @@ -1128,10 +1192,10 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
>           * entry).
>           */
>          WARN_ON_ONCE(ctx->buf_table.data.nr);
> -        ctx->buf_table.data = data;
> +        ctx->buf_table = table;
>          return 0;
>  out_free:
> -        io_rsrc_data_free(ctx, &data);
> +        io_rsrc_buffer_free(ctx, &table);
>          return ret;
>  }
>
> diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
> index 81c31b93d4f7b..69cc212ad0c7c 100644
> --- a/io_uring/rsrc.h
> +++ b/io_uring/rsrc.h
> @@ -49,7 +49,7 @@ struct io_imu_folio_data {
>          unsigned int nr_folios;
>  };
>
> -struct io_rsrc_node *io_rsrc_node_alloc(int type);
> +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type);
>  void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node);
>  void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data);
>  int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr);
> --
> 2.43.5
>