Re: [PATCHv3 5/5] io_uring: cache nodes and mapped buffers

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Fri, Feb 14, 2025 at 7:46 AM Keith Busch <kbusch@xxxxxxxx> wrote:
>
> From: Keith Busch <kbusch@xxxxxxxxxx>
>
> Frequent alloc/free cycles on these is pretty costly. Use an io cache to
> more efficiently reuse these buffers.
>
> Signed-off-by: Keith Busch <kbusch@xxxxxxxxxx>
> ---
>  include/linux/io_uring_types.h |  18 +++---
>  io_uring/filetable.c           |   2 +-
>  io_uring/rsrc.c                | 114 +++++++++++++++++++++++++--------
>  io_uring/rsrc.h                |   2 +-
>  4 files changed, 99 insertions(+), 37 deletions(-)
>
> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index d8d717cce427f..ebaaa1c7e210f 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -67,8 +67,18 @@ struct io_file_table {
>         unsigned int alloc_hint;
>  };
>
> +struct io_alloc_cache {
> +       void                    **entries;
> +       unsigned int            nr_cached;
> +       unsigned int            max_cached;
> +       size_t                  elem_size;
> +       unsigned int            init_clear;
> +};
> +
>  struct io_buf_table {
>         struct io_rsrc_data     data;
> +       struct io_alloc_cache   node_cache;
> +       struct io_alloc_cache   imu_cache;
>  };
>
>  struct io_hash_bucket {
> @@ -222,14 +232,6 @@ struct io_submit_state {
>         struct blk_plug         plug;
>  };
>
> -struct io_alloc_cache {
> -       void                    **entries;
> -       unsigned int            nr_cached;
> -       unsigned int            max_cached;
> -       unsigned int            elem_size;
> -       unsigned int            init_clear;
> -};
> -
>  struct io_ring_ctx {
>         /* const or read-mostly hot data */
>         struct {
> diff --git a/io_uring/filetable.c b/io_uring/filetable.c
> index dd8eeec97acf6..a21660e3145ab 100644
> --- a/io_uring/filetable.c
> +++ b/io_uring/filetable.c
> @@ -68,7 +68,7 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
>         if (slot_index >= ctx->file_table.data.nr)
>                 return -EINVAL;
>
> -       node = io_rsrc_node_alloc(IORING_RSRC_FILE);
> +       node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
>         if (!node)
>                 return -ENOMEM;
>
> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
> index fd7a1b04db8b7..26ff9b5851d94 100644
> --- a/io_uring/rsrc.c
> +++ b/io_uring/rsrc.c
> @@ -32,6 +32,8 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
>  #define IORING_MAX_FIXED_FILES (1U << 20)
>  #define IORING_MAX_REG_BUFFERS (1U << 14)
>
> +#define IO_CACHED_BVECS_SEGS   32
> +
>  int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
>  {
>         unsigned long page_limit, cur_pages, new_pages;
> @@ -122,19 +124,33 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
>         kvfree(imu);
>  }
>
> -struct io_rsrc_node *io_rsrc_node_alloc(int type)
> +
> +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)

nit: extra blank line added here

>  {
>         struct io_rsrc_node *node;
>
> -       node = kzalloc(sizeof(*node), GFP_KERNEL);
> +       if (type == IORING_RSRC_FILE)
> +               node = kmalloc(sizeof(*node), GFP_KERNEL);
> +       else
> +               node = io_cache_alloc(&ctx->buf_table.node_cache, GFP_KERNEL);
>         if (node) {
>                 node->type = type;
>                 node->refs = 1;
> +               node->tag = 0;
> +               node->file_ptr = 0;
>         }
>         return node;
>  }
>
> -__cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data)
> +static __cold void __io_rsrc_data_free(struct io_rsrc_data *data)
> +{
> +       kvfree(data->nodes);
> +       data->nodes = NULL;
> +       data->nr = 0;
> +}
> +
> +__cold void io_rsrc_data_free(struct io_ring_ctx *ctx,
> +                             struct io_rsrc_data *data)
>  {
>         if (!data->nr)
>                 return;
> @@ -142,9 +158,7 @@ __cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data
>                 if (data->nodes[data->nr])
>                         io_put_rsrc_node(ctx, data->nodes[data->nr]);
>         }
> -       kvfree(data->nodes);
> -       data->nodes = NULL;
> -       data->nr = 0;
> +       __io_rsrc_data_free(data);
>  }
>
>  __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
> @@ -158,6 +172,33 @@ __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
>         return -ENOMEM;
>  }
>
> +static __cold int io_rsrc_buffer_alloc(struct io_buf_table *table, unsigned nr)
> +{
> +       const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec,
> +                                                IO_CACHED_BVECS_SEGS);
> +       int ret;
> +
> +       ret = io_rsrc_data_alloc(&table->data, nr);
> +       if (ret)
> +               return ret;
> +
> +       ret = io_alloc_cache_init(&table->node_cache, nr,
> +                                 sizeof(struct io_rsrc_node), 0);
> +       if (ret)
> +               goto out_1;
> +
> +       ret = io_alloc_cache_init(&table->imu_cache, nr, imu_cache_size, 0);
> +       if (ret)
> +               goto out_2;

io_alloc_cache_init() returns bool. Probably these cases should return
-ENOMEM instead of 1?

> +
> +       return 0;
> +out_2:
> +       io_alloc_cache_free(&table->node_cache, kfree);
> +out_1:
> +       __io_rsrc_data_free(&table->data);
> +       return ret;
> +}
> +
>  static int __io_sqe_files_update(struct io_ring_ctx *ctx,
>                                  struct io_uring_rsrc_update2 *up,
>                                  unsigned nr_args)
> @@ -207,7 +248,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
>                                 err = -EBADF;
>                                 break;
>                         }
> -                       node = io_rsrc_node_alloc(IORING_RSRC_FILE);
> +                       node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
>                         if (!node) {
>                                 err = -ENOMEM;
>                                 fput(file);
> @@ -269,7 +310,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
>                         }
>                         node->tag = tag;
>                 }
> -               i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
> +               i = array_index_nospec(up->offset + done, ctx->buf_table.data.nr);

Looks like this change belongs in the prior patch "io_uring: add
abstraction for buf_table rsrc data"?

>                 io_reset_rsrc_node(ctx, &ctx->buf_table.data, i);
>                 ctx->buf_table.data.nodes[i] = node;
>                 if (ctx->compat)
> @@ -459,6 +500,8 @@ void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
>         case IORING_RSRC_BUFFER:
>                 if (node->buf)
>                         io_buffer_unmap(ctx, node);
> +               if (io_alloc_cache_put(&ctx->buf_table.node_cache, node))
> +                       return;
>                 break;
>         default:
>                 WARN_ON_ONCE(1);
> @@ -527,7 +570,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
>                         goto fail;
>                 }
>                 ret = -ENOMEM;
> -               node = io_rsrc_node_alloc(IORING_RSRC_FILE);
> +               node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
>                 if (!node) {
>                         fput(file);
>                         goto fail;
> @@ -547,11 +590,19 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
>         return ret;
>  }
>
> +static void io_rsrc_buffer_free(struct io_ring_ctx *ctx,
> +                               struct io_buf_table *table)
> +{
> +       io_rsrc_data_free(ctx, &table->data);
> +       io_alloc_cache_free(&table->node_cache, kfree);
> +       io_alloc_cache_free(&table->imu_cache, kfree);
> +}
> +
>  int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
>  {
>         if (!ctx->buf_table.data.nr)
>                 return -ENXIO;
> -       io_rsrc_data_free(ctx, &ctx->buf_table.data);
> +       io_rsrc_buffer_free(ctx, &ctx->buf_table);
>         return 0;
>  }
>
> @@ -716,6 +767,15 @@ bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
>         return true;
>  }
>
> +static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
> +                                          int nr_bvecs)
> +{
> +       if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
> +               return io_cache_alloc(&ctx->buf_table.imu_cache, GFP_KERNEL);

If there is no entry available in the cache, this will heap-allocate
one with enough space for all IO_CACHED_BVECS_SEGS bvecs. Consider
using io_alloc_cache_get() instead of io_cache_alloc(), so the
heap-allocated fallback uses the minimal size.

Also, where are these allocations returned to the imu_cache? Looks
like kvfree(imu) in io_buffer_unmap() and io_sqe_buffer_register()
needs to try io_alloc_cache_put() first.

Best,
Caleb

> +       return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs),
> +                       GFP_KERNEL);
> +}
> +
>  static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
>                                                    struct iovec *iov,
>                                                    struct page **last_hpage)
> @@ -732,7 +792,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
>         if (!iov->iov_base)
>                 return NULL;
>
> -       node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
> +       node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
>         if (!node)
>                 return ERR_PTR(-ENOMEM);
>         node->buf = NULL;
> @@ -752,7 +812,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
>                         coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
>         }
>
> -       imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
> +       imu = io_alloc_imu(ctx, nr_pages);
>         if (!imu)
>                 goto done;
>
> @@ -800,9 +860,9 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
>                             unsigned int nr_args, u64 __user *tags)
>  {
>         struct page *last_hpage = NULL;
> -       struct io_rsrc_data data;
>         struct iovec fast_iov, *iov = &fast_iov;
>         const struct iovec __user *uvec;
> +       struct io_buf_table table;
>         int i, ret;
>
>         BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
> @@ -811,13 +871,14 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
>                 return -EBUSY;
>         if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
>                 return -EINVAL;
> -       ret = io_rsrc_data_alloc(&data, nr_args);
> +       ret = io_rsrc_buffer_alloc(&table, nr_args);
>         if (ret)
>                 return ret;
>
>         if (!arg)
>                 memset(iov, 0, sizeof(*iov));
>
> +       ctx->buf_table = table;
>         for (i = 0; i < nr_args; i++) {
>                 struct io_rsrc_node *node;
>                 u64 tag = 0;
> @@ -857,10 +918,8 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
>                         }
>                         node->tag = tag;
>                 }
> -               data.nodes[i] = node;
> +               table.data.nodes[i] = node;
>         }
> -
> -       ctx->buf_table.data = data;

Is it necessary to move this assignment? I found the existing location
easier to reason about, since the assignment of ctx->buf_table
represents a transfer of ownership from the local variable.

>         if (ret)
>                 io_sqe_buffers_unregister(ctx);
>         return ret;
> @@ -891,14 +950,15 @@ int io_buffer_register_bvec(struct io_ring_ctx *ctx, struct request *rq,
>                 goto unlock;
>         }
>
> -       node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
> +       node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
>         if (!node) {
>                 ret = -ENOMEM;
>                 goto unlock;
>         }
>
>         nr_bvecs = blk_rq_nr_phys_segments(rq);
> -       imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL);
> +
> +       imu = io_alloc_imu(ctx, nr_bvecs);
>         if (!imu) {
>                 kfree(node);
>                 ret = -ENOMEM;
> @@ -1028,7 +1088,7 @@ static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
>  static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
>                             struct io_uring_clone_buffers *arg)
>  {
> -       struct io_rsrc_data data;
> +       struct io_buf_table table;
>         int i, ret, off, nr;
>         unsigned int nbufs;
>
> @@ -1059,7 +1119,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
>         if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
>                 return -EOVERFLOW;
>
> -       ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.data.nr));
> +       ret = io_rsrc_buffer_alloc(&table, max(nbufs, ctx->buf_table.data.nr));
>         if (ret)
>                 return ret;
>
> @@ -1068,7 +1128,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
>                 struct io_rsrc_node *src_node = ctx->buf_table.data.nodes[i];
>
>                 if (src_node) {
> -                       data.nodes[i] = src_node;
> +                       table.data.nodes[i] = src_node;
>                         src_node->refs++;
>                 }
>         }
> @@ -1098,7 +1158,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
>                 if (!src_node) {
>                         dst_node = NULL;
>                 } else {
> -                       dst_node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
> +                       dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
>                         if (!dst_node) {
>                                 ret = -ENOMEM;
>                                 goto out_free;
> @@ -1107,12 +1167,12 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
>                         refcount_inc(&src_node->buf->refs);
>                         dst_node->buf = src_node->buf;
>                 }
> -               data.nodes[off++] = dst_node;
> +               table.data.nodes[off++] = dst_node;
>                 i++;
>         }
>
>         /*
> -        * If asked for replace, put the old table. data->nodes[] holds both
> +        * If asked for replace, put the old table. table.data->nodes[] holds both
>          * old and new nodes at this point.
>          */
>         if (arg->flags & IORING_REGISTER_DST_REPLACE)
> @@ -1125,10 +1185,10 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
>          * entry).
>          */
>         WARN_ON_ONCE(ctx->buf_table.data.nr);
> -       ctx->buf_table.data = data;
> +       ctx->buf_table = table;
>         return 0;
>  out_free:
> -       io_rsrc_data_free(ctx, &data);
> +       io_rsrc_buffer_free(ctx, &table);
>         return ret;
>  }
>
> diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
> index 2e8d1862caefc..c5bdac558a2b4 100644
> --- a/io_uring/rsrc.h
> +++ b/io_uring/rsrc.h
> @@ -47,7 +47,7 @@ struct io_imu_folio_data {
>         unsigned int    nr_folios;
>  };
>
> -struct io_rsrc_node *io_rsrc_node_alloc(int type);
> +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type);
>  void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node);
>  void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data);
>  int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr);
> --
> 2.43.5
>
>





[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux