On Thu, Mar 21, 2024 at 4:28 AM Jens Axboe <axboe@xxxxxxxxx> wrote:
>
> read/write requests try to put everything on the stack, and then alloc
> and copy if we need to retry. This necessitates a bunch of nasty code
> that deals with intermediate state.
>
> Get rid of this, and have the prep side setup everything we need
> upfront, which greatly simplifies the opcode handlers.
>
> This includes adding an alloc cache for io_async_rw, to make it cheap
> to handle.
>
> In terms of cost, this should be basically free and transparent. For
> the worst case of {READ,WRITE}_FIXED which didn't need it before,
> performance is unaffected in the normal peak workload that is being
> used to test that. Still runs at 122M IOPS.
>
> Signed-off-by: Jens Axboe <axboe@xxxxxxxxx>
> ---
>  include/linux/io_uring_types.h |   1 +
>  io_uring/io_uring.c            |   3 +
>  io_uring/opdef.c               |  15 +-
>  io_uring/rw.c                  | 538 ++++++++++++++++-----------------
>  io_uring/rw.h                  |  19 +-
>  5 files changed, 278 insertions(+), 298 deletions(-)
>
> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index f37caff64d05..2ba8676f83cc 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -300,6 +300,7 @@ struct io_ring_ctx {
>  	struct io_hash_table cancel_table_locked;
>  	struct io_alloc_cache apoll_cache;
>  	struct io_alloc_cache netmsg_cache;
> +	struct io_alloc_cache rw_cache;
>
>  	/*
>  	 * Any cancelable uring_cmd is added to this list in
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index ff0e233ce3c9..cc8ce830ff4b 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -308,6 +308,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
>  			    sizeof(struct async_poll));
>  	io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX,
>  			    sizeof(struct io_async_msghdr));
> +	io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX,
> +			    sizeof(struct io_async_rw));
>  	io_futex_cache_init(ctx);
>  	init_completion(&ctx->ref_comp);
>  	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
> @@ -2898,6 +2900,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
>  	io_eventfd_unregister(ctx);
>  	io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free);
>  	io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
> +	io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
>  	io_futex_cache_free(ctx);
>  	io_destroy_buffers(ctx);
>  	mutex_unlock(&ctx->uring_lock);
> diff --git a/io_uring/opdef.c b/io_uring/opdef.c
> index dd4a1e1425e1..fcae75a08f2c 100644
> --- a/io_uring/opdef.c
> +++ b/io_uring/opdef.c
> @@ -67,7 +67,7 @@ const struct io_issue_def io_issue_defs[] = {
>  		.iopoll = 1,
>  		.iopoll_queue = 1,
>  		.vectored = 1,
> -		.prep = io_prep_rwv,
> +		.prep = io_prep_readv,
>  		.issue = io_read,
>  	},
>  	[IORING_OP_WRITEV] = {
> @@ -81,7 +81,7 @@ const struct io_issue_def io_issue_defs[] = {
>  		.iopoll = 1,
>  		.iopoll_queue = 1,
>  		.vectored = 1,
> -		.prep = io_prep_rwv,
> +		.prep = io_prep_writev,
>  		.issue = io_write,
>  	},
>  	[IORING_OP_FSYNC] = {
> @@ -99,7 +99,7 @@ const struct io_issue_def io_issue_defs[] = {
>  		.ioprio = 1,
>  		.iopoll = 1,
>  		.iopoll_queue = 1,
> -		.prep = io_prep_rw_fixed,
> +		.prep = io_prep_read_fixed,
>  		.issue = io_read,
>  	},
>  	[IORING_OP_WRITE_FIXED] = {
> @@ -112,7 +112,7 @@ const struct io_issue_def io_issue_defs[] = {
>  		.ioprio = 1,
>  		.iopoll = 1,
>  		.iopoll_queue = 1,
> -		.prep = io_prep_rw_fixed,
> +		.prep = io_prep_write_fixed,
>  		.issue = io_write,
>  	},
>  	[IORING_OP_POLL_ADD] = {
> @@ -239,7 +239,7 @@ const struct io_issue_def io_issue_defs[] = {
>  		.ioprio = 1,
>  		.iopoll = 1,
>  		.iopoll_queue = 1,
> -		.prep = io_prep_rw,
> +		.prep = io_prep_read,
>  		.issue = io_read,
>  	},
>  	[IORING_OP_WRITE] = {
> @@ -252,7 +252,7 @@ const struct io_issue_def io_issue_defs[] = {
>  		.ioprio = 1,
>  		.iopoll = 1,
>  		.iopoll_queue = 1,
> -		.prep = io_prep_rw,
> +		.prep = io_prep_write,
>  		.issue = io_write,
>  	},
>  	[IORING_OP_FADVISE] = {
> @@ -490,14 +490,12 @@ const struct io_cold_def io_cold_defs[] = {
>  	[IORING_OP_READV] = {
>  		.async_size = sizeof(struct io_async_rw),
>  		.name = "READV",
> -		.prep_async = io_readv_prep_async,
>  		.cleanup = io_readv_writev_cleanup,
>  		.fail = io_rw_fail,
>  	},
>  	[IORING_OP_WRITEV] = {
>  		.async_size = sizeof(struct io_async_rw),
>  		.name = "WRITEV",
> -		.prep_async = io_writev_prep_async,
>  		.cleanup = io_readv_writev_cleanup,
>  		.fail = io_rw_fail,
>  	},
> @@ -699,6 +697,7 @@ const struct io_cold_def io_cold_defs[] = {
>  #endif
>  	},
>  	[IORING_OP_READ_MULTISHOT] = {
> +		.async_size = sizeof(struct io_async_rw),
>  		.name = "READ_MULTISHOT",
>  	},
>  	[IORING_OP_WAITID] = {
> diff --git a/io_uring/rw.c b/io_uring/rw.c
> index 35216e8adc29..583fe61a0acb 100644
> --- a/io_uring/rw.c
> +++ b/io_uring/rw.c
> @@ -75,7 +75,153 @@ static int io_iov_buffer_select_prep(struct io_kiocb *req)
>  	return 0;
>  }
>
> -int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> +static int __io_import_iovec(int ddir, struct io_kiocb *req,
> +			     struct io_async_rw *io,
> +			     unsigned int issue_flags)
> +{
> +	const struct io_issue_def *def = &io_issue_defs[req->opcode];
> +	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
> +	void __user *buf;
> +	size_t sqe_len;
> +
> +	buf = u64_to_user_ptr(rw->addr);
> +	sqe_len = rw->len;
> +
> +	if (!def->vectored || req->flags & REQ_F_BUFFER_SELECT) {
> +		if (io_do_buffer_select(req)) {
> +			buf = io_buffer_select(req, &sqe_len, issue_flags);
> +			if (!buf)
> +				return -ENOBUFS;
> +			rw->addr = (unsigned long) buf;
> +			rw->len = sqe_len;
> +		}
> +
> +		return import_ubuf(ddir, buf, sqe_len, &io->s.iter);
> +	}
> +
> +	io->free_iovec = io->s.fast_iov;
> +	return __import_iovec(ddir, buf, sqe_len, UIO_FASTIOV, &io->free_iovec,
> +				&io->s.iter, req->ctx->compat);
> +}
> +
> +static inline int io_import_iovec(int rw, struct io_kiocb *req,
> +				  struct io_async_rw *io,
> +				  unsigned int issue_flags)
> +{
> +	int ret;
> +
> +	ret = __io_import_iovec(rw, req, io, issue_flags);
> +	if (unlikely(ret < 0))
> +		return ret;
> +
> +	iov_iter_save_state(&io->s.iter, &io->s.iter_state);
> +	return 0;
> +}
> +
> +static void io_rw_iovec_free(struct io_async_rw *rw)
> +{
> +	if (rw->free_iovec) {
> +		kfree(rw->free_iovec);
> +		rw->free_iovec = NULL;
> +	}
> +}
> +
> +static void io_rw_recycle(struct io_kiocb *req, unsigned int issue_flags)
> +{
> +	struct io_async_rw *rw = req->async_data;
> +
> +	if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
> +		io_rw_iovec_free(rw);
> +		return;
> +	}
> +	if (io_alloc_cache_put(&req->ctx->rw_cache, &rw->cache)) {
> +		req->async_data = NULL;
> +		req->flags &= ~REQ_F_ASYNC_DATA;
> +	}
> +}
> +
> +static void io_req_rw_cleanup(struct io_kiocb *req, unsigned int issue_flags)
> +{
> +	/*
> +	 * Disable quick recycling for anything that's gone through io-wq.
> +	 * In theory, this should be fine to cleanup. However, some read or
> +	 * write iter handling touches the iovec AFTER having called into the
> +	 * handler, eg to reexpand or revert. This means we can have:
> +	 *
> +	 * task			io-wq
> +	 *   issue
> +	 *     punt to io-wq
> +	 *			issue
> +	 *			  blkdev_write_iter()
> +	 *			    ->ki_complete()
> +	 *			      io_complete_rw()
> +	 *			        queue tw complete
> +	 *  run tw
> +	 *    req_rw_cleanup
> +	 *			iov_iter_count() <- look at iov_iter again
> +	 *
> +	 * which can lead to a UAF. This is only possible for io-wq offload
> +	 * as the cleanup can run in parallel. As io-wq is not the fast path,
> +	 * just leave cleanup to the end.
> +	 *
> +	 * This is really a bug in the core code that does this, any issue
> +	 * path should assume that a successful (or -EIOCBQUEUED) return can
> +	 * mean that the underlying data can be gone at any time. But that
> +	 * should be fixed seperately, and then this check could be killed.
> +	 */
> +	if (!(req->flags & REQ_F_REFCOUNT)) {
> +		req->flags &= ~REQ_F_NEED_CLEANUP;
> +		io_rw_recycle(req, issue_flags);
> +	}
> +}
> +
> +static int io_rw_alloc_async(struct io_kiocb *req)
> +{
> +	struct io_ring_ctx *ctx = req->ctx;
> +	struct io_cache_entry *entry;
> +	struct io_async_rw *rw;
> +
> +	entry = io_alloc_cache_get(&ctx->rw_cache);
> +	if (entry) {
> +		rw = container_of(entry, struct io_async_rw, cache);
> +		req->flags |= REQ_F_ASYNC_DATA;
> +		req->async_data = rw;
> +		goto done;
> +	}
> +
> +	if (!io_alloc_async_data(req)) {
> +		rw = req->async_data;
> +done:
> +		rw->free_iovec = NULL;
> +		rw->bytes_done = 0;
> +		return 0;
> +	}
> +
> +	return -ENOMEM;
> +}
> +
> +static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import)
> +{
> +	struct io_async_rw *rw;
> +	int ret;
> +
> +	if (io_rw_alloc_async(req))
> +		return -ENOMEM;
> +
> +	if (!do_import || io_do_buffer_select(req))
> +		return 0;
> +
> +	rw = req->async_data;
> +	ret = io_import_iovec(ddir, req, rw, 0);
> +	if (unlikely(ret < 0))
> +		return ret;
> +
> +	iov_iter_save_state(&rw->s.iter, &rw->s.iter_state);

It seems that the iov_iter state is already saved inside io_import_iovec itself. Do we need to save it again here?

--
Anuj Gupta
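P.S. To make the question concrete, here are the two call sites condensed from the patch above (just an excerpt for reference, not new code, with elided parts marked by "..."). The second iov_iter_save_state() looks redundant, since io_import_iovec() already snapshots the iterator on successful import:

static inline int io_import_iovec(int rw, struct io_kiocb *req,
				  struct io_async_rw *io,
				  unsigned int issue_flags)
{
	int ret;

	ret = __io_import_iovec(rw, req, io, issue_flags);
	if (unlikely(ret < 0))
		return ret;

	/* iterator state is saved here after a successful import */
	iov_iter_save_state(&io->s.iter, &io->s.iter_state);
	return 0;
}

static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import)
{
	...
	ret = io_import_iovec(ddir, req, rw, 0);
	if (unlikely(ret < 0))
		return ret;

	/* second save of the same iterator state, right after the call above */
	iov_iter_save_state(&rw->s.iter, &rw->s.iter_state);
	...
}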