__io_submit_sqe() checks the result of a request for -EAGAIN in polled
mode, without first ensuring that the request has completed. The request
will be immediately resubmitted in io_sq_wq_submit_work(), potentially
before the first submission has completed. This creates a race where the
original completion may set REQ_F_IOPOLL_COMPLETED in a freed submission
request.

Move a submitted request to the poll list unconditionally in polled mode.
The request can then be retried if necessary once the original submission
has indeed completed.

Signed-off-by: Bijan Mottahedeh <bijan.mottahedeh@xxxxxxxxxx>
Reviewed-by: Dan Carpenter <dan.carpenter@xxxxxxxxxx>
---
 fs/io_uring.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 54 insertions(+), 9 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index acb213c..9e2eef5 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -328,6 +328,7 @@ struct io_kiocb {
 #define REQ_F_TIMEOUT		1024	/* timeout request */
 #define REQ_F_ISREG		2048	/* regular file */
 #define REQ_F_MUST_PUNT	4096	/* must be punted even for NONBLOCK */
+#define REQ_F_SQE_ALLOC	8192	/* dynamically allocated sqe */
 	u64		user_data;
 	u32		result;
 	u32		sequence;
@@ -657,6 +658,16 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
 {
 	if (*nr) {
+		int i;
+		struct io_kiocb *req;
+
+		for (i = 0; i < *nr; i++) {
+			req = reqs[i];
+			if (req->flags & REQ_F_SQE_ALLOC) {
+				kfree(req->submit.sqe);
+				req->submit.sqe = NULL;
+			}
+		}
 		kmem_cache_free_bulk(req_cachep, *nr, reqs);
 		percpu_ref_put_many(&ctx->refs, *nr);
 		*nr = 0;
@@ -668,6 +679,10 @@ static void __io_free_req(struct io_kiocb *req)
 	if (req->file && !(req->flags & REQ_F_FIXED_FILE))
 		fput(req->file);
 	percpu_ref_put(&req->ctx->refs);
+	if (req->flags & REQ_F_SQE_ALLOC) {
+		kfree(req->submit.sqe);
+		req->submit.sqe = NULL;
+	}
 	kmem_cache_free(req_cachep, req);
 }
 
@@ -789,6 +804,29 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 		req = list_first_entry(done, struct io_kiocb, list);
 		list_del(&req->list);
 
+		/*
+		 * If a retry is needed, reset the request and queue
+		 * it for async submission.
+		 */
+		if (req->result == -EAGAIN) {
+			int ret;
+
+			refcount_set(&req->refs, 2);
+			req->flags &= ~REQ_F_IOPOLL_COMPLETED;
+			req->result = 0;
+
+			ret = io_queue_async(ctx, req, &req->submit);
+			if (!ret)
+				continue;
+
+			/*
+			 * The async submission failed. Mark the
+			 * request as complete.
+			 */
+			refcount_set(&req->refs, 1);
+			req->flags |= REQ_F_IOPOLL_COMPLETED;
+		}
+
 		io_cqring_fill_event(ctx, req->user_data, req->result);
 		(*nr_events)++;
 
@@ -835,9 +873,14 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
 		 * Move completed entries to our local list. If we find a
 		 * request that requires polling, break out and complete
 		 * the done list first, if we have entries there.
+		 * Make sure the completed entries have been properly
+		 * reference counted before moving them. This accounts
+		 * for a race where a request can complete before its
+		 * reference count is decremented after the submission.
		 */
 		if (req->flags & REQ_F_IOPOLL_COMPLETED) {
-			list_move_tail(&req->list, &done);
+			if (refcount_read(&req->refs) < 2)
+				list_move_tail(&req->list, &done);
 			continue;
 		}
 		if (!list_empty(&done))
@@ -1007,8 +1050,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
 	if ((req->flags & REQ_F_LINK) && res != req->result)
 		req->flags |= REQ_F_FAIL_LINK;
 	req->result = res;
-	if (res != -EAGAIN)
-		req->flags |= REQ_F_IOPOLL_COMPLETED;
+	req->flags |= REQ_F_IOPOLL_COMPLETED;
 }
 
 /*
@@ -2116,6 +2158,7 @@ static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req,
 
 	memcpy(sqe_copy, sqe, sizeof(*sqe_copy));
 	req->submit.sqe = sqe_copy;
+	req->flags |= REQ_F_SQE_ALLOC;
 
 	INIT_WORK(&req->work, io_sq_wq_submit_work);
 	trace_io_uring_defer(ctx, req, false);
@@ -2189,9 +2232,12 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 		return ret;
 
 	if (ctx->flags & IORING_SETUP_IOPOLL) {
-		if (req->result == -EAGAIN)
-			return -EAGAIN;
-
+		/*
+		 * Add the request to the poll list unconditionally.
+		 * The request may be resubmitted in case of EAGAIN
+		 * during completion processing.
+		 */
+		memcpy(&req->submit, s, sizeof(*s));
 		/* workqueue context doesn't hold uring_lock, grab it now */
 		if (s->in_async)
 			mutex_lock(&ctx->uring_lock);
@@ -2284,9 +2330,6 @@ static void io_sq_wq_submit_work(struct work_struct *work)
 			io_put_req(req, NULL);
 		}
 
-		/* async context always use a copy of the sqe */
-		kfree(sqe);
-
 		/* if a dependent link is ready, do that as the next one */
 		if (!ret && nxt) {
 			req = nxt;
@@ -2451,6 +2494,7 @@ static int io_queue_async(struct io_ring_ctx *ctx, struct io_kiocb *req,
 		return -ENOMEM;
 
 	s->sqe = sqe_copy;
+	req->flags |= REQ_F_SQE_ALLOC;
 	memcpy(&req->submit, s, sizeof(*s));
 	list = io_async_list_from_sqe(ctx, s->sqe);
@@ -2607,6 +2651,7 @@ static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
 	}
 
 	s->sqe = sqe_copy;
+	req->flags |= REQ_F_SQE_ALLOC;
 	memcpy(&req->submit, s, sizeof(*s));
 	trace_io_uring_link(ctx, req, prev);
 	list_add_tail(&req->list, &prev->link_list);
-- 
1.8.3.1
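
For anyone who wants to exercise the IORING_SETUP_IOPOLL completion path
this change touches, a minimal userspace sketch along the following lines
can be used. It is illustrative only: it assumes liburing is installed,
and the device path (/dev/nvme0n1), 4k block size, and queue depth of 8
are arbitrary choices; polled I/O needs O_DIRECT on a device that
supports completion polling.

/*
 * Minimal sketch: submit one polled read and reap its completion.
 * Build (assumption): gcc iopoll_read.c -o iopoll_read -luring
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/uio.h>
#include <liburing.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct iovec iov;
	int fd, ret;

	/* Polled I/O requires O_DIRECT; the path here is an assumption. */
	fd = open("/dev/nvme0n1", O_RDONLY | O_DIRECT);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* O_DIRECT needs an aligned buffer. */
	if (posix_memalign(&iov.iov_base, 4096, 4096))
		return 1;
	iov.iov_len = 4096;

	/* IOPOLL rings poll for completions instead of using interrupts. */
	ret = io_uring_queue_init(8, &ring, IORING_SETUP_IOPOLL);
	if (ret) {
		fprintf(stderr, "queue_init: %s\n", strerror(-ret));
		return 1;
	}

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_readv(sqe, fd, &iov, 1, 0);
	io_uring_submit(&ring);

	/*
	 * Reaping the CQE drives the kernel's polled completion side
	 * (io_do_iopoll()/io_iopoll_complete() in fs/io_uring.c).
	 */
	ret = io_uring_wait_cqe(&ring, &cqe);
	if (!ret) {
		printf("read returned %d\n", cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}

	io_uring_queue_exit(&ring);
	close(fd);
	free(iov.iov_base);
	return 0;
}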