Each put_task_struct() is an atomic_dec. Do that in batches.

Tested io_uring-bench (iopoll, QD=128) with a custom nullblk, where the
added ->iopoll() is not optimised at all:

before: 529504 IOPS
after:  538415 IOPS
diff: ~1.8%

Signed-off-by: Pavel Begunkov <asml.silence@xxxxxxxxx>
---
 fs/io_uring.c | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6f767781351f..3216cc00061b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1761,8 +1761,18 @@ static void io_free_req(struct io_kiocb *req)
 struct req_batch {
 	void *reqs[IO_IOPOLL_BATCH];
 	int to_free;
+
+	struct task_struct	*task;
+	int			task_refs;
 };
 
+static void io_init_req_batch(struct req_batch *rb)
+{
+	rb->to_free = 0;
+	rb->task_refs = 0;
+	rb->task = NULL;
+}
+
 static void __io_req_free_batch_flush(struct io_ring_ctx *ctx,
 				      struct req_batch *rb)
 {
@@ -1776,6 +1786,10 @@ static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
 {
 	if (rb->to_free)
 		__io_req_free_batch_flush(ctx, rb);
+	if (rb->task) {
+		put_task_struct_many(rb->task, rb->task_refs);
+		rb->task = NULL;
+	}
 }
 
 static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req)
@@ -1787,6 +1801,16 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req)
 	if (req->flags & REQ_F_LINK_HEAD)
 		io_queue_next(req);
 
+	if (req->flags & REQ_F_TASK_PINNED) {
+		if (req->task != rb->task && rb->task) {
+			put_task_struct_many(rb->task, rb->task_refs);
+			rb->task = req->task;
+			rb->task_refs = 0;
+		}
+		rb->task_refs++;
+		req->flags &= ~REQ_F_TASK_PINNED;
+	}
+
 	io_dismantle_req(req);
 	rb->reqs[rb->to_free++] = req;
 	if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
@@ -1809,7 +1833,7 @@ static void io_submit_flush_completions(struct io_comp_state *cs)
 	spin_unlock_irq(&ctx->completion_lock);
 	io_cqring_ev_posted(ctx);
 
-	rb.to_free = 0;
+	io_init_req_batch(&rb);
 	for (i = 0; i < nr; ++i) {
 		req = cs->reqs[i];
 		if (refcount_dec_and_test(&req->refs))
@@ -1973,7 +1997,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 	/* order with ->result store in io_complete_rw_iopoll() */
 	smp_rmb();
 
-	rb.to_free = 0;
+	io_init_req_batch(&rb);
 	while (!list_empty(done)) {
 		int cflags = 0;
 
-- 
2.24.0
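
The saving comes from put_task_struct_many() folding a whole batch of task-reference
drops into a single atomic operation. As a rough sketch (assuming the refcount_t-based
task_struct::usage field and the helper living in include/linux/sched/task.h), it boils
down to:

static inline void put_task_struct_many(struct task_struct *t, int nr)
{
	/* one atomic sub of 'nr' instead of 'nr' individual atomic decs */
	if (refcount_sub_and_test(nr, &t->usage))
		__put_task_struct(t);
}

so flushing rb->task_refs once per batch replaces one put_task_struct() per request.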