On 5/14/20 8:22 AM, Jens Axboe wrote:
>> I still use my previous io_uring_nop_stress tool to evaluate the
>> improvement on a physical machine. Memory is 250GB and the CPU is
>> "Intel(R) Xeon(R) CPU E5-2682 v4 @ 2.50GHz".
>> Before this patch:
>> $ sudo taskset -c 60 ./io_uring_nop_stress -r 300
>> total ios: 1608773840
>> IOPS:      5362579
>>
>> With this patch:
>> $ sudo taskset -c 60 ./io_uring_nop_stress -r 300
>> total ios: 1676910736
>> IOPS:      5589702
>> About a 4.2% improvement.
>
> That's not bad. Can you try the patch from Pekka as well, just to see if
> that helps for you?
>
> I also had another idea... We basically have two types of request
> lifetimes:
>
> 1) io_kiocb can get queued up internally
> 2) io_kiocb completes inline
>
> For the latter, it's totally feasible to just have the io_kiocb on the
> stack. The downside is that if we need to go the slower path, we then
> need to alloc an io_kiocb and copy it. But maybe that's OK... I'll play
> with it.

Can you try this with your microbenchmark? Just curious what it looks like
for that test case if we completely take slab alloc+free out of it.

diff --git a/fs/io_uring.c b/fs/io_uring.c
index d2e37215d05a..4ecd6bd38f02 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -525,6 +525,7 @@ enum {
        REQ_F_POLLED_BIT,
        REQ_F_BUFFER_SELECTED_BIT,
        REQ_F_NO_FILE_TABLE_BIT,
+       REQ_F_STACK_REQ_BIT,

        /* not a real bit, just to check we're not overflowing the space */
        __REQ_F_LAST_BIT,
@@ -580,6 +581,8 @@ enum {
        REQ_F_BUFFER_SELECTED   = BIT(REQ_F_BUFFER_SELECTED_BIT),
        /* doesn't need file table for this request */
        REQ_F_NO_FILE_TABLE     = BIT(REQ_F_NO_FILE_TABLE_BIT),
+       /* on-stack req */
+       REQ_F_STACK_REQ         = BIT(REQ_F_STACK_REQ_BIT),
 };

 struct async_poll {
@@ -695,10 +698,14 @@ struct io_op_def {
        unsigned                pollout : 1;
        /* op supports buffer selection */
        unsigned                buffer_select : 1;
+       /* op can use stack req */
+       unsigned                stack_req : 1;
 };

 static const struct io_op_def io_op_defs[] = {
-       [IORING_OP_NOP] = {},
+       [IORING_OP_NOP] = {
+               .stack_req              = 1,
+       },
        [IORING_OP_READV] = {
                .async_ctx              = 1,
                .needs_mm               = 1,
@@ -1345,7 +1352,8 @@ static void __io_req_aux_free(struct io_kiocb *req)
        if (req->flags & REQ_F_NEED_CLEANUP)
                io_cleanup_req(req);

-       kfree(req->io);
+       if (req->io)
+               kfree(req->io);
        if (req->file)
                io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
        if (req->task)
@@ -1370,6 +1378,8 @@ static void __io_free_req(struct io_kiocb *req)
        }

        percpu_ref_put(&req->ctx->refs);
+       if (req->flags & REQ_F_STACK_REQ)
+               return;
        if (likely(!io_is_fallback_req(req)))
                kmem_cache_free(req_cachep, req);
        else
@@ -5784,12 +5794,10 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
         * link list.
         */
        req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
-       req->opcode = READ_ONCE(sqe->opcode);
        req->user_data = READ_ONCE(sqe->user_data);
        req->io = NULL;
        req->file = NULL;
        req->ctx = ctx;
-       req->flags = 0;
        /* one is dropped after submission, the other at completion */
        refcount_set(&req->refs, 2);
        req->task = NULL;
@@ -5839,6 +5847,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 {
        struct io_submit_state state, *statep = NULL;
        struct io_kiocb *link = NULL;
+       struct io_kiocb stack_req;
        int i, submitted = 0;

        /* if we have a backlog and couldn't flush it all, return BUSY */
@@ -5865,20 +5874,31 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
        for (i = 0; i < nr; i++) {
                const struct io_uring_sqe *sqe;
                struct io_kiocb *req;
-               int err;
+               int err, op;

                sqe = io_get_sqe(ctx);
                if (unlikely(!sqe)) {
                        io_consume_sqe(ctx);
                        break;
                }
-               req = io_alloc_req(ctx, statep);
-               if (unlikely(!req)) {
-                       if (!submitted)
-                               submitted = -EAGAIN;
-                       break;
+
+               op = READ_ONCE(sqe->opcode);
+
+               if (io_op_defs[op].stack_req) {
+                       req = &stack_req;
+                       req->flags = REQ_F_STACK_REQ;
+               } else {
+                       req = io_alloc_req(ctx, statep);
+                       if (unlikely(!req)) {
+                               if (!submitted)
+                                       submitted = -EAGAIN;
+                               break;
+                       }
+                       req->flags = 0;
                }
+               req->opcode = op;
+
                err = io_init_req(ctx, req, sqe, statep, async);
                io_consume_sqe(ctx);
                /* will complete beyond this point, count as submitted */

--
Jens Axboe
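[Editorial note: the "alloc an io_kiocb and copy it" slow path mentioned in the
reply is not implemented in the diff above. As a rough illustration only, not
part of Jens's patch, a promotion helper inside fs/io_uring.c might look like
the sketch below. The function name io_promote_stack_req is made up for this
example; io_kiocb, req_cachep, and REQ_F_STACK_REQ come from the existing code
and the patch above.]

/*
 * Hypothetical sketch (not from the patch above): promote an on-stack
 * request to a slab-allocated one before handing it off to async
 * context, so it can outlive io_submit_sqes().
 */
static struct io_kiocb *io_promote_stack_req(struct io_kiocb *stack_req)
{
        struct io_kiocb *req;

        req = kmem_cache_alloc(req_cachep, GFP_KERNEL);
        if (unlikely(!req))
                return NULL;

        /* carry over the submission-time state from the on-stack request */
        memcpy(req, stack_req, sizeof(*req));
        /* the heap copy goes through the normal slab free path */
        req->flags &= ~REQ_F_STACK_REQ;
        return req;
}

[A caller would continue with the returned heap request and treat a NULL
return like any other allocation failure.]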