There are three cases where we care most about the performance of
io_cqring_fill_event() -- flushing inline completions, iopoll and
io_req_complete_post(). Inline a hot part of fill_event() into them.

All other callers are less important, and we don't want to bloat the
binary for them, so add a noinline version of the function for those
use cases.

nops test(batch=32): 16.932 vs 17.822 KIOPS

Signed-off-by: Pavel Begunkov <asml.silence@xxxxxxxxx>
---
 fs/io_uring.c | 57 ++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 36 insertions(+), 21 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 1a7bfb10d2b2..a8d6ea1ecd2d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1338,7 +1338,7 @@ static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
 	return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
 }
 
-static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
+static inline struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
 {
 	struct io_rings *rings = ctx->rings;
 	unsigned tail;
@@ -1494,26 +1494,11 @@ static inline void req_ref_get(struct io_kiocb *req)
 	atomic_inc(&req->refs);
 }
 
-static bool io_cqring_fill_event(struct io_kiocb *req, long res,
-				 unsigned int cflags)
+static bool io_cqring_event_overflow(struct io_kiocb *req, long res,
+				     unsigned int cflags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	struct io_uring_cqe *cqe;
 
-	trace_io_uring_complete(ctx, req->user_data, res, cflags);
-
-	/*
-	 * If we can't get a cq entry, userspace overflowed the
-	 * submission (by quite a lot). Increment the overflow count in
-	 * the ring.
-	 */
-	cqe = io_get_cqring(ctx);
-	if (likely(cqe)) {
-		WRITE_ONCE(cqe->user_data, req->user_data);
-		WRITE_ONCE(cqe->res, res);
-		WRITE_ONCE(cqe->flags, cflags);
-		return true;
-	}
 	if (!atomic_read(&req->task->io_uring->in_idle)) {
 		struct io_overflow_cqe *ocqe;
 
@@ -1541,6 +1526,36 @@ static bool io_cqring_fill_event(struct io_kiocb *req, long res,
 	return false;
 }
 
+static inline bool __io_cqring_fill_event(struct io_kiocb *req, long res,
+					  unsigned int cflags)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	struct io_uring_cqe *cqe;
+
+	trace_io_uring_complete(ctx, req->user_data, res, cflags);
+
+	/*
+	 * If we can't get a cq entry, userspace overflowed the
+	 * submission (by quite a lot). Increment the overflow count in
+	 * the ring.
+	 */
+	cqe = io_get_cqring(ctx);
+	if (likely(cqe)) {
+		WRITE_ONCE(cqe->user_data, req->user_data);
+		WRITE_ONCE(cqe->res, res);
+		WRITE_ONCE(cqe->flags, cflags);
+		return true;
+	}
+	return io_cqring_event_overflow(req, res, cflags);
+}
+
+/* not as hot to bloat with inlining */
+static noinline bool io_cqring_fill_event(struct io_kiocb *req, long res,
+					  unsigned int cflags)
+{
+	return __io_cqring_fill_event(req, res, cflags);
+}
+
 static void io_req_complete_post(struct io_kiocb *req, long res,
 				 unsigned int cflags)
 {
@@ -1548,7 +1563,7 @@ static void io_req_complete_post(struct io_kiocb *req, long res,
 	unsigned long flags;
 
 	spin_lock_irqsave(&ctx->completion_lock, flags);
-	io_cqring_fill_event(req, res, cflags);
+	__io_cqring_fill_event(req, res, cflags);
 	/*
 	 * If we're the last reference to this request, add to our locked
 	 * free_list cache.
@@ -2103,7 +2118,7 @@ static void io_submit_flush_completions(struct io_comp_state *cs,
 	spin_lock_irq(&ctx->completion_lock);
 	for (i = 0; i < nr; i++) {
 		req = cs->reqs[i];
-		io_cqring_fill_event(req, req->result, req->compl.cflags);
+		__io_cqring_fill_event(req, req->result, req->compl.cflags);
 	}
 	io_commit_cqring(ctx);
 	spin_unlock_irq(&ctx->completion_lock);
@@ -2243,7 +2258,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 		if (req->flags & REQ_F_BUFFER_SELECTED)
 			cflags = io_put_rw_kbuf(req);
 
-		io_cqring_fill_event(req, req->result, cflags);
+		__io_cqring_fill_event(req, req->result, cflags);
 
 		(*nr_events)++;
 		if (req_ref_put_and_test(req))
-- 
2.24.0
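
As a side note, the split above follows a common hot/cold pattern: a
static inline fast path for the few performance-critical callers, the
rare slow path pushed out of line, and a noinline wrapper for everyone
else. The following is only a minimal standalone sketch of that pattern
with hypothetical names (not kernel code, not part of this patch),
assuming GCC/Clang for __attribute__((noinline)) and __builtin_expect():

/* hotcold.c: illustration of the inline-fast-path / noinline-wrapper split */
#include <stdbool.h>
#include <stdio.h>

#define likely(x) __builtin_expect(!!(x), 1)

static int free_slots = 2;	/* stand-in for free CQ ring space */

/* slow path: rarely taken, kept out of line so hot callers stay small */
static __attribute__((noinline)) bool post_event_overflow(int value)
{
	printf("overflowed: queued %d on a side list\n", value);
	return false;
}

/* fast path: inlined directly into performance-critical callers */
static inline bool __post_event(int value)
{
	if (likely(free_slots > 0)) {
		free_slots--;
		printf("posted %d\n", value);
		return true;
	}
	return post_event_overflow(value);
}

/* cold callers go through a noinline wrapper to avoid bloating the binary */
static __attribute__((noinline)) bool post_event(int value)
{
	return __post_event(value);
}

int main(void)
{
	for (int i = 0; i < 4; i++)
		__post_event(i);	/* hot caller: fast path inlined */
	post_event(42);			/* cold caller: out-of-line wrapper */
	return 0;
}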