This is another optimization: async requests switch cores to send IO
(as of now to core + 1), but using another core also means overhead,
so set a minimum IO size below which the switch is skipped.

Signed-off-by: Bernd Schubert <bschubert@xxxxxxx>

---
I didn't annotate exact benchmark data, but can extract it (needs
verification):

jobs  /dev/fuse    uring         uring        uring
                   (same core)   (core + 1)   (conditional core + 1)
  1      127598       313944        261641        330445
  2      254806       593925        576516        551392
  4      626144      1074837       1022533       1065389
  8     1535953      1892787       2038420       2087627
 16     2259253      2959607       3521665       3602580
 24     2606776      2769790       4636297       4670717
 32     2287126      2636150       5389404       5763385

I.e. this is mostly to compensate for the slight degradation with
core + 1 for small requests on few cores.
---
 fs/fuse/dev_uring.c   | 69 +++++++++++++++++++++++++++++++++++++--------------
 fs/fuse/dev_uring_i.h |  7 ++++++
 fs/fuse/file.c        | 14 ++++++++++-
 3 files changed, 70 insertions(+), 20 deletions(-)

diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
index dff210658172..cdc5836edb6e 100644
--- a/fs/fuse/dev_uring.c
+++ b/fs/fuse/dev_uring.c
@@ -1095,18 +1095,33 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
 	goto out;
 }
 
-int fuse_uring_queue_fuse_req(struct fuse_conn *fc, struct fuse_req *req)
+static int fuse_uring_get_req_qid(struct fuse_req *req, struct fuse_ring *ring,
+				  bool async)
 {
-	struct fuse_ring *ring = fc->ring;
-	struct fuse_ring_queue *queue;
-	int qid = 0;
-	struct fuse_ring_ent *ring_ent = NULL;
-	int res;
-	bool async = test_bit(FR_BACKGROUND, &req->flags);
-	struct list_head *req_queue, *ent_queue;
+	int cpu_off = 0;
+	size_t req_size = 0;
+	int qid;
 
-	if (ring->per_core_queue) {
-		int cpu_off;
+	if (!ring->per_core_queue)
+		return 0;
+
+	/*
+	 * Handling async on a different core (see below) introduces context
+	 * switching - this should be avoided for small requests
+	 */
+	if (async) {
+		switch (req->args->opcode) {
+		case FUSE_READ:
+			req_size = req->args->out_args[0].size;
+			break;
+		case FUSE_WRITE:
+			req_size = req->args->in_args[1].size;
+			break;
+		default:
+			/* anything else, <= 4K */
+			req_size = 0;
+			break;
+		}
 
 		/*
 		 * async requests are best handled on another core, the current
@@ -1120,17 +1135,33 @@ int fuse_uring_queue_fuse_req(struct fuse_conn *fc, struct fuse_req *req)
 		 * It should also not persistently switch between cores - makes
 		 * it hard for the scheduler.
 		 */
-		cpu_off = async ? 1 : 0;
-		qid = (task_cpu(current) + cpu_off) % ring->nr_queues;
-
-		if (unlikely(qid >= ring->nr_queues)) {
-			WARN_ONCE(1,
-				  "Core number (%u) exceeds nr ueues (%zu)\n",
-				  qid, ring->nr_queues);
-			qid = 0;
-		}
+		if (req_size > FUSE_URING_MIN_ASYNC_SIZE)
+			cpu_off = 1;
 	}
 
+	qid = (task_cpu(current) + cpu_off) % ring->nr_queues;
+
+	if (unlikely(qid >= ring->nr_queues)) {
+		WARN_ONCE(1, "Core number (%u) exceeds nr queues (%zu)\n",
+			  qid, ring->nr_queues);
+		qid = 0;
+	}
+
+	return qid;
+}
+
+int fuse_uring_queue_fuse_req(struct fuse_conn *fc, struct fuse_req *req)
+{
+	struct fuse_ring *ring = fc->ring;
+	struct fuse_ring_queue *queue;
+	struct fuse_ring_ent *ring_ent = NULL;
+	int res;
+	int async = test_bit(FR_BACKGROUND, &req->flags) &&
+		    !req->args->async_blocking;
+	struct list_head *ent_queue, *req_queue;
+	int qid;
+
+	qid = fuse_uring_get_req_qid(req, ring, async);
 	queue = fuse_uring_get_queue(ring, qid);
 	req_queue = async ? &queue->async_fuse_req_queue :
 			    &queue->sync_fuse_req_queue;
diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h
index 5d7e1e6e7a82..0b201becdf5a 100644
--- a/fs/fuse/dev_uring_i.h
+++ b/fs/fuse/dev_uring_i.h
@@ -11,6 +11,13 @@
 #include "linux/compiler_types.h"
 #include "linux/rbtree_types.h"
 
+/**
+ * Minimum async request size with uring communication. Async is handled on a
+ * different core and that has overhead, so the async queue is only used
+ * beginning with a certain size - XXX should this be a tunable parameter?
+ */
+#define FUSE_URING_MIN_ASYNC_SIZE (16384)
+
 #if IS_ENABLED(CONFIG_FUSE_IO_URING)
 
 /* IORING_MAX_ENTRIES */
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 6fda1e7bd7f4..4fc742bf0588 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -7,6 +7,7 @@
  */
 
 #include "fuse_i.h"
+#include "dev_uring_i.h"
 
 #include <linux/pagemap.h>
 #include <linux/slab.h>
@@ -955,11 +956,22 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
 {
 	struct fuse_file *ff = file->private_data;
 	struct fuse_mount *fm = ff->fm;
+	struct fuse_conn *fc = fm->fc;
 	struct fuse_args_pages *ap = &ia->ap;
 	loff_t pos = page_offset(ap->pages[0]);
 	size_t count = ap->num_pages << PAGE_SHIFT;
 	ssize_t res;
 	int err;
+	unsigned int async = fc->async_read;
+
+	/*
+	 * Sync requests stay longer on the same core - important with uring.
+	 * Check here and not only in dev_uring.c, as fuse_simple_request
+	 * lets us control whether the request should wake up on the same
+	 * core - avoids application core switching
+	 */
+	if (async && fuse_uring_ready(fc) && count <= FUSE_URING_MIN_ASYNC_SIZE)
+		async = 0;
 
 	ap->args.out_pages = true;
 	ap->args.page_zeroing = true;
@@ -974,7 +986,7 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
 	fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
 	ia->read.attr_ver = fuse_get_attr_version(fm->fc);
 
-	if (fm->fc->async_read) {
+	if (async) {
 		ia->ff = fuse_file_get(ff);
 		ap->args.end = fuse_readpages_end;
 		err = fuse_simple_background(fm, &ap->args, GFP_KERNEL);
--
2.40.1
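
[Editor's note] For anyone who wants to poke at the heuristic outside the
kernel, here is a minimal userspace sketch of the qid selection logic above.
It is illustration only, not part of the patch: pick_qid() and NR_QUEUES are
made-up names, sched_getcpu() stands in for task_cpu(current), and only the
FUSE_URING_MIN_ASYNC_SIZE value is taken from the patch.

#define _GNU_SOURCE		/* for sched_getcpu() */
#include <sched.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define FUSE_URING_MIN_ASYNC_SIZE 16384	/* from fs/fuse/dev_uring_i.h above */
#define NR_QUEUES 8			/* hypothetical per-core queue count */

/*
 * Mirrors fuse_uring_get_req_qid(): only async requests larger than the
 * threshold pay the core switch (core + 1); sync and small async requests
 * stay on the submitting core.
 */
static int pick_qid(bool async, size_t req_size, int cur_cpu)
{
	int cpu_off = 0;

	if (async && req_size > FUSE_URING_MIN_ASYNC_SIZE)
		cpu_off = 1;

	return (cur_cpu + cpu_off) % NR_QUEUES;
}

int main(void)
{
	int cpu = sched_getcpu();

	/* small async read: stays on the submitting core */
	printf("cpu %d, 4K async  -> qid %d\n", cpu, pick_qid(true, 4096, cpu));
	/* large async read: moves to core + 1 */
	printf("cpu %d, 64K async -> qid %d\n", cpu, pick_qid(true, 65536, cpu));
	/* sync requests never switch */
	printf("cpu %d, 64K sync  -> qid %d\n", cpu, pick_qid(false, 65536, cpu));
	return 0;
}

Built with e.g. "gcc -Wall qid_demo.c": a 4K async request maps to the
submitting core's queue while a 64K async request maps to core + 1, which is
the behavior behind the "conditional core + 1" column of the benchmark table.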