This is another performance optimization - async requests are better
served on another core. Async blocking requests are marked as such and
treated as sync requests.

Example with mmap read
fio --size=1G --numjobs=32 --ioengine=mmap --output-format=normal,terse \
    --directory=/scratch/dest/ --rw=read --bs=4K --group_reporting \
    job-file.fio

jobs   /dev/fuse     uring       gain     uring       gain     gain
                   (core+1)   (to dev)            (to dev)  (uring same-core)
   1     124.61     306.59      2.46     255.51     2.05     0.83
   2     248.83     580.00      2.33     563.00     2.26     0.97
   4     611.47    1049.65      1.72     998.57     1.63     0.95
   8    1499.95    1848.42      1.23    1990.64     1.33     1.08
  16    2206.30    2890.24      1.31    3439.13     1.56     1.19
  24    2545.68    2704.87      1.06    4527.63     1.78     1.67
  32    2233.52    2574.37      1.15    5263.09     2.36     2.04

Interesting here is that the max gain comes with more core usage,
I had actually expected the other way around.

Signed-off-by: Bernd Schubert <bschubert@xxxxxxx>
---
 fs/fuse/dev_uring.c | 5 ++++-
 fs/fuse/file.c      | 1 +
 fs/fuse/fuse_i.h    | 1 +
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
index fe80e66150c3..dff210658172 100644
--- a/fs/fuse/dev_uring.c
+++ b/fs/fuse/dev_uring.c
@@ -1106,6 +1106,8 @@ int fuse_uring_queue_fuse_req(struct fuse_conn *fc, struct fuse_req *req)
 	struct list_head *req_queue, *ent_queue;
 
 	if (ring->per_core_queue) {
+		int cpu_off;
+
 		/*
 		 * async requests are best handled on another core, the current
 		 * core can do application/page handling, while the async request
@@ -1118,7 +1120,8 @@
 		 * It should also not persistently switch between cores - makes
 		 * it hard for the scheduler.
 		 */
-		qid = task_cpu(current);
+		cpu_off = async ? 1 : 0;
+		qid = (task_cpu(current) + cpu_off) % ring->nr_queues;
 
 		if (unlikely(qid >= ring->nr_queues)) {
 			WARN_ONCE(1,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index b57ce4157640..6fda1e7bd7f4 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -791,6 +791,7 @@ static ssize_t fuse_async_req_send(struct fuse_mount *fm,
 
 	ia->ap.args.end = fuse_aio_complete_req;
 	ia->ap.args.may_block = io->should_dirty;
+	ia->ap.args.async_blocking = io->blocking;
 	err = fuse_simple_background(fm, &ia->ap.args, GFP_KERNEL);
 	if (err)
 		fuse_aio_complete_req(fm, &ia->ap.args, err);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index fadc51a22bb9..7dcf0472df67 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -309,6 +309,7 @@ struct fuse_args {
 	bool may_block:1;
 	bool is_ext:1;
 	bool is_pinned:1;
+	bool async_blocking : 1;
 	struct fuse_in_arg in_args[3];
 	struct fuse_arg out_args[2];
 	void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error);
-- 
2.40.1