This is another optimization: async requests switch cores to send IO
(as of now to core + 1), but using another core also means overhead,
so set a minimum IO size below which the switch is skipped.

Signed-off-by: Bernd Schubert <bschubert@xxxxxxx>

---
I didn't annotate exact benchmark data, but can extract it (needs
verification):

jobs  /dev/fuse    uring         uring        uring
                   (same core)   (core + 1)   (conditional core + 1)
  1      127598       313944        261641        330445
  2      254806       593925        576516        551392
  4      626144      1074837       1022533       1065389
  8     1535953      1892787       2038420       2087627
 16     2259253      2959607       3521665       3602580
 24     2606776      2769790       4636297       4670717
 32     2287126      2636150       5389404       5763385

I.e. this is mostly to compensate for the slight degradation with
core + 1 for small requests on few cores.
---
 fs/fuse/dev_uring.c   | 69 +++++++++++++++++++++++++++++++++++++--------------
 fs/fuse/dev_uring_i.h |  7 ++++++
 fs/fuse/file.c        | 14 ++++++++++-
 3 files changed, 70 insertions(+), 20 deletions(-)

diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
index dff210658172..cdc5836edb6e 100644
--- a/fs/fuse/dev_uring.c
+++ b/fs/fuse/dev_uring.c
@@ -1095,18 +1095,33 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
 	goto out;
 }
 
-int fuse_uring_queue_fuse_req(struct fuse_conn *fc, struct fuse_req *req)
+static int fuse_uring_get_req_qid(struct fuse_req *req, struct fuse_ring *ring,
+				  bool async)
 {
-	struct fuse_ring *ring = fc->ring;
-	struct fuse_ring_queue *queue;
-	int qid = 0;
-	struct fuse_ring_ent *ring_ent = NULL;
-	int res;
-	bool async = test_bit(FR_BACKGROUND, &req->flags);
-	struct list_head *req_queue, *ent_queue;
+	int cpu_off = 0;
+	size_t req_size = 0;
+	int qid;
 
-	if (ring->per_core_queue) {
-		int cpu_off;
+	if (!ring->per_core_queue)
+		return 0;
+
+	/*
+	 * Handling async on a different core (see below) introduces context
+	 * switching - this should be avoided for small requests
+	 */
+	if (async) {
+		switch (req->args->opcode) {
+		case FUSE_READ:
+			req_size = req->args->out_args[0].size;
+			break;
+		case FUSE_WRITE:
+			req_size = req->args->in_args[1].size;
+			break;
+		default:
+			/* anything else, <= 4K */
+			req_size = 0;
+			break;
+		}
 
 		/*
 		 * async requests are best handled on another core, the current
@@ -1120,17 +1135,33 @@ int fuse_uring_queue_fuse_req(struct fuse_conn *fc, struct fuse_req *req)
 		 * It should also not persistently switch between cores - makes
 		 * it hard for the scheduler.
 		 */
-		cpu_off = async ? 1 : 0;
-		qid = (task_cpu(current) + cpu_off) % ring->nr_queues;
-
-		if (unlikely(qid >= ring->nr_queues)) {
-			WARN_ONCE(1,
-				  "Core number (%u) exceeds nr ueues (%zu)\n",
-				  qid, ring->nr_queues);
-			qid = 0;
-		}
+		if (req_size > FUSE_URING_MIN_ASYNC_SIZE)
+			cpu_off = 1;
 	}
 
+	qid = (task_cpu(current) + cpu_off) % ring->nr_queues;
+
+	if (unlikely(qid >= ring->nr_queues)) {
+		WARN_ONCE(1, "Core number (%u) exceeds nr queues (%zu)\n",
+			  qid, ring->nr_queues);
+		qid = 0;
+	}
+
+	return qid;
+}
+
+int fuse_uring_queue_fuse_req(struct fuse_conn *fc, struct fuse_req *req)
+{
+	struct fuse_ring *ring = fc->ring;
+	struct fuse_ring_queue *queue;
+	struct fuse_ring_ent *ring_ent = NULL;
+	int res;
+	int async = test_bit(FR_BACKGROUND, &req->flags) &&
+		    !req->args->async_blocking;
+	struct list_head *ent_queue, *req_queue;
+	int qid;
+
+	qid = fuse_uring_get_req_qid(req, ring, async);
 	queue = fuse_uring_get_queue(ring, qid);
 	req_queue = async ? &queue->async_fuse_req_queue :
 			    &queue->sync_fuse_req_queue;
diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h
index 5d7e1e6e7a82..0b201becdf5a 100644
--- a/fs/fuse/dev_uring_i.h
+++ b/fs/fuse/dev_uring_i.h
@@ -11,6 +11,13 @@
 #include "linux/compiler_types.h"
 #include "linux/rbtree_types.h"
 
+/**
+ * Minimum async request size with uring communication. Async is handled on a
+ * different core and that has overhead, so the async queue is only used
+ * beginning with a certain size - XXX should this be a tunable parameter?
+ */
+#define FUSE_URING_MIN_ASYNC_SIZE (16384)
+
 #if IS_ENABLED(CONFIG_FUSE_IO_URING)
 
 /* IORING_MAX_ENTRIES */
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 6fda1e7bd7f4..4fc742bf0588 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -7,6 +7,7 @@
  */
 
 #include "fuse_i.h"
+#include "dev_uring_i.h"
 
 #include <linux/pagemap.h>
 #include <linux/slab.h>
@@ -955,11 +956,22 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
 {
 	struct fuse_file *ff = file->private_data;
 	struct fuse_mount *fm = ff->fm;
+	struct fuse_conn *fc = fm->fc;
 	struct fuse_args_pages *ap = &ia->ap;
 	loff_t pos = page_offset(ap->pages[0]);
 	size_t count = ap->num_pages << PAGE_SHIFT;
 	ssize_t res;
 	int err;
+	unsigned int async = fc->async_read;
+
+	/*
+	 * Sync requests stay longer on the same core - important with uring.
+	 * Check here and not only in dev_uring.c, as fuse_simple_request
+	 * lets us control whether the request should wake up on the same
+	 * core - avoids application core switching
+	 */
+	if (async && fuse_uring_ready(fc) && count <= FUSE_URING_MIN_ASYNC_SIZE)
+		async = 0;
 
 	ap->args.out_pages = true;
 	ap->args.page_zeroing = true;
@@ -974,7 +986,7 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
 	fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
 	ia->read.attr_ver = fuse_get_attr_version(fm->fc);
 
-	if (fm->fc->async_read) {
+	if (async) {
 		ia->ff = fuse_file_get(ff);
 		ap->args.end = fuse_readpages_end;
 		err = fuse_simple_background(fm, &ap->args, GFP_KERNEL);
--
2.40.1
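
[Editor's note] For anyone who wants to poke at the heuristic outside the
kernel, here is a minimal userspace sketch of the qid selection logic above.
It is illustration only, not part of the patch: pick_qid() and NR_QUEUES are
made-up names, sched_getcpu() stands in for task_cpu(current), and only the
FUSE_URING_MIN_ASYNC_SIZE value is taken from the patch.

#define _GNU_SOURCE		/* for sched_getcpu() */
#include <sched.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define FUSE_URING_MIN_ASYNC_SIZE 16384	/* from fs/fuse/dev_uring_i.h above */
#define NR_QUEUES 8			/* hypothetical per-core queue count */

/*
 * Mirrors fuse_uring_get_req_qid(): only async requests larger than the
 * threshold pay the core switch (core + 1); sync and small async requests
 * stay on the submitting core.
 */
static int pick_qid(bool async, size_t req_size, int cur_cpu)
{
	int cpu_off = 0;

	if (async && req_size > FUSE_URING_MIN_ASYNC_SIZE)
		cpu_off = 1;

	return (cur_cpu + cpu_off) % NR_QUEUES;
}

int main(void)
{
	int cpu = sched_getcpu();

	/* small async read: stays on the submitting core */
	printf("cpu %d, 4K async  -> qid %d\n", cpu, pick_qid(true, 4096, cpu));
	/* large async read: moves to core + 1 */
	printf("cpu %d, 64K async -> qid %d\n", cpu, pick_qid(true, 65536, cpu));
	/* sync requests never switch */
	printf("cpu %d, 64K sync  -> qid %d\n", cpu, pick_qid(false, 65536, cpu));
	return 0;
}

Built with e.g. "gcc -Wall qid_demo.c": a 4K async request maps to the
submitting core's queue while a 64K async request maps to core + 1, which is
the behavior behind the "conditional core + 1" column of the benchmark table.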