This only adds the initial ioctl for basic fuse-uring initialization. More ioctl types will be added later to initialize queues. This also adds data structures needed or initialized by the ioctl command and that will be used later. Signed-off-by: Bernd Schubert <bschubert@xxxxxxx> --- fs/fuse/Kconfig | 12 +++ fs/fuse/Makefile | 1 + fs/fuse/dev.c | 91 ++++++++++++++++-- fs/fuse/dev_uring.c | 122 +++++++++++++++++++++++ fs/fuse/dev_uring_i.h | 239 ++++++++++++++++++++++++++++++++++++++++++++++ fs/fuse/fuse_dev_i.h | 1 + fs/fuse/fuse_i.h | 5 + fs/fuse/inode.c | 3 + include/uapi/linux/fuse.h | 73 ++++++++++++++ 9 files changed, 538 insertions(+), 9 deletions(-) diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index 8674dbfbe59d..11f37cefc94b 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -63,3 +63,15 @@ config FUSE_PASSTHROUGH to be performed directly on a backing file. If you want to allow passthrough operations, answer Y. + +config FUSE_IO_URING + bool "FUSE communication over io-uring" + default y + depends on FUSE_FS + depends on IO_URING + help + This allows sending FUSE requests over the IO uring interface and + also adds request core affinity. + + If you want to allow fuse server/client communication through io-uring, + answer Y diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 6e0228c6d0cb..7193a14374fd 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -11,5 +11,6 @@ fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o fuse-y += iomode.o fuse-$(CONFIG_FUSE_DAX) += dax.o fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o +fuse-$(CONFIG_FUSE_IO_URING) += dev_uring.o virtiofs-y := virtio_fs.o diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index b98ecb197a28..bc77413932cf 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -8,6 +8,7 @@ #include "fuse_i.h" #include "fuse_dev_i.h" +#include "dev_uring_i.h" #include <linux/init.h> #include <linux/module.h> @@ -26,6 +27,13 @@ MODULE_ALIAS_MISCDEV(FUSE_MINOR); MODULE_ALIAS("devname:fuse"); +#if IS_ENABLED(CONFIG_FUSE_IO_URING) +static bool __read_mostly enable_uring; +module_param(enable_uring, bool, 0644); +MODULE_PARM_DESC(enable_uring, + "Enable uring userspace communication through uring."); +#endif + static struct kmem_cache *fuse_req_cachep; static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req) @@ -2297,16 +2305,12 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new) return 0; } -static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp) +static long _fuse_dev_ioctl_clone(struct file *file, int oldfd) { int res; - int oldfd; struct fuse_dev *fud = NULL; struct fd f; - if (get_user(oldfd, argp)) - return -EFAULT; - f = fdget(oldfd); if (!f.file) return -EINVAL; @@ -2329,6 +2333,16 @@ static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp) return res; } +static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp) +{ + int oldfd; + + if (get_user(oldfd, argp)) + return -EFAULT; + + return _fuse_dev_ioctl_clone(file, oldfd); +} + static long fuse_dev_ioctl_backing_open(struct file *file, struct fuse_backing_map __user *argp) { @@ -2364,8 +2378,65 @@ static long fuse_dev_ioctl_backing_close(struct file *file, __u32 __user *argp) return fuse_backing_close(fud->fc, backing_id); } -static long fuse_dev_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) +/** + * Configure the queue for the given qid. First call will also initialize + * the ring for this connection. + */ +static long fuse_uring_ioctl(struct file *file, __u32 __user *argp) +{ +#if IS_ENABLED(CONFIG_FUSE_IO_URING) + int res; + struct fuse_uring_cfg cfg; + struct fuse_dev *fud; + struct fuse_conn *fc; + struct fuse_ring *ring; + + res = copy_from_user(&cfg, (void *)argp, sizeof(cfg)); + if (res != 0) + return -EFAULT; + + fud = fuse_get_dev(file); + if (fud == NULL) + return -ENODEV; + fc = fud->fc; + + switch (cfg.cmd) { + case FUSE_URING_IOCTL_CMD_RING_CFG: + if (READ_ONCE(fc->ring) == NULL) + ring = kzalloc(sizeof(*fc->ring), GFP_KERNEL); + + spin_lock(&fc->lock); + if (fc->ring == NULL) { + fc->ring = ring; + fuse_uring_conn_init(fc->ring, fc); + } else { + kfree(ring); + } + + spin_unlock(&fc->lock); + if (fc->ring == NULL) + return -ENOMEM; + + mutex_lock(&fc->ring->start_stop_lock); + res = fuse_uring_conn_cfg(fc->ring, &cfg.rconf); + mutex_unlock(&fc->ring->start_stop_lock); + + if (res != 0) + return res; + break; + default: + res = -EINVAL; + } + + return res; +#else + return -ENOTTY; +#endif +} + +static long +fuse_dev_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) { void __user *argp = (void __user *)arg; @@ -2379,8 +2450,10 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, case FUSE_DEV_IOC_BACKING_CLOSE: return fuse_dev_ioctl_backing_close(file, argp); - default: - return -ENOTTY; + case FUSE_DEV_IOC_URING: + return fuse_uring_ioctl(file, argp); + + default: return -ENOTTY; } } diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c new file mode 100644 index 000000000000..702a994cf192 --- /dev/null +++ b/fs/fuse/dev_uring.c @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FUSE: Filesystem in Userspace + * Copyright (c) 2023-2024 DataDirect Networks. + */ + +#include "fuse_i.h" +#include "fuse_dev_i.h" +#include "dev_uring_i.h" + +#include "linux/compiler_types.h" +#include "linux/spinlock.h" +#include <linux/init.h> +#include <linux/module.h> +#include <linux/poll.h> +#include <linux/sched/signal.h> +#include <linux/uio.h> +#include <linux/miscdevice.h> +#include <linux/pagemap.h> +#include <linux/file.h> +#include <linux/slab.h> +#include <linux/pipe_fs_i.h> +#include <linux/swap.h> +#include <linux/splice.h> +#include <linux/sched.h> +#include <linux/io_uring.h> +#include <linux/mm.h> +#include <linux/io.h> +#include <linux/io_uring.h> +#include <linux/io_uring/cmd.h> +#include <linux/topology.h> +#include <linux/io_uring/cmd.h> + +/* + * Basic ring setup for this connection based on the provided configuration + */ +int fuse_uring_conn_cfg(struct fuse_ring *ring, struct fuse_ring_config *rcfg) +{ + size_t queue_sz; + + if (ring->configured) { + pr_info("The ring is already configured.\n"); + return -EALREADY; + } + + if (rcfg->nr_queues == 0) { + pr_info("zero number of queues is invalid.\n"); + return -EINVAL; + } + + if (rcfg->nr_queues > 1 && rcfg->nr_queues != num_present_cpus()) { + pr_info("nr-queues (%d) does not match nr-cores (%d).\n", + rcfg->nr_queues, num_present_cpus()); + return -EINVAL; + } + + if (rcfg->req_arg_len < FUSE_RING_MIN_IN_OUT_ARG_SIZE) { + pr_info("Per req buffer size too small (%d), min: %d\n", + rcfg->req_arg_len, FUSE_RING_MIN_IN_OUT_ARG_SIZE); + return -EINVAL; + } + + if (WARN_ON(ring->queues)) + return -EINVAL; + + ring->numa_aware = rcfg->numa_aware; + ring->nr_queues = rcfg->nr_queues; + ring->per_core_queue = rcfg->nr_queues > 1; + + ring->max_nr_sync = rcfg->sync_queue_depth; + ring->max_nr_async = rcfg->async_queue_depth; + ring->queue_depth = ring->max_nr_sync + ring->max_nr_async; + + ring->req_arg_len = rcfg->req_arg_len; + ring->req_buf_sz = rcfg->user_req_buf_sz; + + ring->queue_buf_size = ring->req_buf_sz * ring->queue_depth; + + queue_sz = sizeof(*ring->queues) + + ring->queue_depth * sizeof(struct fuse_ring_ent); + ring->queues = kcalloc(rcfg->nr_queues, queue_sz, GFP_KERNEL); + if (!ring->queues) + return -ENOMEM; + ring->queue_size = queue_sz; + ring->configured = 1; + + atomic_set(&ring->queue_refs, 0); + + return 0; +} + +void fuse_uring_ring_destruct(struct fuse_ring *ring) +{ + unsigned int qid; + struct rb_node *rbn; + + for (qid = 0; qid < ring->nr_queues; qid++) { + struct fuse_ring_queue *queue = fuse_uring_get_queue(ring, qid); + + vfree(queue->queue_req_buf); + } + + kfree(ring->queues); + ring->queues = NULL; + ring->nr_queues_ioctl_init = 0; + ring->queue_depth = 0; + ring->nr_queues = 0; + + rbn = rb_first(&ring->mem_buf_map); + while (rbn) { + struct rb_node *next = rb_next(rbn); + struct fuse_uring_mbuf *entry = + rb_entry(rbn, struct fuse_uring_mbuf, rb_node); + + rb_erase(rbn, &ring->mem_buf_map); + kfree(entry); + + rbn = next; + } + + mutex_destroy(&ring->start_stop_lock); +} diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h new file mode 100644 index 000000000000..58ab4671deff --- /dev/null +++ b/fs/fuse/dev_uring_i.h @@ -0,0 +1,239 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * FUSE: Filesystem in Userspace + * Copyright (c) 2023-2024 DataDirect Networks. + */ + +#ifndef _FS_FUSE_DEV_URING_I_H +#define _FS_FUSE_DEV_URING_I_H + +#include "fuse_i.h" +#include "linux/compiler_types.h" +#include "linux/rbtree_types.h" + +#if IS_ENABLED(CONFIG_FUSE_IO_URING) + +/* IORING_MAX_ENTRIES */ +#define FUSE_URING_MAX_QUEUE_DEPTH 32768 + +struct fuse_uring_mbuf { + struct rb_node rb_node; + void *kbuf; /* kernel allocated ring request buffer */ + void *ubuf; /* mmaped address */ +}; + +/** A fuse ring entry, part of the ring queue */ +struct fuse_ring_ent { + /* + * pointer to kernel request buffer, userspace side has direct access + * to it through the mmaped buffer + */ + struct fuse_ring_req *rreq; + + /* the ring queue that owns the request */ + struct fuse_ring_queue *queue; + + struct io_uring_cmd *cmd; + + struct list_head list; + + /* + * state the request is currently in + * (enum fuse_ring_req_state) + */ + unsigned long state; + + /* array index in the ring-queue */ + int tag; + + /* is this an async or sync entry */ + unsigned int async : 1; + + struct fuse_req *fuse_req; /* when a list request is handled */ +}; + +struct fuse_ring_queue { + /* task belonging to the current queue */ + struct task_struct *server_task; + + /* + * back pointer to the main fuse uring structure that holds this + * queue + */ + struct fuse_ring *ring; + + /* issue flags when running in io-uring task context */ + unsigned int uring_cmd_issue_flags; + + int qid; + + /* + * available number of sync requests, + * loosely bound to fuse foreground requests + */ + int nr_req_sync; + + /* + * available number of async requests + * loosely bound to fuse background requests + */ + int nr_req_async; + + /* queue lock, taken when any value in the queue changes _and_ also + * a ring entry state changes. + */ + spinlock_t lock; + + /* per queue memory buffer that is divided per request */ + char *queue_req_buf; + + /* fuse fg/bg request types */ + struct list_head async_fuse_req_queue; + struct list_head sync_fuse_req_queue; + + /* available ring entries (struct fuse_ring_ent) */ + struct list_head async_ent_avail_queue; + struct list_head sync_ent_avail_queue; + + struct list_head ent_in_userspace; + + unsigned int configured : 1; + unsigned int stopped : 1; + + /* size depends on queue depth */ + struct fuse_ring_ent ring_ent[] ____cacheline_aligned_in_smp; +}; + +/** + * Describes if uring is for communication and holds alls the data needed + * for uring communication + */ +struct fuse_ring { + /* back pointer to fuse_conn */ + struct fuse_conn *fc; + + /* number of ring queues */ + size_t nr_queues; + + /* number of entries per queue */ + size_t queue_depth; + + /* max arg size for a request */ + size_t req_arg_len; + + /* req_arg_len + sizeof(struct fuse_req) */ + size_t req_buf_sz; + + /* max number of background requests per queue */ + size_t max_nr_async; + + /* max number of foreground requests */ + size_t max_nr_sync; + + /* size of struct fuse_ring_queue + queue-depth * entry-size */ + size_t queue_size; + + /* buffer size per queue, that is used per queue entry */ + size_t queue_buf_size; + + /* Used to release the ring on stop */ + atomic_t queue_refs; + + /* Hold ring requests */ + struct fuse_ring_queue *queues; + + /* number of initialized queues with the ioctl */ + int nr_queues_ioctl_init; + + /* number of SQEs initialized */ + atomic_t nr_sqe_init; + + /* one queue per core or a single queue only ? */ + unsigned int per_core_queue : 1; + + /* Is the ring completely iocl configured */ + unsigned int configured : 1; + + /* numa aware memory allocation */ + unsigned int numa_aware : 1; + + /* Is the ring read to take requests */ + unsigned int ready : 1; + + /* + * Log ring entry states onces on stop when entries cannot be + * released + */ + unsigned int stop_debug_log : 1; + + struct mutex start_stop_lock; + + wait_queue_head_t stop_waitq; + + /* mmaped ring entry memory buffers, mmaped values is the key, + * kernel pointer is the value + */ + struct rb_root mem_buf_map; + + struct delayed_work stop_work; + unsigned long stop_time; +}; + +void fuse_uring_abort_end_requests(struct fuse_ring *ring); +int fuse_uring_conn_cfg(struct fuse_ring *ring, struct fuse_ring_config *rcfg); +int fuse_uring_queue_cfg(struct fuse_ring *ring, + struct fuse_ring_queue_config *qcfg); +void fuse_uring_ring_destruct(struct fuse_ring *ring); + +static inline void fuse_uring_conn_init(struct fuse_ring *ring, + struct fuse_conn *fc) +{ + /* no reference on fc as ring and fc have to be destructed together */ + ring->fc = fc; + init_waitqueue_head(&ring->stop_waitq); + mutex_init(&ring->start_stop_lock); + ring->mem_buf_map = RB_ROOT; +} + +static inline void fuse_uring_conn_destruct(struct fuse_conn *fc) +{ + struct fuse_ring *ring = fc->ring; + + if (ring == NULL) + return; + + fuse_uring_ring_destruct(ring); + + WRITE_ONCE(fc->ring, NULL); + kfree(ring); +} + +static inline struct fuse_ring_queue * +fuse_uring_get_queue(struct fuse_ring *ring, int qid) +{ + char *ptr = (char *)ring->queues; + + if (unlikely(qid > ring->nr_queues)) { + WARN_ON(1); + qid = 0; + } + + return (struct fuse_ring_queue *)(ptr + qid * ring->queue_size); +} + +#else /* CONFIG_FUSE_IO_URING */ + +struct fuse_ring; + +static inline void fuse_uring_conn_init(struct fuse_ring *ring, + struct fuse_conn *fc) +{ +} + +static inline void fuse_uring_conn_destruct(struct fuse_conn *fc) +{ +} + +#endif /* CONFIG_FUSE_IO_URING */ + +#endif /* _FS_FUSE_DEV_URING_I_H */ diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index 6c506f040d5f..e6289bafb788 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -7,6 +7,7 @@ #define _FS_FUSE_DEV_I_H #include <linux/types.h> +#include <linux/fs.h> /* Ordinary requests have even IDs, while interrupts IDs are odd */ #define FUSE_INT_REQ_BIT (1ULL << 0) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index f23919610313..d2b058ccb677 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -917,6 +917,11 @@ struct fuse_conn { /** IDR for backing files ids */ struct idr backing_files_map; #endif + +#if IS_ENABLED(CONFIG_FUSE_IO_URING) + /** uring connection information*/ + struct fuse_ring *ring; +#endif }; /* diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 99e44ea7d875..33a080b24d65 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -7,6 +7,7 @@ */ #include "fuse_i.h" +#include "dev_uring_i.h" #include <linux/pagemap.h> #include <linux/slab.h> @@ -947,6 +948,8 @@ static void delayed_release(struct rcu_head *p) { struct fuse_conn *fc = container_of(p, struct fuse_conn, rcu); + fuse_uring_conn_destruct(fc); + put_user_ns(fc->user_ns); fc->release(fc); } diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index d08b99d60f6f..0449640f2501 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -1079,12 +1079,79 @@ struct fuse_backing_map { uint64_t padding; }; +enum fuse_uring_ioctl_cmd { + /* not correctly initialized when set */ + FUSE_URING_IOCTL_CMD_INVALID = 0, + + /* Ioctl to prepare communucation with io-uring */ + FUSE_URING_IOCTL_CMD_RING_CFG = 1, + + /* Ring queue configuration ioctl */ + FUSE_URING_IOCTL_CMD_QUEUE_CFG = 2, +}; + +enum fuse_uring_cfg_flags { + /* server/daemon side requests numa awareness */ + FUSE_URING_WANT_NUMA = 1ul << 0, +}; + +struct fuse_uring_cfg { + /* struct flags */ + uint64_t flags; + + /* configuration command */ + uint8_t cmd; + + uint8_t padding[7]; + + union { + struct fuse_ring_config { + /* number of queues */ + uint32_t nr_queues; + + /* number of foreground entries per queue */ + uint32_t sync_queue_depth; + + /* number of background entries per queue */ + uint32_t async_queue_depth; + + /* argument (max data length) of a request */ + uint32_t req_arg_len; + + /* + * buffer size userspace allocated per request buffer + * from the mmaped queue buffer + */ + uint32_t user_req_buf_sz; + + /* ring config flags */ + uint32_t numa_aware:1; + } rconf; + + struct fuse_ring_queue_config { + /* mmaped buffser address */ + uint64_t uaddr; + + /* qid the command is for */ + uint32_t qid; + + /* /dev/fuse fd that initiated the mount. */ + uint32_t control_fd; + } qconf; + + /* space for future additions */ + uint8_t union_size[128]; + }; +}; + /* Device ioctls: */ #define FUSE_DEV_IOC_MAGIC 229 #define FUSE_DEV_IOC_CLONE _IOR(FUSE_DEV_IOC_MAGIC, 0, uint32_t) #define FUSE_DEV_IOC_BACKING_OPEN _IOW(FUSE_DEV_IOC_MAGIC, 1, \ struct fuse_backing_map) #define FUSE_DEV_IOC_BACKING_CLOSE _IOW(FUSE_DEV_IOC_MAGIC, 2, uint32_t) +#define FUSE_DEV_IOC_URING _IOR(FUSE_DEV_IOC_MAGIC, 3, \ + struct fuse_uring_cfg) struct fuse_lseek_in { uint64_t fh; @@ -1186,4 +1253,10 @@ struct fuse_supp_groups { uint32_t groups[]; }; +/** + * Size of the ring buffer header + */ +#define FUSE_RING_HEADER_BUF_SIZE 4096 +#define FUSE_RING_MIN_IN_OUT_ARG_SIZE 4096 + #endif /* _LINUX_FUSE_H */ -- 2.40.1