On 1/22/25 16:56, Luis Henriques wrote:
> On Mon, Jan 20 2025, Bernd Schubert wrote:
>
>> This adds basic support for ring SQEs (with opcode=IORING_OP_URING_CMD).
>> For now only FUSE_IO_URING_CMD_REGISTER is handled to register queue
>> entries.
>
> Three comments below.
>
>> Signed-off-by: Bernd Schubert <bschubert@xxxxxxx>
>> Reviewed-by: Pavel Begunkov <asml.silence@xxxxxxxxx> # io_uring
>> ---
>>  fs/fuse/Kconfig           |  12 ++
>>  fs/fuse/Makefile          |   1 +
>>  fs/fuse/dev_uring.c       | 326 ++++++++++++++++++++++++++++++++++++++++++++++
>>  fs/fuse/dev_uring_i.h     | 113 ++++++++++++++++
>>  fs/fuse/fuse_i.h          |   5 +
>>  fs/fuse/inode.c           |  10 ++
>>  include/uapi/linux/fuse.h |  76 ++++++++++-
>>  7 files changed, 542 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
>> index 8674dbfbe59dbf79c304c587b08ebba3cfe405be..ca215a3cba3e310d1359d069202193acdcdb172b 100644
>> --- a/fs/fuse/Kconfig
>> +++ b/fs/fuse/Kconfig
>> @@ -63,3 +63,15 @@ config FUSE_PASSTHROUGH
>>  	  to be performed directly on a backing file.
>>
>>  	  If you want to allow passthrough operations, answer Y.
>> +
>> +config FUSE_IO_URING
>> +	bool "FUSE communication over io-uring"
>> +	default y
>> +	depends on FUSE_FS
>> +	depends on IO_URING
>> +	help
>> +	  This allows sending FUSE requests over the io-uring interface and
>> +	  also adds request core affinity.
>> +
>> +	  If you want to allow fuse server/client communication through io-uring,
>> +	  answer Y
>> diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
>> index 2c372180d631eb340eca36f19ee2c2686de9714d..3f0f312a31c1cc200c0c91a086b30a8318e39d94 100644
>> --- a/fs/fuse/Makefile
>> +++ b/fs/fuse/Makefile
>> @@ -15,5 +15,6 @@ fuse-y += iomode.o
>>  fuse-$(CONFIG_FUSE_DAX) += dax.o
>>  fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o
>>  fuse-$(CONFIG_SYSCTL) += sysctl.o
>> +fuse-$(CONFIG_FUSE_IO_URING) += dev_uring.o
>>
>>  virtiofs-y := virtio_fs.o
>> diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..60e38ff1ecef3b007bae7ceedd7dd997439e463a
>> --- /dev/null
>> +++ b/fs/fuse/dev_uring.c
>> @@ -0,0 +1,326 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * FUSE: Filesystem in Userspace
>> + * Copyright (c) 2023-2024 DataDirect Networks.
>> + */
>> +
>> +#include "fuse_i.h"
>> +#include "dev_uring_i.h"
>> +#include "fuse_dev_i.h"
>> +
>> +#include <linux/fs.h>
>> +#include <linux/io_uring/cmd.h>
>> +
>> +static bool __read_mostly enable_uring;
>> +module_param(enable_uring, bool, 0644);
>
> Allowing to change 'enable_uring' at runtime will cause trouble.  I don't
> think the code gracefully handles changes to this parameter.  Maybe it
> should be a read-only parameter.  Another option would be to proxy it
> at the connection level, so that once enabled/disabled it will remain so
> for the lifetime of that connection.

Thank you, very good point. I'm adding a new patch that handles this. It
depends on
[PATCH v10 16/17] fuse: block request allocation until io-uring init is complete
so it becomes a separate patch on top. Folding it in here would also be
possible; I don't know what Miklos prefers, especially as this patch is
already in linux-next.
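To sketch the direction of that new patch (the member and helper names
below are made up for illustration, not necessarily what the patch will
use): read the parameter once while setting up the connection and only
ever test the cached copy afterwards, e.g.

struct fuse_conn {
        /* ... existing members ... */
        unsigned int io_uring:1;        /* snapshot of enable_uring */
};

static void fuse_conn_latch_enable_uring(struct fuse_conn *fc)
{
        /* READ_ONCE() as the 0644 parameter may change concurrently */
        fc->io_uring = READ_ONCE(enable_uring);
}

All later checks then test fc->io_uring instead of the global, so the
setting stays fixed for the lifetime of the connection no matter what
happens to the module parameter.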
>
>> +MODULE_PARM_DESC(enable_uring,
>> +		 "Enable userspace communication through io-uring");
>> +
>> +#define FUSE_URING_IOV_SEGS 2 /* header and payload */
>> +
>> +
>> +bool fuse_uring_enabled(void)
>> +{
>> +	return enable_uring;
>> +}
>> +
>>
>> +void fuse_uring_destruct(struct fuse_conn *fc)
>> +{
>> +	struct fuse_ring *ring = fc->ring;
>> +	int qid;
>> +
>> +	if (!ring)
>> +		return;
>> +
>> +	for (qid = 0; qid < ring->nr_queues; qid++) {
>> +		struct fuse_ring_queue *queue = ring->queues[qid];
>> +
>> +		if (!queue)
>> +			continue;
>> +
>> +		WARN_ON(!list_empty(&queue->ent_avail_queue));
>> +		WARN_ON(!list_empty(&queue->ent_commit_queue));
>> +
>> +		kfree(queue);
>> +		ring->queues[qid] = NULL;
>> +	}
>> +
>> +	kfree(ring->queues);
>> +	kfree(ring);
>> +	fc->ring = NULL;
>> +}
>> +
>> +/*
>> + * Basic ring setup for this connection based on the provided configuration
>> + */
>> +static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc)
>> +{
>> +	struct fuse_ring *ring;
>> +	size_t nr_queues = num_possible_cpus();
>> +	struct fuse_ring *res = NULL;
>> +	size_t max_payload_size;
>> +
>> +	ring = kzalloc(sizeof(*fc->ring), GFP_KERNEL_ACCOUNT);
>> +	if (!ring)
>> +		return NULL;
>> +
>> +	ring->queues = kcalloc(nr_queues, sizeof(struct fuse_ring_queue *),
>> +			       GFP_KERNEL_ACCOUNT);
>> +	if (!ring->queues)
>> +		goto out_err;
>> +
>> +	max_payload_size = max(FUSE_MIN_READ_BUFFER, fc->max_write);
>> +	max_payload_size = max(max_payload_size, fc->max_pages * PAGE_SIZE);
>> +
>> +	spin_lock(&fc->lock);
>> +	if (fc->ring) {
>> +		/* race, another thread created the ring in the meantime */
>> +		spin_unlock(&fc->lock);
>> +		res = fc->ring;
>> +		goto out_err;
>> +	}
>> +
>> +	fc->ring = ring;
>> +	ring->nr_queues = nr_queues;
>> +	ring->fc = fc;
>> +	ring->max_payload_sz = max_payload_size;
>> +
>> +	spin_unlock(&fc->lock);
>> +	return ring;
>> +
>> +out_err:
>> +	kfree(ring->queues);
>> +	kfree(ring);
>> +	return res;
>> +}
>> +
>> +static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring,
>> +						       int qid)
>> +{
>> +	struct fuse_conn *fc = ring->fc;
>> +	struct fuse_ring_queue *queue;
>> +
>> +	queue = kzalloc(sizeof(*queue), GFP_KERNEL_ACCOUNT);
>> +	if (!queue)
>> +		return NULL;
>> +	queue->qid = qid;
>> +	queue->ring = ring;
>> +	spin_lock_init(&queue->lock);
>> +
>> +	INIT_LIST_HEAD(&queue->ent_avail_queue);
>> +	INIT_LIST_HEAD(&queue->ent_commit_queue);
>> +
>> +	spin_lock(&fc->lock);
>> +	if (ring->queues[qid]) {
>> +		spin_unlock(&fc->lock);
>> +		kfree(queue);
>> +		return ring->queues[qid];
>> +	}
>> +
>> +	/*
>> +	 * write_once and lock as the caller mostly doesn't take the lock at all
>> +	 */
>> +	WRITE_ONCE(ring->queues[qid], queue);
>> +	spin_unlock(&fc->lock);
>> +
>> +	return queue;
>> +}
>> +
>> +/*
>> + * Make a ring entry available for fuse_req assignment
>> + */
>> +static void fuse_uring_ent_avail(struct fuse_ring_ent *ent,
>> +				 struct fuse_ring_queue *queue)
>> +{
>> +	WARN_ON_ONCE(!ent->cmd);
>> +	list_move(&ent->list, &queue->ent_avail_queue);
>> +	ent->state = FRRS_AVAILABLE;
>> +}
>> +
>> +/*
>> + * fuse_uring_req_fetch command handling
>> + */
>> +static void fuse_uring_do_register(struct fuse_ring_ent *ent,
>> +				   struct io_uring_cmd *cmd,
>> +				   unsigned int issue_flags)
>> +{
>> +	struct fuse_ring_queue *queue = ent->queue;
>> +
>> +	spin_lock(&queue->lock);
>> +	ent->cmd = cmd;
>> +	fuse_uring_ent_avail(ent, queue);
>> +	spin_unlock(&queue->lock);
>> +}
>> +
>> +/*
>> + * sqe->addr is a ptr to an iovec array, iov[0] has the headers, iov[1]
>> + * the payload
>> + */
>> +static int fuse_uring_get_iovec_from_sqe(const struct io_uring_sqe *sqe,
>> +					 struct iovec iov[FUSE_URING_IOV_SEGS])
>> +{
>> +	struct iovec __user *uiov = u64_to_user_ptr(READ_ONCE(sqe->addr));
>> +	struct iov_iter iter;
>> +	ssize_t ret;
>> +
>> +	if (sqe->len != FUSE_URING_IOV_SEGS)
>> +		return -EINVAL;
>> +
>> +	/*
>> +	 * Direction for buffer access will actually be READ and WRITE,
>> +	 * using write for the import should include READ access as well.
>> +	 */
>> +	ret = import_iovec(WRITE, uiov, FUSE_URING_IOV_SEGS,
>> +			   FUSE_URING_IOV_SEGS, &iov, &iter);
>> +	if (ret < 0)
>> +		return ret;
>> +
>> +	return 0;
>> +}
>> +
>> +static struct fuse_ring_ent *
>> +fuse_uring_create_ring_ent(struct io_uring_cmd *cmd,
>> +			   struct fuse_ring_queue *queue)
>> +{
>> +	struct fuse_ring *ring = queue->ring;
>> +	struct fuse_ring_ent *ent;
>> +	size_t payload_size;
>> +	struct iovec iov[FUSE_URING_IOV_SEGS];
>> +	int err;
>> +
>> +	err = fuse_uring_get_iovec_from_sqe(cmd->sqe, iov);
>> +	if (err) {
>> +		pr_info_ratelimited("Failed to get iovec from sqe, err=%d\n",
>> +				    err);
>> +		return ERR_PTR(err);
>> +	}
>> +
>> +	err = -EINVAL;
>> +	if (iov[0].iov_len < sizeof(struct fuse_uring_req_header)) {
>> +		pr_info_ratelimited("Invalid header len %zu\n", iov[0].iov_len);
>> +		return ERR_PTR(err);
>> +	}
>> +
>> +	payload_size = iov[1].iov_len;
>> +	if (payload_size < ring->max_payload_sz) {
>> +		pr_info_ratelimited("Invalid req payload len %zu\n",
>> +				    payload_size);
>> +		return ERR_PTR(err);
>> +	}
>> +
>> +	/*
>> +	 * The created queue above does not need to be destructed in
>> +	 * case of entry errors below, will be done at ring destruction time.
>> +	 */
>
> This comment should probably be moved down, into function
> fuse_uring_register() where this code was in earlier versions of the
> patchset.

Yeah, makes sense.

>
>> +	err = -ENOMEM;
>> +	ent = kzalloc(sizeof(*ent), GFP_KERNEL_ACCOUNT);
>> +	if (!ent)
>> +		return ERR_PTR(err);
>> +
>> +	INIT_LIST_HEAD(&ent->list);
>> +
>> +	ent->queue = queue;
>> +	ent->headers = iov[0].iov_base;
>> +	ent->payload = iov[1].iov_base;
>> +
>> +	return ent;
>> +}
>> +
>> +/*
>> + * Register header and payload buffer with the kernel and puts the
>> + * entry as "ready to get fuse requests" on the queue
>> + */
>> +static int fuse_uring_register(struct io_uring_cmd *cmd,
>> +			       unsigned int issue_flags, struct fuse_conn *fc)
>> +{
>> +	const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe);
>> +	struct fuse_ring *ring = fc->ring;
>> +	struct fuse_ring_queue *queue;
>> +	struct fuse_ring_ent *ent;
>> +	int err;
>> +	unsigned int qid = READ_ONCE(cmd_req->qid);
>> +
>> +	err = -ENOMEM;
>> +	if (!ring) {
>> +		ring = fuse_uring_create(fc);
>> +		if (!ring)
>> +			return err;
>> +	}
>> +
>> +	if (qid >= ring->nr_queues) {
>> +		pr_info_ratelimited("fuse: Invalid ring qid %u\n", qid);
>> +		return -EINVAL;
>> +	}
>> +
>> +	err = -ENOMEM;
>
> Not needed, 'err' is already set to this value.

Hmm yeah, the style of setting the error value before the check is very
inviting for that kind of leftover. I'll remove the redundant assignment.

Thanks for spotting and your review,
Bernd
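P.S.: In case it helps review of the register path, below is roughly the
userspace counterpart of FUSE_IO_URING_CMD_REGISTER. This is a simplified
sketch, not the actual libfuse code: error handling is omitted, the iovec
pair is assumed to be prepared by the caller with a payload buffer of at
least max_payload_sz, and the ring is assumed to be created with
IORING_SETUP_SQE128 so that struct fuse_uring_cmd_req fits into the SQE.

#include <string.h>
#include <liburing.h>
#include <linux/fuse.h>

/*
 * One header/payload buffer pair per ring entry, matching what
 * fuse_uring_get_iovec_from_sqe() expects on the kernel side:
 * sqe->addr points to an iovec array, iov[0] = headers,
 * iov[1] = payload, sqe->len = FUSE_URING_IOV_SEGS (2).
 */
static void prep_register_cmd(struct io_uring *ring, int fuse_dev_fd,
                              unsigned int qid, struct iovec iov[2])
{
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
        struct fuse_uring_cmd_req *req;

        /* a big SQE (IORING_SETUP_SQE128) spans two regular slots */
        memset(sqe, 0, 2 * sizeof(*sqe));

        sqe->opcode = IORING_OP_URING_CMD;
        sqe->fd = fuse_dev_fd;                  /* cloned /dev/fuse fd */
        sqe->cmd_op = FUSE_IO_URING_CMD_REGISTER;
        sqe->addr = (unsigned long)iov;         /* iovec array, see above */
        sqe->len = 2;                           /* FUSE_URING_IOV_SEGS */

        req = (struct fuse_uring_cmd_req *)sqe->cmd;
        req->qid = qid;                         /* one queue per CPU */
}

After io_uring_submit() the command does not complete immediately; it
stays pending in the kernel (on ent_avail_queue) until a fuse request
gets assigned to the entry.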