This just adds a design document and data structures needed by later
commits to support kernel/userspace communication using the uring
IORING_OP_URING_CMD command.

Signed-off-by: Bernd Schubert <bschubert@xxxxxxx>
cc: Miklos Szeredi <miklos@xxxxxxxxxx>
cc: linux-fsdevel@xxxxxxxxxxxxxxx
cc: Amir Goldstein <amir73il@xxxxxxxxx>
cc: fuse-devel@xxxxxxxxxxxxxxxxxxxxx
---
 Documentation/filesystems/fuse-uring.rst | 179 +++++++++++++++++++++++
 include/uapi/linux/fuse.h                | 131 +++++++++++++++++
 2 files changed, 310 insertions(+)
 create mode 100644 Documentation/filesystems/fuse-uring.rst

diff --git a/Documentation/filesystems/fuse-uring.rst b/Documentation/filesystems/fuse-uring.rst
new file mode 100644
index 000000000000..088b97bbc289
--- /dev/null
+++ b/Documentation/filesystems/fuse-uring.rst
@@ -0,0 +1,179 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===============================
+FUSE Uring design documentation
+===============================
+
+This document covers the basic details of how the fuse
+kernel/userspace communication through uring is configured
+and how it works. For generic details about FUSE see fuse.rst.
+
+It also covers the current interface, which is still in
+development and might change.
+
+Limitations
+===========
+As of now not all request types are supported through uring; the
+userspace side is still required to handle requests through /dev/fuse
+after uring setup is complete. This especially concerns notifications
+(initiated from the daemon side), interrupts and forgets.
+Interrupts probably do not work at all when uring is used - at least
+the current state of libfuse is not able to handle them for requests
+on ring queues.
+All these limitations will be addressed later.
+
+Fuse uring configuration
+========================
+
+Fuse kernel requests are queued through the classical /dev/fuse
+read/write interface - until uring setup is complete.
+
+IOCTL configuration
+-------------------
+
+The userspace daemon side has to initiate ring configuration through
+the FUSE_DEV_IOC_URING ioctl, with cmd FUSE_URING_IOCTL_CMD_QUEUE_CFG.
+
+The number of queues can be
+ - 1
+   - One ring for all cores and all requests.
+ - Number of cores
+   - One ring per core; requests are queued on the ring queue of the
+     core that submits the request. Especially for background requests
+     we might consider using the queues of other cores as well -
+     future work.
+   - Kernel and userspace have to agree on the number of cores; on
+     mismatch the ioctl is rejected.
+ - For each queue a separate ioctl needs to be sent.
+
+Example:
+
+int fuse_uring_configure_kernel_queue(struct fuse_session *se)
+{
+	struct fuse_uring_cfg ioc_cfg = {
+		.cmd = FUSE_URING_IOCTL_CMD_QUEUE_CFG,
+		.qid = 2,
+		.nr_queues = 3,
+		.fg_queue_depth = 16,
+		.bg_queue_depth = 4,
+		.req_arg_len = 1024 * 1024,
+		.numa_node_id = 1,
+	};
+
+	return ioctl(se->fd, FUSE_DEV_IOC_URING, &ioc_cfg);
+}
+
+On the kernel side the first ioctl that arrives configures the basic
+fuse ring and then its queue id. All further ioctls configure only
+their own queue. Each queue gets a memory allocation that is then
+assigned per queue entry.
+
+MMAP
+====
+
+For shared memory communication the memory allocated per queue is
+mapped with mmap. The corresponding queue is identified with the
+offset parameter. A strict agreement between kernel and userspace
+daemon side on the memory assignment per queue entry is important -
+a mismatch would lead to data corruption.
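+
+For illustration, a daemon-side mapping could look like the sketch
+below. The encoding of the queue id in the offset parameter is an
+assumption of this example - the real convention has to match the
+kernel side exactly:
+
+/* Hypothetical helper: map the shared buffer of queue 'qid'.
+ * Assumes the mmap offset encodes the queue id as qid * queue_size.
+ */
+static void *fuse_uring_mmap_queue(int fuse_fd, unsigned int qid,
+				   size_t queue_size)
+{
+	off_t off = (off_t)qid * queue_size;
+	void *buf = mmap(NULL, queue_size, PROT_READ | PROT_WRITE,
+			 MAP_SHARED, fuse_fd, off);
+
+	return buf == MAP_FAILED ? NULL : buf;
+}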
+Ideal would be an mmap per ring entry and verification of the pointer
+on SQE submission, but the result obtained in the
+file_operations::mmap method is scrambled further down the stack -
+fuse kernel does not know the exact pointer value returned to the
+mmap call initiated by userspace.
+
+
+Kernel - userspace interface using uring
+========================================
+
+After queue ioctl setup and memory mapping, userspace submits SQEs
+(opcode = IORING_OP_URING_CMD) in order to fetch fuse requests. The
+initial submit uses the sub-command FUSE_URING_REQ_FETCH, which just
+registers entries as available on the kernel side - it sets the
+according entry state and marks the entry as available in the queue
+bitmap.
+
+Once all entries for all queues are submitted, the kernel side starts
+to enqueue to the ring queue(s). The request is copied into the shared
+memory queue entry buffer and submitted as a CQE to the userspace
+side.
+The userspace side handles the CQE and submits the result with the
+sub-command FUSE_URING_REQ_COMMIT_AND_FETCH - the kernel side then
+completes the request and also marks the queue entry as available
+again. If there are pending requests waiting, the next request is
+immediately submitted to userspace.
+
+Initial SQE
+-----------
+
+ |                                        |  FUSE filesystem daemon
+ |                                        |
+ |                                        |  >io_uring_submit()
+ |                                        |   IORING_OP_URING_CMD /
+ |                                        |   FUSE_URING_REQ_FETCH
+ |                                        |  [wait cqe]
+ |                                        |   >io_uring_wait_cqe() or
+ |                                        |   >io_uring_submit_and_wait()
+ |                                        |
+ |  >fuse_uring_cmd()                     |
+ |   >fuse_uring_fetch()                  |
+ |    >fuse_uring_ent_release()           |
+
+
+Sending requests with CQEs
+--------------------------
+
+ |                                        |  FUSE filesystem daemon
+ |                                        |  [waiting for CQEs]
+ |  "rm /mnt/fuse/file"                   |
+ |                                        |
+ |  >sys_unlink()                         |
+ |   >fuse_unlink()                       |
+ |     [allocate request]                 |
+ |     >__fuse_request_send()             |
+ |       ...                              |
+ |      >fuse_uring_queue_fuse_req()      |
+ |       [queue request on fg or          |
+ |        bg queue]                       |
+ |        >fuse_uring_assign_ring_entry() |
+ |        >fuse_uring_send_to_ring()      |
+ |         >fuse_uring_copy_to_ring()     |
+ |         >io_uring_cmd_done()           |
+ |         >request_wait_answer()         |
+ |          [sleep on req->waitq]         |
+ |                                        |  [receives and handles CQE]
+ |                                        |  [submit result and fetch next]
+ |                                        |  >io_uring_submit()
+ |                                        |   IORING_OP_URING_CMD /
+ |                                        |   FUSE_URING_REQ_COMMIT_AND_FETCH
+ |  >fuse_uring_cmd()                     |
+ |   >fuse_uring_commit_and_release()     |
+ |    >fuse_uring_copy_from_ring()        |
+ |     [copy the result to the fuse req]  |
+ |    >fuse_uring_req_end_and_get_next()  |
+ |     >fuse_request_end()                |
+ |      [wake up req->waitq]              |
+ |     >fuse_uring_ent_release_and_fetch()|
+ |      [wait or handle next req]         |
+ |                                        |
+ |                                        |
+ |  [req->waitq woken up]                 |
+ |  <fuse_unlink()                        |
+ | <sys_unlink()                          |
+
+
+Shutdown
+========
+
+A delayed workqueue is started when the ring gets configured with
+ioctls and runs periodically to complete ring entries on umount or
+daemon stop. See fuse_uring_stop_mon() and subfunctions for details -
+basically it needs to run io_uring_cmd_done() for waiting SQEs and
+fuse_request_end() for queue entries that have a fuse request
+assigned.
+
+In order to avoid periodic cpu cycles for shutdown, the userspace
+daemon can create a thread and put it into a waiting state with the
+FUSE_DEV_IOC_URING ioctl and the FUSE_URING_IOCTL_CMD_WAIT
+sub-command. The kernel side will stop the periodic waiter on
+receiving this ioctl and will go into a waitq. On umount or daemon
+termination it will wake up and start the delayed stop workqueue
+again before returning to userspace.
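+
+A minimal sketch of such a waiter thread is shown below; the thread
+setup and the fuse_session type are assumptions of this example:
+
+static void *fuse_uring_wait_thread(void *arg)
+{
+	struct fuse_session *se = arg;	/* hypothetical daemon state */
+	struct fuse_uring_cfg cfg = {
+		.cmd = FUSE_URING_IOCTL_CMD_WAIT,
+	};
+
+	/* Blocks in the kernel until umount or daemon termination;
+	 * the kernel restarts the delayed stop workqueue before this
+	 * returns.
+	 */
+	ioctl(se->fd, FUSE_DEV_IOC_URING, &cfg);
+
+	return NULL;
+}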
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index e3c54109bae9..0f59507b4b18 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -966,9 +966,64 @@ struct fuse_notify_retrieve_in {
 	uint64_t	dummy4;
 };
 
+
+enum fuse_uring_ioctl_cmd {
+	/* invalid, i.e. not correctly initialized */
+	FUSE_URING_IOCTL_CMD_INVALID = 0,
+
+	/* The ioctl is a queue configuration command */
+	FUSE_URING_IOCTL_CMD_QUEUE_CFG = 1,
+
+	/* Wait in the kernel until the process gets terminated; process
+	 * termination will wake up the waitq and initiate ring shutdown.
+	 * This avoids the need to check in intervals whether ring
+	 * termination should be started (fewer cpu cycles) and also helps
+	 * for faster ring shutdown.
+	 */
+	FUSE_URING_IOCTL_CMD_WAIT = 2,
+
+	/* Daemon side wants to explicitly stop the waiter thread. This will
+	 * restart the interval termination checker.
+	 */
+	FUSE_URING_IOCTL_CMD_STOP = 3,
+};
+
+struct fuse_uring_cfg {
+	/* currently unused */
+	uint32_t flags;
+
+	/* configuration command */
+	uint16_t cmd;
+
+	uint16_t padding;
+
+	/* qid the config command is for */
+	uint32_t qid;
+
+	/* number of queues */
+	uint32_t nr_queues;
+
+	/* number of foreground entries per queue */
+	uint32_t fg_queue_depth;
+
+	/* number of background entries per queue */
+	uint32_t bg_queue_depth;
+
+	/* argument (data length) of a request */
+	uint32_t req_arg_len;
+
+	/* numa node this queue runs on; UINT32_MAX if any */
+	uint32_t numa_node_id;
+
+	/* reserved space for future additions */
+	uint64_t reserve[8];
+};
+
 /* Device ioctls: */
 #define FUSE_DEV_IOC_MAGIC		229
 #define FUSE_DEV_IOC_CLONE	_IOR(FUSE_DEV_IOC_MAGIC, 0, uint32_t)
+#define FUSE_DEV_IOC_URING	_IOR(FUSE_DEV_IOC_MAGIC, 1, \
+				     struct fuse_uring_cfg)
 
 struct fuse_lseek_in {
 	uint64_t	fh;
@@ -1047,4 +1102,80 @@ struct fuse_secctx_header {
 	uint32_t	nr_secctx;
 };
 
+
+/**
+ * Size of the ring buffer header
+ */
+#define FUSE_RING_HEADER_BUF_SIZE 4096
+#define FUSE_RING_MIN_IN_OUT_ARG_SIZE 4096
+
+enum fuse_ring_req_cmd {
+	FUSE_RING_BUF_CMD_INVALID = 0,
+
+	/* return an iovec pointer */
+	FUSE_RING_BUF_CMD_IOVEC = 1,
+
+	/* report an error */
+	FUSE_RING_BUF_CMD_ERROR = 2,
+};
+
+/* Request is background type. Daemon side is free to use this information
+ * to handle foreground/background CQEs with different priorities.
+ */
+#define FUSE_RING_REQ_FLAG_BACKGROUND (1ull << 0)
+
+/**
+ * This structure is mapped onto the per queue-entry shared memory
+ * buffer.
+ */
+struct fuse_ring_req {
+
+	union {
+		/* The first 4K are command data */
+		char ring_header[FUSE_RING_HEADER_BUF_SIZE];
+
+		struct {
+			uint64_t flags;
+
+			/* enum fuse_ring_req_cmd */
+			uint32_t cmd;
+			uint32_t in_out_arg_len;
+
+			/* kernel fills in, reads out */
+			union {
+				struct fuse_in_header in;
+				struct fuse_out_header out;
+			};
+		};
+	};
+
+	char in_out_arg[];
+};
+
+/**
+ * sqe commands to the kernel
+ */
+enum fuse_uring_cmd {
+	FUSE_URING_REQ_INVALID = 0,
+
+	/* submit sqe to kernel to get a request */
+	FUSE_URING_REQ_FETCH = 1,
+
+	/* commit result and fetch next request */
+	FUSE_URING_REQ_COMMIT_AND_FETCH = 2,
+};
+
+/**
+ * Placed in the 80B command area of the SQE.
+ */
+struct fuse_uring_cmd_req {
+	/* queue the command is for (queue index) */
+	uint16_t qid;
+
+	/* queue entry (array index) */
+	uint16_t tag;
+
+	/* for alignment of the command area */
+	uint32_t padding;
+};
+
 #endif /* _LINUX_FUSE_H */
--
2.37.2
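
For illustration only - daemon-side submission of the initial FETCH
SQEs could look roughly like the sketch below. It is written against
liburing; the helper name is made up, the ring is assumed to be set
up with IORING_SETUP_SQE128 so that the SQE carries the 80B command
area, and the kernel side must agree on this layout:

#include <errno.h>
#include <string.h>
#include <liburing.h>
#include <linux/fuse.h>

/* Hypothetical helper: register one ring entry with the kernel by
 * submitting an IORING_OP_URING_CMD SQE with the FETCH sub-command.
 */
static int fuse_uring_submit_fetch(struct io_uring *ring, int fuse_fd,
				   uint16_t qid, uint16_t tag)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct fuse_uring_cmd_req *req;

	if (!sqe)
		return -EAGAIN;

	io_uring_prep_rw(IORING_OP_URING_CMD, sqe, fuse_fd, NULL, 0, 0);
	sqe->cmd_op = FUSE_URING_REQ_FETCH;

	/* struct fuse_uring_cmd_req lives in the SQE command area */
	req = (struct fuse_uring_cmd_req *)sqe->cmd;
	memset(req, 0, sizeof(*req));
	req->qid = qid;
	req->tag = tag;

	return io_uring_submit(ring);
}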