This adds basic support for ring SQEs (with opcode=IORING_OP_URING_CMD).
For now only FUSE_URING_REQ_FETCH is handled, to register queue entries.

Signed-off-by: Bernd Schubert <bschubert@xxxxxxx>
---
A minimal (hypothetical) userspace sketch of how a daemon would submit the
FUSE_URING_REQ_FETCH SQEs introduced here is appended after the diff.

 fs/fuse/dev.c             |   1 +
 fs/fuse/dev_uring.c       | 267 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/fuse/dev_uring_i.h     |  12 +++
 include/uapi/linux/fuse.h |  33 ++++++
 4 files changed, 313 insertions(+)

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index cd5dc6ae9272..05a87731b5c3 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -2510,6 +2510,7 @@ const struct file_operations fuse_dev_operations = {
 	.compat_ioctl	= compat_ptr_ioctl,
 #if IS_ENABLED(CONFIG_FUSE_IO_URING)
 	.mmap		= fuse_uring_mmap,
+	.uring_cmd	= fuse_uring_cmd,
 #endif
 };
 EXPORT_SYMBOL_GPL(fuse_dev_operations);
diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
index 2c0ccb378908..48b1118b64f4 100644
--- a/fs/fuse/dev_uring.c
+++ b/fs/fuse/dev_uring.c
@@ -31,6 +31,27 @@
 #include <linux/topology.h>
 #include <linux/io_uring/cmd.h>
 
+static void fuse_ring_ring_ent_unset_userspace(struct fuse_ring_ent *ent)
+{
+	clear_bit(FRRS_USERSPACE, &ent->state);
+	list_del_init(&ent->list);
+}
+
+/* Update conn limits according to ring values */
+static void fuse_uring_conn_cfg_limits(struct fuse_ring *ring)
+{
+	struct fuse_conn *fc = ring->fc;
+
+	WRITE_ONCE(fc->max_pages, min_t(unsigned int, fc->max_pages,
+					ring->req_arg_len / PAGE_SIZE));
+
+	/* This is not ideal, as the multiplication with nr_queues assumes
+	 * the limit gets reached only when all queues are used, but a single
+	 * threaded application might already reach it.
+	 */
+	WRITE_ONCE(fc->max_background, ring->nr_queues * ring->max_nr_async);
+}
+
 /*
  * Basic ring setup for this connection based on the provided configuration
  */
@@ -329,3 +350,249 @@ int fuse_uring_queue_cfg(struct fuse_ring *ring,
 	return 0;
 }
 
+/*
+ * Put a ring request on hold; it is not used for now.
+ */
+static void fuse_uring_ent_avail(struct fuse_ring_ent *ring_ent,
+				 struct fuse_ring_queue *queue)
+	__must_hold(&queue->lock)
+{
+	struct fuse_ring *ring = queue->ring;
+
+	/* unsets all previous flags - basically resets */
+	pr_devel("%s ring=%p qid=%d tag=%d state=%lu async=%d\n", __func__,
+		 ring, ring_ent->queue->qid, ring_ent->tag, ring_ent->state,
+		 ring_ent->async);
+
+	if (WARN_ON(test_bit(FRRS_USERSPACE, &ring_ent->state))) {
+		pr_warn("%s qid=%d tag=%d state=%lu async=%d\n", __func__,
+			ring_ent->queue->qid, ring_ent->tag, ring_ent->state,
+			ring_ent->async);
+		return;
+	}
+
+	WARN_ON_ONCE(!list_empty(&ring_ent->list));
+
+	if (ring_ent->async)
+		list_add(&ring_ent->list, &queue->async_ent_avail_queue);
+	else
+		list_add(&ring_ent->list, &queue->sync_ent_avail_queue);
+
+	set_bit(FRRS_WAIT, &ring_ent->state);
+}
+
+/*
+ * FUSE_URING_REQ_FETCH command handling
+ */
+static int fuse_uring_fetch(struct fuse_ring_ent *ring_ent,
+			    struct io_uring_cmd *cmd, unsigned int issue_flags)
+__must_hold(ring_ent->queue->lock)
+{
+	struct fuse_ring_queue *queue = ring_ent->queue;
+	struct fuse_ring *ring = queue->ring;
+	int ret = 0;
+	int nr_ring_sqe;
+
+	/* register entries for foreground (sync) requests first, then background */
+	if (queue->nr_req_sync >= ring->max_nr_sync) {
+		queue->nr_req_async++;
+		ring_ent->async = 1;
+	} else
+		queue->nr_req_sync++;
+
+	fuse_uring_ent_avail(ring_ent, queue);
+
+	if (queue->nr_req_sync + queue->nr_req_async > ring->queue_depth) {
+		/* should have been caught by the earlier ring state and
+		 * queue depth checks
+		 */
+		WARN_ON(1);
+		pr_info("qid=%d tag=%d req cnt (fg=%d async=%d) exceeds depth=%zu\n",
+			queue->qid, ring_ent->tag, queue->nr_req_sync,
+			queue->nr_req_async, ring->queue_depth);
+		ret = -ERANGE;
+	}
+
+	if (ret)
+		goto out; /* erange */
+
+	WRITE_ONCE(ring_ent->cmd, cmd);
+
+	nr_ring_sqe = ring->queue_depth * ring->nr_queues;
+	if (atomic_inc_return(&ring->nr_sqe_init) == nr_ring_sqe) {
+		fuse_uring_conn_cfg_limits(ring);
+		ring->ready = 1;
+	}
+
+out:
+	return ret;
+}
+
+static struct fuse_ring_queue *
+fuse_uring_get_verify_queue(struct fuse_ring *ring,
+			    const struct fuse_uring_cmd_req *cmd_req,
+			    unsigned int issue_flags)
+{
+	struct fuse_conn *fc = ring->fc;
+	struct fuse_ring_queue *queue;
+	int ret;
+
+	if (!(issue_flags & IO_URING_F_SQE128)) {
+		pr_info("qid=%d tag=%d SQE128 not set\n", cmd_req->qid,
+			cmd_req->tag);
+		ret = -EINVAL;
+		goto err;
+	}
+
+	if (unlikely(!fc->connected)) {
+		ret = -ENOTCONN;
+		goto err;
+	}
+
+	if (unlikely(!ring->configured)) {
+		pr_info("command for a connection that is not ring configured\n");
+		ret = -ENODEV;
+		goto err;
+	}
+
+	if (unlikely(cmd_req->qid >= ring->nr_queues)) {
+		pr_devel("qid=%u >= nr-queues=%zu\n", cmd_req->qid,
+			 ring->nr_queues);
+		ret = -EINVAL;
+		goto err;
+	}
+
+	queue = fuse_uring_get_queue(ring, cmd_req->qid);
+	if (unlikely(queue == NULL)) {
+		pr_info("Got NULL queue for qid=%d\n", cmd_req->qid);
+		ret = -EIO;
+		goto err;
+	}
+
+	if (unlikely(!queue->configured || queue->stopped)) {
+		pr_info("Ring or queue (qid=%u) not ready\n", cmd_req->qid);
+		ret = -ENOTCONN;
+		goto err;
+	}
+
+	if (cmd_req->tag >= ring->queue_depth) {
+		pr_info("tag=%u >= queue-depth=%zu\n", cmd_req->tag,
+			ring->queue_depth);
+		ret = -EINVAL;
+		goto err;
+	}
+
+	return queue;
+
+err:
+	return ERR_PTR(ret);
+}
+
+/**
+ * Entry function from io_uring to handle the given passthrough command
+ * (op code IORING_OP_URING_CMD)
+ */
+int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+	const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe);
+	struct fuse_dev *fud = fuse_get_dev(cmd->file);
+	struct fuse_conn *fc = fud->fc;
+	struct fuse_ring *ring = fc->ring;
+	struct fuse_ring_queue *queue;
+	struct fuse_ring_ent *ring_ent = NULL;
+	u32 cmd_op = cmd->cmd_op;
+	int ret = 0;
+
+	if (!ring) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	queue = fuse_uring_get_verify_queue(ring, cmd_req, issue_flags);
+	if (IS_ERR(queue)) {
+		ret = PTR_ERR(queue);
+		goto out;
+	}
+
+	ring_ent = &queue->ring_ent[cmd_req->tag];
+
+	pr_devel("%s:%d received: cmd op %d qid %d (%p) tag %d (%p)\n",
+		 __func__, __LINE__, cmd_op, cmd_req->qid, queue, cmd_req->tag,
+		 ring_ent);
+
+	spin_lock(&queue->lock);
+	if (unlikely(queue->stopped)) {
+		/* XXX how to ensure the queue still exists? Add an rw
+		 * ring->stop lock and take it at the beginning of this
+		 * function? Better would be to advise io_uring not to call
+		 * this function at all, or to free the queue memory only
+		 * once the daemon is PF_EXITING?
+		 */
+		ret = -ENOTCONN;
+		goto err_unlock;
+	}
+
+	if (current == queue->server_task)
+		queue->uring_cmd_issue_flags = issue_flags;
+
+	switch (cmd_op) {
+	case FUSE_URING_REQ_FETCH:
+		if (queue->server_task == NULL) {
+			queue->server_task = current;
+			queue->uring_cmd_issue_flags = issue_flags;
+		}
+
+		/* No other bit must be set here */
+		if (ring_ent->state != BIT(FRRS_INIT)) {
+			pr_info_ratelimited(
+				"qid=%d tag=%d register req state %lu expected %lu\n",
+				cmd_req->qid, cmd_req->tag, ring_ent->state,
+				BIT(FRRS_INIT));
+			ret = -EINVAL;
+			goto err_unlock;
+		}
+
+		fuse_ring_ring_ent_unset_userspace(ring_ent);
+
+		ret = fuse_uring_fetch(ring_ent, cmd, issue_flags);
+		if (ret)
+			goto err_unlock;
+
+		/*
+		 * The ring entry is registered now and needs to be handled
+		 * for shutdown.
+		 */
+		atomic_inc(&ring->queue_refs);
+
+		spin_unlock(&queue->lock);
+		break;
+	default:
+		ret = -EINVAL;
+		pr_devel("Unknown uring command %d\n", cmd_op);
+		goto err_unlock;
+	}
+out:
+	pr_devel("uring cmd op=%d, qid=%d tag=%d ret=%d\n", cmd_op,
+		 cmd_req->qid, cmd_req->tag, ret);
+
+	if (ret < 0) {
+		if (ring_ent != NULL) {
+			pr_info_ratelimited("error: uring cmd op=%d, qid=%d tag=%d ret=%d\n",
+					    cmd_op, cmd_req->qid, cmd_req->tag,
+					    ret);
+
+			/* must not change the entry state, as userspace
+			 * might have sent random data, but valid requests
+			 * might be registered already - don't confuse those.
+			 */
+		}
+		io_uring_cmd_done(cmd, ret, 0, issue_flags);
+	}
+
+	return -EIOCBQUEUED;
+
+err_unlock:
+	spin_unlock(&queue->lock);
+	goto out;
+}
+
diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h
index 114e9c008013..b2be67bb2fa7 100644
--- a/fs/fuse/dev_uring_i.h
+++ b/fs/fuse/dev_uring_i.h
@@ -203,6 +203,7 @@ int fuse_uring_mmap(struct file *filp, struct vm_area_struct *vma);
 int fuse_uring_queue_cfg(struct fuse_ring *ring,
			 struct fuse_ring_queue_config *qcfg);
 void fuse_uring_ring_destruct(struct fuse_ring *ring);
+int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);
 
 static inline void fuse_uring_conn_init(struct fuse_ring *ring,
					 struct fuse_conn *fc)
@@ -269,6 +270,11 @@ static inline bool fuse_uring_configured(struct fuse_conn *fc)
 	return false;
 }
 
+static inline bool fuse_per_core_queue(struct fuse_conn *fc)
+{
+	return fc->ring && fc->ring->per_core_queue;
+}
+
 #else /* CONFIG_FUSE_IO_URING */
 
 struct fuse_ring;
@@ -287,6 +293,12 @@ static inline bool fuse_uring_configured(struct fuse_conn *fc)
 	return false;
 }
 
+static inline bool fuse_per_core_queue(struct fuse_conn *fc)
+{
+	return false;
+}
+
+
 #endif /* CONFIG_FUSE_IO_URING */
 
 #endif /* _FS_FUSE_DEV_URING_I_H */
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 88d4078c4171..379388c964a7 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -1262,6 +1262,12 @@ struct fuse_supp_groups {
 /* The offset parameter is used to identify the request type */
 #define FUSE_URING_MMAP_OFF 0xf8000000ULL
 
+/*
+ * The request is of background type. The daemon side is free to use this
+ * information to handle foreground/background CQEs with different
+ * priorities.
+ */
+#define FUSE_RING_REQ_FLAG_ASYNC (1ull << 0)
+
 /**
  * This structure mapped onto the
  */
@@ -1288,4 +1294,31 @@ struct fuse_ring_req {
 	char in_out_arg[];
 };
 
+/**
+ * SQE commands to the kernel
+ */
+enum fuse_uring_cmd {
+	FUSE_URING_REQ_INVALID = 0,
+
+	/* submit an SQE to the kernel to get a request */
+	FUSE_URING_REQ_FETCH = 1,
+
+	/* commit the result and fetch the next request */
+	FUSE_URING_REQ_COMMIT_AND_FETCH = 2,
+};
+
+/**
+ * In the 80B command area of the SQE.
+ */
+struct fuse_uring_cmd_req {
+	/* queue the command is for (queue index) */
+	uint16_t qid;
+
+	/* queue entry (array index) */
+	uint16_t tag;
+
+	/* request flags */
+	uint32_t flags;
+};
+
 #endif /* _LINUX_FUSE_H */
-- 
2.40.1
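
As announced above the diff, here is a minimal, hypothetical daemon-side
sketch (not part of this patch) of how a fuse server could register its
queue entries via FUSE_URING_REQ_FETCH, based on the uapi added above. It
assumes liburing, an io_uring instance created with IORING_SETUP_SQE128
(the only hard requirement the kernel code above enforces), and the updated
<linux/fuse.h>; fuse_uring_setup(), fuse_uring_register_queue() and
fuse_dev_fd are illustrative names, not part of this series.

	#include <errno.h>
	#include <string.h>
	#include <liburing.h>
	#include <linux/fuse.h>

	/* create the io_uring instance; 128B SQEs are mandatory for the
	 * 80B command area used by struct fuse_uring_cmd_req
	 */
	static int fuse_uring_setup(struct io_uring *ring,
				    unsigned int queue_depth)
	{
		struct io_uring_params p = { 0 };

		p.flags = IORING_SETUP_SQE128;
		return io_uring_queue_init_params(queue_depth, ring, &p);
	}

	/* register every (qid, tag) entry of one queue by submitting a
	 * FUSE_URING_REQ_FETCH passthrough SQE on the /dev/fuse fd
	 */
	static int fuse_uring_register_queue(struct io_uring *ring,
					     int fuse_dev_fd, unsigned int qid,
					     unsigned int queue_depth)
	{
		unsigned int tag;

		for (tag = 0; tag < queue_depth; tag++) {
			struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
			struct fuse_uring_cmd_req *req;

			if (!sqe)
				return -EBUSY;

			/* IORING_OP_URING_CMD against the fuse device fd */
			io_uring_prep_rw(IORING_OP_URING_CMD, sqe, fuse_dev_fd,
					 NULL, 0, 0);
			sqe->cmd_op = FUSE_URING_REQ_FETCH;

			/* payload lives in the SQE128 command area */
			req = (struct fuse_uring_cmd_req *)sqe->cmd;
			memset(req, 0, sizeof(*req));
			req->qid = qid;
			req->tag = tag;
		}

		/* fetch SQEs produce no CQE until a request is assigned */
		return io_uring_submit(ring);
	}

On success fuse_uring_cmd() returns -EIOCBQUEUED and only errors are
completed immediately via io_uring_cmd_done(); once queue_depth * nr_queues
entries are registered, fuse_uring_conn_cfg_limits() runs and the ring is
marked ready.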