Implement ->queue_uring_cmd and ->poll_uring_cmd for leaner submission and
completion. Both operate on "struct io_uring_cmd" directly, without having
to jump through hoops involving a request and bio. This is currently enabled
only for small (single-segment), premapped (fixedbufs) IOs.

Signed-off-by: Kanchan Joshi <joshi.k@xxxxxxxxxxx>
Signed-off-by: Anuj Gupta <anuj20.g@xxxxxxxxxxx>
---
Two illustrative sketches (userspace submission of a fixed-buffer passthrough
read, and the command_id/genctr encoding relied on here) follow after the
patch.

 drivers/nvme/host/pci.c | 194 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 192 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 30d7a1a6eaab..ca0580c4e977 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -28,6 +28,8 @@
 #include <linux/io-64-nonatomic-hi-lo.h>
 #include <linux/sed-opal.h>
 #include <linux/pci-p2pdma.h>
+#include <linux/nvme_ioctl.h>
+#include <linux/io_uring.h>
 
 #include "trace.h"
 #include "nvme.h"
@@ -210,6 +212,7 @@ struct nvme_queue {
 	unsigned long *cmdid_bmp;
 	spinlock_t cmdid_lock;
 	struct nvme_iod *iod;
+	u8 reg_id;
 	/* only used for poll queues: */
 	spinlock_t cq_poll_lock ____cacheline_aligned_in_smp;
 	struct nvme_completion *cqes;
@@ -249,7 +252,11 @@ union nvme_descriptor {
  * to the actual struct scatterlist.
  */
 struct nvme_iod {
-	struct nvme_request req;
+	union {
+		struct nvme_request req;
+		/* for raw-queue */
+		struct io_uring_cmd *ioucmd;
+	};
 	struct nvme_command cmd;
 	bool aborted;
 	s8 nr_allocations;	/* PRP list pool allocations. 0 means small
@@ -1025,6 +1032,13 @@ static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq)
 	writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
 }
 
+static void nvme_pci_put_cmdid(struct nvme_queue *nvmeq, int id)
+{
+	spin_lock(&nvmeq->cmdid_lock);
+	clear_bit(id, nvmeq->cmdid_bmp);
+	spin_unlock(&nvmeq->cmdid_lock);
+}
+
 static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
 {
 	if (!nvmeq->qid)
@@ -1032,6 +1046,37 @@ static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
 	return nvmeq->dev->tagset.tags[nvmeq->qid - 1];
 }
 
+static inline struct nvme_uring_direct_pdu *nvme_uring_cmd_direct_pdu(
+		struct io_uring_cmd *ioucmd)
+{
+	return (struct nvme_uring_direct_pdu *)&ioucmd->pdu;
+}
+
+static inline void nvme_handle_cqe_rawq(struct nvme_queue *nvmeq,
+					struct nvme_completion *cqe,
+					__u16 command_id)
+{
+	struct nvme_dev *dev = nvmeq->dev;
+	u32 status = le16_to_cpu(cqe->status) >> 1;
+	u64 result = cqe->result.u64;
+	struct nvme_iod *iod;
+	int id, reg_id;
+
+	reg_id = nvme_genctr_from_cid(command_id);
+	/* we should not encounter completions from a past registration */
+	WARN_ON_ONCE(nvmeq->reg_id != reg_id);
+
+	id = nvme_tag_from_cid(command_id);
+	iod = &nvmeq->iod[id];
+
+	if (iod->dma_len)
+		dma_unmap_page(dev->dev, iod->first_dma, iod->dma_len,
+			       nvme_is_write(&iod->cmd) ?
+			       DMA_TO_DEVICE : DMA_FROM_DEVICE);
+
+	nvme_pci_put_cmdid(nvmeq, id);
+	io_uring_cmd_done(iod->ioucmd, status, result, IO_URING_F_UNLOCKED);
+}
+
 static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
 				   struct io_comp_batch *iob, u16 idx)
 {
@@ -1039,6 +1084,9 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
 	__u16 command_id = READ_ONCE(cqe->command_id);
 	struct request *req;
 
+	if (test_bit(NVMEQ_RAW, &nvmeq->flags))
+		return nvme_handle_cqe_rawq(nvmeq, cqe, command_id);
+
 	/*
 	 * AEN requests are special as they don't time out and can
 	 * survive any kind of queue freeze and often don't respond to
@@ -1151,6 +1199,25 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
 	return found;
 }
 
+static int nvme_poll_uring_cmd(struct io_uring_cmd *ioucmd, int qid,
+			       struct io_comp_batch *iob)
+{
+	struct nvme_uring_direct_pdu *pdu = nvme_uring_cmd_direct_pdu(ioucmd);
+	struct nvme_dev *dev = to_nvme_dev(pdu->ns->ctrl);
+	struct nvme_queue *nvmeq = &dev->queues[qid];
+	bool found;
+
+	if (!nvme_cqe_pending(nvmeq))
+		return 0;
+
+	spin_lock(&nvmeq->cq_poll_lock);
+	found = nvme_poll_cq(nvmeq, iob);
+	spin_unlock(&nvmeq->cq_poll_lock);
+
+	return found;
+}
+
 static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
 {
 	struct nvme_dev *dev = to_nvme_dev(ctrl);
@@ -1762,6 +1829,22 @@ static int nvme_pci_alloc_cmdid_bmp(struct nvme_queue *nvmeq)
 	return 0;
 }
 
+static int nvme_pci_get_cmdid(struct nvme_queue *nvmeq)
+{
+	int id = 0, size = nvmeq->q_depth - 1;
+
+	spin_lock(&nvmeq->cmdid_lock);
+	id = find_first_zero_bit(nvmeq->cmdid_bmp, size);
+	if (id >= size) {
+		id = -EBUSY;
+		goto unlock;
+	}
+	set_bit(id, nvmeq->cmdid_bmp);
+unlock:
+	spin_unlock(&nvmeq->cmdid_lock);
+	return id;
+}
+
 static int nvme_pci_alloc_iod_array(struct nvme_queue *nvmeq)
 {
 	if (!test_bit(NVMEQ_RAW, &nvmeq->flags))
@@ -1793,13 +1876,17 @@ static int nvme_pci_register_queue(void *data)
 	struct nvme_ns *ns = (struct nvme_ns *) data;
 	struct nvme_dev *dev = to_nvme_dev(ns->ctrl);
 	int qid, ret;
+	struct nvme_queue *nvmeq;
 
 	qid = nvme_pci_get_rawq(dev);
 	if (qid > 0) {
 		/* setup command-id bitmap and iod array */
-		ret = nvme_pci_setup_rawq(&dev->queues[qid]);
+		nvmeq = &dev->queues[qid];
+		ret = nvme_pci_setup_rawq(nvmeq);
 		if (ret < 0)
 			qid = ret;
+		else
+			nvmeq->reg_id++;
 	}
 	return qid;
 }
@@ -1812,6 +1899,107 @@ static int nvme_pci_unregister_queue(void *data, int qid)
 	return nvme_pci_put_rawq(dev, qid);
 }
 
+static int nvme_map_io_fb(struct request_queue *q, struct io_uring_cmd *ioucmd,
+			  struct nvme_dev *dev, struct nvme_iod *iod,
+			  unsigned long addr, unsigned int buf_len)
+{
+	struct nvme_command *cmnd = &iod->cmd;
+	int ret, rw = nvme_is_write(cmnd);
+	struct iov_iter iter;
+	size_t nr_iter, nr_segs;
+	struct bio_vec *bv;
+
+	if (!(ioucmd->flags & IORING_URING_CMD_FIXED))
+		return -EINVAL;
+
+	ret = io_uring_cmd_import_fixed(addr, buf_len, rw, &iter, ioucmd);
+	if (ret < 0)
+		return ret;
+	nr_iter = iov_iter_count(&iter);
+	nr_segs = iter.nr_segs;
+
+	/* device will do these checks anyway, so why do duplicate work? */
+	if (!nr_iter || (nr_iter >> SECTOR_SHIFT) > queue_max_hw_sectors(q))
+		goto err;
+	if (nr_segs > queue_max_segments(q))
+		goto err;
+	/* this will be attempted from the regular path instead */
+	if (nr_segs > 1)
+		goto err;
+
+	bv = (struct bio_vec *)&iter.bvec[0];
+	if (bv->bv_offset + bv->bv_len <= NVME_CTRL_PAGE_SIZE * 2)
+		return nvme_setup_prp_simple(dev, iod,
+					     (struct nvme_rw_command *)cmnd,
+					     bv, rw ?
+					     DMA_TO_DEVICE : DMA_FROM_DEVICE);
+err:
+	return -EINVAL;
+}
+
+static int nvme_alloc_cmd_from_q(struct nvme_queue *nvmeq, int *cmdid,
+				 struct nvme_iod **iod)
+{
+	int id;
+
+	id = nvme_pci_get_cmdid(nvmeq);
+	if (id < 0)
+		return id;
+	*cmdid = id | nvme_cid_install_genctr(nvmeq->reg_id);
+	*iod = &nvmeq->iod[id];
+	return 0;
+}
+
+static int nvme_pci_queue_ucmd(struct io_uring_cmd *ioucmd, int qid)
+{
+	struct nvme_uring_direct_pdu *pdu = nvme_uring_cmd_direct_pdu(ioucmd);
+	struct nvme_ns *ns = pdu->ns;
+	struct nvme_ctrl *ctrl = ns->ctrl;
+	struct nvme_dev *dev = to_nvme_dev(ctrl);
+	struct nvme_command *nvme_cmd;
+	struct nvme_uring_data d;
+	int ret, cmdid;
+	struct nvme_iod *iod;
+	struct nvme_queue *nvmeq = &dev->queues[qid];
+
+	ret = nvme_alloc_cmd_from_q(nvmeq, &cmdid, &iod);
+	if (ret)
+		return ret;
+
+	iod->ioucmd = ioucmd;
+	nvme_cmd = &iod->cmd;
+	ret = nvme_prep_cmd_from_ioucmd(ctrl, ns, ioucmd, nvme_cmd, &d);
+	if (ret)
+		goto out;
+
+	nvme_cmd->common.command_id = cmdid;
+	iod->aborted = false;
+	iod->nr_allocations = -1;
+	iod->sgt.nents = 0;
+	ret = nvme_map_io_fb(ns->queue, ioucmd, dev, iod, d.addr, d.data_len);
+	if (ret)
+		goto out;
+
+	/*
+	 * Since this nvmeq is exclusive to a single submitter (io_uring
+	 * follows a one-thread-per-ring model), we would ideally not need
+	 * the lock. But the io_uring SQE and the nvme SQE have different
+	 * lifetimes: an io_uring SQE can be reused right after submission,
+	 * while an nvme SQE must stay around until completion. If we run
+	 * out of SQ space, submission is deferred to the io_uring worker,
+	 * so the lock cannot be skipped. Not all is lost, though: with one
+	 * thread per ring and polled completion, contention is rare.
+	 */
+	spin_lock(&nvmeq->sq_lock);
+	nvme_sq_copy_cmd(nvmeq, &iod->cmd);
+	nvme_write_sq_db(nvmeq, true);
+	spin_unlock(&nvmeq->sq_lock);
+	return -EIOCBQUEUED;
+out:
+	nvme_pci_put_cmdid(nvmeq, nvme_tag_from_cid(cmdid));
+	return ret;
+}
+
 static const struct blk_mq_ops nvme_mq_admin_ops = {
 	.queue_rq	= nvme_queue_rq,
 	.complete	= nvme_pci_complete_rq,
@@ -1832,6 +2020,8 @@ static const struct blk_mq_ops nvme_mq_ops = {
 	.poll		= nvme_poll,
 	.register_queue	= nvme_pci_register_queue,
 	.unregister_queue = nvme_pci_unregister_queue,
+	.queue_uring_cmd = nvme_pci_queue_ucmd,
+	.poll_uring_cmd = nvme_poll_uring_cmd,
 };
 
 static void nvme_dev_remove_admin(struct nvme_dev *dev)
-- 
2.25.1
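
For reference, below is a minimal userspace sketch of the kind of IO this
fast path targets: a small, single-segment read on a premapped (registered)
buffer, submitted through the existing IORING_OP_URING_CMD / NVME_URING_CMD_IO
passthrough interface with big SQE/CQE rings. The raw-queue registration that
would steer such a command to ->queue_uring_cmd / ->poll_uring_cmd belongs to
other patches in this series and is not shown; the device path, nsid and the
512-byte LBA size are placeholder assumptions.

/* build: gcc -O2 nvme_fixedbuf_read.c -o nvme_fixedbuf_read -luring */
#include <liburing.h>
#include <linux/nvme_ioctl.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>

#define BUF_LEN		4096
#define LBA_SHIFT	9	/* assumes a 512-byte LBA format */

int main(void)
{
	struct io_uring_params p = {
		/* passthrough commands need the big SQE/CQE layout */
		.flags = IORING_SETUP_SQE128 | IORING_SETUP_CQE32,
	};
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct nvme_uring_cmd *cmd;
	struct iovec iov;
	void *buf;
	int fd, ret;

	fd = open("/dev/ng0n1", O_RDONLY);	/* NVMe generic char device */
	if (fd < 0 || posix_memalign(&buf, 4096, BUF_LEN))
		return 1;

	if (io_uring_queue_init_params(8, &ring, &p))
		return 1;

	/* premap the buffer: this is what enables the fixedbufs fast path */
	iov.iov_base = buf;
	iov.iov_len = BUF_LEN;
	if (io_uring_register_buffers(&ring, &iov, 1))
		return 1;

	sqe = io_uring_get_sqe(&ring);
	if (!sqe)
		return 1;
	memset(sqe, 0, sizeof(*sqe) + 64);	/* SQE128: clear the payload too */
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->fd = fd;
	sqe->cmd_op = NVME_URING_CMD_IO;
	sqe->uring_cmd_flags = IORING_URING_CMD_FIXED;	/* use registered buffer */
	sqe->buf_index = 0;				/* ...number 0 */

	/* the NVMe command itself lives in the big-SQE payload */
	cmd = (struct nvme_uring_cmd *)sqe->cmd;
	cmd->opcode = 0x02;				/* nvme_cmd_read */
	cmd->nsid = 1;
	cmd->addr = (__u64)(uintptr_t)buf;
	cmd->data_len = BUF_LEN;
	cmd->cdw10 = 0;					/* starting LBA, low 32 bits */
	cmd->cdw11 = 0;					/* starting LBA, high 32 bits */
	cmd->cdw12 = (BUF_LEN >> LBA_SHIFT) - 1;	/* 0-based LBA count */

	io_uring_submit(&ring);
	ret = io_uring_wait_cqe(&ring, &cqe);
	if (!ret)
		printf("cqe res %d\n", cqe->res);
	io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	return 0;
}

On mainline such a submission is serviced through the request/bio based
passthrough path; with a raw queue registered, the same single-segment,
fixed-buffer pattern is what the leaner path added above can handle.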
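The completion handler above relies on the command_id carrying both an iod
slot and a registration generation, recovered with nvme_tag_from_cid() and
nvme_genctr_from_cid(). Below is a self-contained sketch of that encoding,
assuming the upstream 12-bit tag / 4-bit generation split of the 16-bit
command_id; the constants and helper names here are illustrative, not lifted
from this patch.

#include <assert.h>
#include <stdint.h>

#define CID_TAG_BITS	12
#define CID_TAG_MASK	((1u << CID_TAG_BITS) - 1)	/* 0x0fff */
#define CID_GEN_MASK	0xfu

/* place the generation counter in the top 4 bits of the command_id */
static uint16_t cid_install_genctr(uint16_t gen)
{
	return (gen & CID_GEN_MASK) << CID_TAG_BITS;
}

static uint16_t cid_genctr(uint16_t cid)
{
	return cid >> CID_TAG_BITS;
}

static uint16_t cid_tag(uint16_t cid)
{
	return cid & CID_TAG_MASK;
}

int main(void)
{
	uint16_t reg_id = 3;		/* third (re)registration of the raw queue */
	uint16_t id = 42;		/* slot from the per-queue cmdid bitmap */
	uint16_t cid = id | cid_install_genctr(reg_id);

	assert(cid_tag(cid) == id);		/* completion recovers the iod slot */
	assert(cid_genctr(cid) == reg_id);	/* and can detect stale registrations */
	return 0;
}

A completion whose generation does not match the queue's current reg_id can
then be flagged as belonging to a stale registration, which is what the
WARN_ON_ONCE() in nvme_handle_cqe_rawq() checks.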