On Tue, Nov 16, 2021 at 08:38:07PM -0700, Jens Axboe wrote:
> This enables the block layer to send us a full plug list of requests
> that need submitting. The block layer guarantees that they all belong
> to the same queue, but we do have to check the hardware queue mapping
> for each request.
>
> If errors are encountered, leave them in the passed in list. Then the
> block layer will handle them individually.
>
> This is good for about a 4% improvement in peak performance, taking us
> from 9.6M to 10M IOPS/core.

The concept looks sensible, but the loop in nvme_queue_rqs is a complete
mess to follow.  What about something like this (untested) on top?

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 13722cc400c2c..555a7609580c7 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -509,21 +509,6 @@ static inline void nvme_copy_cmd(struct nvme_queue *nvmeq,
 	nvmeq->sq_tail = 0;
 }
 
-/**
- * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
- * @nvmeq: The queue to use
- * @cmd: The command to send
- * @write_sq: whether to write to the SQ doorbell
- */
-static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
-			    bool write_sq)
-{
-	spin_lock(&nvmeq->sq_lock);
-	nvme_copy_cmd(nvmeq, cmd);
-	nvme_write_sq_db(nvmeq, write_sq);
-	spin_unlock(&nvmeq->sq_lock);
-}
-
 static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
 {
 	struct nvme_queue *nvmeq = hctx->driver_data;
@@ -918,8 +903,7 @@ static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
 	return BLK_STS_OK;
 }
 
-static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct nvme_ns *ns,
-		struct request *req, struct nvme_command *cmnd)
+static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	blk_status_t ret;
@@ -928,18 +912,18 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct nvme_ns *ns,
 	iod->npages = -1;
 	iod->nents = 0;
 
-	ret = nvme_setup_cmd(ns, req);
+	ret = nvme_setup_cmd(req->q->queuedata, req);
 	if (ret)
 		return ret;
 
 	if (blk_rq_nr_phys_segments(req)) {
-		ret = nvme_map_data(dev, req, cmnd);
+		ret = nvme_map_data(dev, req, &iod->cmd);
 		if (ret)
 			goto out_free_cmd;
 	}
 
 	if (blk_integrity_rq(req)) {
-		ret = nvme_map_metadata(dev, req, cmnd);
+		ret = nvme_map_metadata(dev, req, &iod->cmd);
 		if (ret)
 			goto out_unmap_data;
 	}
@@ -959,7 +943,6 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct nvme_ns *ns,
 static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 			 const struct blk_mq_queue_data *bd)
 {
-	struct nvme_ns *ns = hctx->queue->queuedata;
 	struct nvme_queue *nvmeq = hctx->driver_data;
 	struct nvme_dev *dev = nvmeq->dev;
 	struct request *req = bd->rq;
@@ -976,12 +959,15 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (!nvme_check_ready(&dev->ctrl, req, true))
 		return nvme_fail_nonready_command(&dev->ctrl, req);
 
-	ret = nvme_prep_rq(dev, ns, req, &iod->cmd);
-	if (ret == BLK_STS_OK) {
-		nvme_submit_cmd(nvmeq, &iod->cmd, bd->last);
-		return BLK_STS_OK;
-	}
-	return ret;
+	ret = nvme_prep_rq(dev, req);
+	if (ret)
+		return ret;
+
+	spin_lock(&nvmeq->sq_lock);
+	nvme_copy_cmd(nvmeq, &iod->cmd);
+	nvme_write_sq_db(nvmeq, bd->last);
+	spin_unlock(&nvmeq->sq_lock);
+	return BLK_STS_OK;
 }
 
 static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct request **rqlist)
@@ -997,56 +983,47 @@ static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct request **rqlist)
 	spin_unlock(&nvmeq->sq_lock);
 }
 
-static void nvme_queue_rqs(struct request **rqlist)
+static bool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request *req)
 {
-	struct request *requeue_list = NULL, *req, *prev = NULL;
-	struct blk_mq_hw_ctx *hctx;
-	struct nvme_queue *nvmeq;
-	struct nvme_ns *ns;
-
-restart:
-	req = rq_list_peek(rqlist);
-	hctx = req->mq_hctx;
-	nvmeq = hctx->driver_data;
-	ns = hctx->queue->queuedata;
-
 	/*
 	 * We should not need to do this, but we're still using this to
 	 * ensure we can drain requests on a dying queue.
 	 */
 	if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
-		return;
+		return false;
+	if (unlikely(!nvme_check_ready(&nvmeq->dev->ctrl, req, true)))
+		return false;
 
-	rq_list_for_each(rqlist, req) {
-		struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-		blk_status_t ret;
+	req->mq_hctx->tags->rqs[req->tag] = req;
+	return nvme_prep_rq(nvmeq->dev, req) == BLK_STS_OK;
+}
 
-		if (unlikely(!nvme_check_ready(&nvmeq->dev->ctrl, req, true)))
-			goto requeue;
+static void nvme_queue_rqs(struct request **rqlist)
+{
+	struct request *req = rq_list_peek(rqlist), *prev = NULL;
+	struct request *requeue_list = NULL;
+
+	do {
+		struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+
+		if (!nvme_prep_rq_batch(nvmeq, req)) {
+			/* detach 'req' and add to remainder list */
+			if (prev)
+				prev->rq_next = req->rq_next;
+			rq_list_add(&requeue_list, req);
+		} else {
+			prev = req;
+		}
 
-		if (req->mq_hctx != hctx) {
+		req = rq_list_next(req);
+		if (!req || (prev && req->mq_hctx != prev->mq_hctx)) {
 			/* detach rest of list, and submit */
 			prev->rq_next = NULL;
 			nvme_submit_cmds(nvmeq, rqlist);
-			/* req now start of new list for this hw queue */
 			*rqlist = req;
-			goto restart;
-		}
-
-		hctx->tags->rqs[req->tag] = req;
-		ret = nvme_prep_rq(nvmeq->dev, ns, req, &iod->cmd);
-		if (ret == BLK_STS_OK) {
-			prev = req;
-			continue;
 		}
-requeue:
-		/* detach 'req' and add to remainder list */
-		if (prev)
-			prev->rq_next = req->rq_next;
-		rq_list_add(&requeue_list, req);
-	}
+	} while (req);
 
-	nvme_submit_cmds(nvmeq, rqlist);
 	*rqlist = requeue_list;
 }
 
@@ -1224,7 +1201,11 @@ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
 
 	c.common.opcode = nvme_admin_async_event;
 	c.common.command_id = NVME_AQ_BLK_MQ_DEPTH;
-	nvme_submit_cmd(nvmeq, &c, true);
+
+	spin_lock(&nvmeq->sq_lock);
+	nvme_copy_cmd(nvmeq, &c);
+	nvme_write_sq_db(nvmeq, true);
+	spin_unlock(&nvmeq->sq_lock);
 }
 
 static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
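
For anyone who wants to follow the list surgery in the reworked
nvme_queue_rqs() without the rest of the driver, here is a stand-alone
user-space sketch of the same idea.  It is purely illustrative: struct
toy_rq, toy_submit_batch() and main() are made-up stand-ins for struct
request, nvme_submit_cmds() and the block layer, and the pointer
bookkeeping is spelled out a bit more defensively than in the diff
above.  Consecutive requests that prep successfully and map to the same
hw queue go out as one batch; anything that fails prep is detached onto
a requeue list that is handed back through *rqlist for the caller to
handle individually.

/*
 * Toy user-space model of the batched submission flow -- not kernel
 * code.  struct toy_rq stands in for struct request, "hctx" for the
 * hardware queue mapping and "prep_ok" for the nvme_prep_rq_batch()
 * result.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct toy_rq {
	int id;
	int hctx;		/* hardware queue this request maps to */
	bool prep_ok;		/* would nvme_prep_rq_batch() succeed? */
	struct toy_rq *rq_next;
};

/* prepend to a singly linked list, like rq_list_add() */
static void toy_list_add(struct toy_rq **list, struct toy_rq *rq)
{
	rq->rq_next = *list;
	*list = rq;
}

/* stands in for nvme_submit_cmds(): one doorbell write per batch */
static void toy_submit_batch(struct toy_rq *batch)
{
	printf("submit batch for hctx %d:", batch->hctx);
	for (; batch; batch = batch->rq_next)
		printf(" rq%d", batch->id);
	printf("\n");
}

/* models the reworked nvme_queue_rqs() loop */
static void toy_queue_rqs(struct toy_rq **rqlist)
{
	struct toy_rq *req = *rqlist, *prev = NULL;
	struct toy_rq *requeue_list = NULL;

	do {
		struct toy_rq *next = req->rq_next;

		if (!req->prep_ok) {
			/* detach 'req' and add to the remainder list */
			if (prev)
				prev->rq_next = next;
			else
				*rqlist = next;
			toy_list_add(&requeue_list, req);
		} else {
			prev = req;
		}

		req = next;
		/* submit the batch when the hw queue changes or the list ends */
		if (prev && (!req || req->hctx != prev->hctx)) {
			prev->rq_next = NULL;
			toy_submit_batch(*rqlist);
			*rqlist = req;
			prev = NULL;
		}
	} while (req);

	/* failed requests go back to the caller for individual handling */
	*rqlist = requeue_list;
}

int main(void)
{
	struct toy_rq rqs[] = {
		{ .id = 0, .hctx = 1, .prep_ok = true },
		{ .id = 1, .hctx = 1, .prep_ok = false },
		{ .id = 2, .hctx = 1, .prep_ok = true },
		{ .id = 3, .hctx = 2, .prep_ok = true },
	};
	struct toy_rq *list = NULL;
	int i;

	/* build the plug list in submission order */
	for (i = 3; i >= 0; i--)
		toy_list_add(&list, &rqs[i]);

	toy_queue_rqs(&list);

	printf("left for the caller:");
	for (; list; list = list->rq_next)
		printf(" rq%d", list->id);
	printf("\n");
	return 0;
}

Built with a plain cc, this should submit {rq0, rq2} as one batch for
hctx 1 and rq3 as a batch for hctx 2, and hand rq1 back on the requeue
list -- the same split the driver loop is meant to produce.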