On Tue, Nov 16, 2021 at 08:38:07PM -0700, Jens Axboe wrote:
> This enables the block layer to send us a full plug list of requests
> that need submitting. The block layer guarantees that they all belong
> to the same queue, but we do have to check the hardware queue mapping
> for each request.
>
> If errors are encountered, leave them in the passed in list. Then the
> block layer will handle them individually.
>
> This is good for about a 4% improvement in peak performance, taking us
> from 9.6M to 10M IOPS/core.

The concept looks sensible, but the loop in nvme_queue_rqs is a complete
mess to follow.  What about something like this (untested) on top?

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 13722cc400c2c..555a7609580c7 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -509,21 +509,6 @@ static inline void nvme_copy_cmd(struct nvme_queue *nvmeq,
 	nvmeq->sq_tail = 0;
 }
 
-/**
- * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
- * @nvmeq: The queue to use
- * @cmd: The command to send
- * @write_sq: whether to write to the SQ doorbell
- */
-static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
-			    bool write_sq)
-{
-	spin_lock(&nvmeq->sq_lock);
-	nvme_copy_cmd(nvmeq, cmd);
-	nvme_write_sq_db(nvmeq, write_sq);
-	spin_unlock(&nvmeq->sq_lock);
-}
-
 static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
 {
 	struct nvme_queue *nvmeq = hctx->driver_data;
@@ -918,8 +903,7 @@ static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
 	return BLK_STS_OK;
 }
 
-static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct nvme_ns *ns,
-		struct request *req, struct nvme_command *cmnd)
+static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	blk_status_t ret;
@@ -928,18 +912,18 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct nvme_ns *ns,
 	iod->npages = -1;
 	iod->nents = 0;
 
-	ret = nvme_setup_cmd(ns, req);
+	ret = nvme_setup_cmd(req->q->queuedata, req);
 	if (ret)
 		return ret;
 
 	if (blk_rq_nr_phys_segments(req)) {
-		ret = nvme_map_data(dev, req, cmnd);
+		ret = nvme_map_data(dev, req, &iod->cmd);
 		if (ret)
 			goto out_free_cmd;
 	}
 
 	if (blk_integrity_rq(req)) {
-		ret = nvme_map_metadata(dev, req, cmnd);
+		ret = nvme_map_metadata(dev, req, &iod->cmd);
 		if (ret)
 			goto out_unmap_data;
 	}
@@ -959,7 +943,6 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct nvme_ns *ns,
 static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 			 const struct blk_mq_queue_data *bd)
 {
-	struct nvme_ns *ns = hctx->queue->queuedata;
 	struct nvme_queue *nvmeq = hctx->driver_data;
 	struct nvme_dev *dev = nvmeq->dev;
 	struct request *req = bd->rq;
@@ -976,12 +959,15 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (!nvme_check_ready(&dev->ctrl, req, true))
 		return nvme_fail_nonready_command(&dev->ctrl, req);
 
-	ret = nvme_prep_rq(dev, ns, req, &iod->cmd);
-	if (ret == BLK_STS_OK) {
-		nvme_submit_cmd(nvmeq, &iod->cmd, bd->last);
-		return BLK_STS_OK;
-	}
-	return ret;
+	ret = nvme_prep_rq(dev, req);
+	if (ret)
+		return ret;
+
+	spin_lock(&nvmeq->sq_lock);
+	nvme_copy_cmd(nvmeq, &iod->cmd);
+	nvme_write_sq_db(nvmeq, bd->last);
+	spin_unlock(&nvmeq->sq_lock);
+	return BLK_STS_OK;
 }
 
 static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct request **rqlist)
@@ -997,56 +983,47 @@ static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct request **rqlist)
 	spin_unlock(&nvmeq->sq_lock);
 }
 
-static void nvme_queue_rqs(struct request **rqlist)
+static bool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request *req)
 {
-	struct request *requeue_list = NULL, *req, *prev = NULL;
-	struct blk_mq_hw_ctx *hctx;
-	struct nvme_queue *nvmeq;
-	struct nvme_ns *ns;
-
-restart:
-	req = rq_list_peek(rqlist);
-	hctx = req->mq_hctx;
-	nvmeq = hctx->driver_data;
-	ns = hctx->queue->queuedata;
-
 	/*
 	 * We should not need to do this, but we're still using this to
 	 * ensure we can drain requests on a dying queue.
 	 */
 	if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
-		return;
+		return false;
+	if (unlikely(!nvme_check_ready(&nvmeq->dev->ctrl, req, true)))
+		return false;
 
-	rq_list_for_each(rqlist, req) {
-		struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-		blk_status_t ret;
+	req->mq_hctx->tags->rqs[req->tag] = req;
+	return nvme_prep_rq(nvmeq->dev, req) == BLK_STS_OK;
+}
 
-		if (unlikely(!nvme_check_ready(&nvmeq->dev->ctrl, req, true)))
-			goto requeue;
+static void nvme_queue_rqs(struct request **rqlist)
+{
+	struct request *req = rq_list_peek(rqlist), *prev = NULL;
+	struct request *requeue_list = NULL;
+
+	do {
+		struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+
+		if (!nvme_prep_rq_batch(nvmeq, req)) {
+			/* detach 'req' and add to remainder list */
+			if (prev)
+				prev->rq_next = req->rq_next;
+			rq_list_add(&requeue_list, req);
+		} else {
+			prev = req;
+		}
 
-		if (req->mq_hctx != hctx) {
+		req = rq_list_next(req);
+		if (!req || (prev && req->mq_hctx != prev->mq_hctx)) {
 			/* detach rest of list, and submit */
 			prev->rq_next = NULL;
 			nvme_submit_cmds(nvmeq, rqlist);
-			/* req now start of new list for this hw queue */
 			*rqlist = req;
-			goto restart;
-		}
-
-		hctx->tags->rqs[req->tag] = req;
-		ret = nvme_prep_rq(nvmeq->dev, ns, req, &iod->cmd);
-		if (ret == BLK_STS_OK) {
-			prev = req;
-			continue;
 		}
-requeue:
-		/* detach 'req' and add to remainder list */
-		if (prev)
-			prev->rq_next = req->rq_next;
-		rq_list_add(&requeue_list, req);
-	}
+	} while (req);
 
-	nvme_submit_cmds(nvmeq, rqlist);
 	*rqlist = requeue_list;
 }
 
@@ -1224,7 +1201,11 @@ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
 
 	c.common.opcode = nvme_admin_async_event;
 	c.common.command_id = NVME_AQ_BLK_MQ_DEPTH;
-	nvme_submit_cmd(nvmeq, &c, true);
+
+	spin_lock(&nvmeq->sq_lock);
+	nvme_copy_cmd(nvmeq, &c);
+	nvme_write_sq_db(nvmeq, true);
+	spin_unlock(&nvmeq->sq_lock);
 }
 
 static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
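
For anyone who wants to follow the list surgery in the reworked
nvme_queue_rqs() without the rest of the driver, here is a stand-alone
user-space sketch of the same idea.  It is purely illustrative: struct
toy_rq, toy_submit_batch() and main() are made-up stand-ins for struct
request, nvme_submit_cmds() and the block layer, and the pointer
bookkeeping is spelled out a bit more defensively than in the diff
above.  Consecutive requests that prep successfully and map to the same
hw queue go out as one batch; anything that fails prep is detached onto
a requeue list that is handed back through *rqlist for the caller to
handle individually.

/*
 * Toy user-space model of the batched submission flow -- not kernel
 * code.  struct toy_rq stands in for struct request, "hctx" for the
 * hardware queue mapping and "prep_ok" for the nvme_prep_rq_batch()
 * result.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct toy_rq {
	int id;
	int hctx;		/* hardware queue this request maps to */
	bool prep_ok;		/* would nvme_prep_rq_batch() succeed? */
	struct toy_rq *rq_next;
};

/* prepend to a singly linked list, like rq_list_add() */
static void toy_list_add(struct toy_rq **list, struct toy_rq *rq)
{
	rq->rq_next = *list;
	*list = rq;
}

/* stands in for nvme_submit_cmds(): one doorbell write per batch */
static void toy_submit_batch(struct toy_rq *batch)
{
	printf("submit batch for hctx %d:", batch->hctx);
	for (; batch; batch = batch->rq_next)
		printf(" rq%d", batch->id);
	printf("\n");
}

/* models the reworked nvme_queue_rqs() loop */
static void toy_queue_rqs(struct toy_rq **rqlist)
{
	struct toy_rq *req = *rqlist, *prev = NULL;
	struct toy_rq *requeue_list = NULL;

	do {
		struct toy_rq *next = req->rq_next;

		if (!req->prep_ok) {
			/* detach 'req' and add to the remainder list */
			if (prev)
				prev->rq_next = next;
			else
				*rqlist = next;
			toy_list_add(&requeue_list, req);
		} else {
			prev = req;
		}

		req = next;
		/* submit the batch when the hw queue changes or the list ends */
		if (prev && (!req || req->hctx != prev->hctx)) {
			prev->rq_next = NULL;
			toy_submit_batch(*rqlist);
			*rqlist = req;
			prev = NULL;
		}
	} while (req);

	/* failed requests go back to the caller for individual handling */
	*rqlist = requeue_list;
}

int main(void)
{
	struct toy_rq rqs[] = {
		{ .id = 0, .hctx = 1, .prep_ok = true },
		{ .id = 1, .hctx = 1, .prep_ok = false },
		{ .id = 2, .hctx = 1, .prep_ok = true },
		{ .id = 3, .hctx = 2, .prep_ok = true },
	};
	struct toy_rq *list = NULL;
	int i;

	/* build the plug list in submission order */
	for (i = 3; i >= 0; i--)
		toy_list_add(&list, &rqs[i]);

	toy_queue_rqs(&list);

	printf("left for the caller:");
	for (; list; list = list->rq_next)
		printf(" rq%d", list->id);
	printf("\n");
	return 0;
}

Built with a plain cc, this should submit {rq0, rq2} as one batch for
hctx 1 and rq3 as a batch for hctx 2, and hand rq1 back on the requeue
list -- the same split the driver loop is meant to produce.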