Re: [PATCH 12/15] scsi: initial blk-mq support

Sagi Grimberg <sagig@xxxxxxxxxxxxxxxxxx> · Thu, 06 Feb 2014 10:38:17 +0200

On 2/5/2014 2:41 PM, Christoph Hellwig wrote:
Add support for using the blk-mq code to submit requests to SCSI
drivers.  There is very little blk-mq specific code, but that's
partially because important functionality like partial completions
and request requeueing is still missing in blk-mq.  I hope to keep
most of the additions for these in the blk-mq core instead of the
SCSI layer, though.

Based on the earlier scsi-mq prototype by Nicholas Bellinger, although
not a whole lot of actual code is left.

Not-quite-signed-off-yet-by: Christoph Hellwig <hch@xxxxxx>
---
  drivers/scsi/scsi.c      |   36 ++++++-
  drivers/scsi/scsi_lib.c  |  244 ++++++++++++++++++++++++++++++++++++++++++++--
  drivers/scsi/scsi_priv.h |    2 +
  drivers/scsi/scsi_scan.c |    5 +-
  include/scsi/scsi_host.h |    3 +
  5 files changed, 278 insertions(+), 12 deletions(-)

diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index adb8bfb..cf5c110 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -44,6 +44,7 @@
  #include <linux/string.h>
  #include <linux/slab.h>
  #include <linux/blkdev.h>
+#include <linux/blk-mq.h>
  #include <linux/delay.h>
  #include <linux/init.h>
  #include <linux/completion.h>
@@ -688,6 +689,33 @@ int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
  	return 0;
  }
  
+static void scsi_softirq_done_remote(void *data)
+{
+	return scsi_softirq_done(data);
+}
+
+static void scsi_mq_done(struct request *req)
+{
+	int cpu;
+
+#if 0
+	if (!ctx->ipi_redirect)
+		return scsi_softirq_done(cmd);
+#endif
+
+	cpu = get_cpu();
+	if (cpu != req->cpu && cpu_online(req->cpu)) {
+		req->csd.func = scsi_softirq_done_remote;
+		req->csd.info = req;
+		req->csd.flags = 0;
+		__smp_call_function_single(req->cpu, &req->csd, 0);
+	} else {
+		scsi_softirq_done(req);
+	}
+
+	put_cpu();
+}
+
  /**
   * scsi_done - Invoke completion on finished SCSI command.
   * @cmd: The SCSI Command for which a low-level device driver (LLDD) gives
@@ -701,8 +729,14 @@ int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
   */
  static void scsi_done(struct scsi_cmnd *cmd)
  {
+	struct request *req = cmd->request;
+
  	trace_scsi_dispatch_cmd_done(cmd);
-	blk_complete_request(cmd->request);
+
+	if (req->mq_ctx)
+		scsi_mq_done(req);
+	else
+		blk_complete_request(req);
  }
  
  /**
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index e67950c..8dd8893 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -20,6 +20,7 @@
  #include <linux/delay.h>
  #include <linux/hardirq.h>
  #include <linux/scatterlist.h>
+#include <linux/blk-mq.h>
  
  #include <scsi/scsi.h>
  #include <scsi/scsi_cmnd.h>
@@ -554,6 +555,15 @@ static bool scsi_end_request(struct scsi_cmnd *cmd, int error, int bytes,
  	struct request *req = cmd->request;
  
  	/*
+	 * XXX: need to handle partial completions and retries here.
+	 */
+	if (req->mq_ctx) {
+		blk_mq_end_io(req, error);
+		put_device(&cmd->device->sdev_gendev);
+		return true;
+	}
+
+	/*
  	 * If there are blocks left over at the end, set up the command
  	 * to queue the remainder of them.
  	 */
@@ -1014,12 +1024,15 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb,
  {
  	int count;
  
-	/*
-	 * If sg table allocation fails, requeue request later.
-	 */
-	if (unlikely(scsi_alloc_sgtable(sdb, req->nr_phys_segments,
-					gfp_mask))) {
-		return BLKPREP_DEFER;
+	BUG_ON(req->nr_phys_segments > SCSI_MAX_SG_SEGMENTS);
+
+	if (!req->mq_ctx) {
+		/*
+		 * If sg table allocation fails, requeue request later.
+		 */
+		if (unlikely(scsi_alloc_sgtable(sdb, req->nr_phys_segments,
+						gfp_mask)))
+			return BLKPREP_DEFER;
  	}
  
  	req->buffer = NULL;
@@ -1075,9 +1088,11 @@ int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask)
  		BUG_ON(prot_sdb == NULL);
  		ivecs = blk_rq_count_integrity_sg(rq->q, rq->bio);
  
-		if (scsi_alloc_sgtable(prot_sdb, ivecs, gfp_mask)) {
-			error = BLKPREP_DEFER;
-			goto err_exit;
+		if (!rq->mq_ctx) {
+			if (scsi_alloc_sgtable(prot_sdb, ivecs, gfp_mask)) {
+				error = BLKPREP_DEFER;
+				goto err_exit;
+			}
  		}
  
  		count = blk_rq_map_integrity_sg(rq->q, rq->bio,
@@ -1505,7 +1520,7 @@ static void scsi_kill_request(struct request *req, struct request_queue *q)
  	blk_complete_request(req);
  }
  
-static void scsi_softirq_done(struct request *rq)
+void scsi_softirq_done(struct request *rq)
  {
  	struct scsi_cmnd *cmd = rq->special;
  	unsigned long wait_for = (cmd->allowed + 1) * rq->timeout;
@@ -1533,9 +1548,11 @@ static void scsi_softirq_done(struct request *rq)
  			scsi_finish_command(cmd);
  			break;
  		case NEEDS_RETRY:
+			WARN_ON(rq->mq_ctx);
  			scsi_queue_insert(cmd, SCSI_MLQUEUE_EH_RETRY);
  			break;
  		case ADD_TO_MLQUEUE:
+			WARN_ON(rq->mq_ctx);
  			scsi_queue_insert(cmd, SCSI_MLQUEUE_DEVICE_BUSY);
  			break;
  		default:
@@ -1668,6 +1685,120 @@ out_delay:
  		blk_delay_queue(q, SCSI_QUEUE_DELAY);
  }
  
+static int scsi_mq_prep_fn(struct request *req)
+{
+	struct scsi_cmnd *cmd = req->special;
+	int ret;
+
+	ret = scsi_prep_state_check(cmd->device, req);
+	if (ret != BLKPREP_OK)
+		goto out;
+
+	if (req->cmd_type == REQ_TYPE_FS)
+		ret = scsi_cmd_to_driver(cmd)->init_command(cmd);
+	else if (req->cmd_type == REQ_TYPE_BLOCK_PC)
+		ret = scsi_setup_blk_pc_cmnd(cmd->device, req);
+	else
+		ret = BLKPREP_KILL;
+
+out:
+	switch (ret) {
+	case BLKPREP_OK:
+		return 0;
+	case BLKPREP_DEFER:
+		return BLK_MQ_RQ_QUEUE_BUSY;
+	default:
+		req->errors = DID_NO_CONNECT << 16;
+		return BLK_MQ_RQ_QUEUE_ERROR;
+	}
+}
+
+static int scsi_mq_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
+{
+	struct request_queue *q = rq->q;
+	struct scsi_device *sdev = q->queuedata;
+	struct Scsi_Host *shost = sdev->host;
+	struct scsi_cmnd *cmd = rq->special;
+	unsigned char *sense_buf = cmd->sense_buffer;
+	struct scatterlist *sg;
+	int ret = BLK_MQ_RQ_QUEUE_BUSY;
+	int reason;
+
+	/*
+	 * blk-mq stores this in the mq_ctx, which can't be dereferenced by
+	 * drivers.  For now use the old per-request field, but there must be
+	 * a better way.
+	 */
+	rq->cpu = raw_smp_processor_id();
+
+	if (!get_device(&sdev->sdev_gendev))
+		goto out;
+
+	if (!scsi_dev_queue_ready(q, sdev))
+		goto out_put_device;
+	if (!scsi_target_queue_ready(shost, sdev))
+		goto out_dec_device_busy;
+	if (!scsi_host_queue_ready(q, shost, sdev))
+		goto out_dec_target_busy;
+
+	memset(sense_buf, 0, SCSI_SENSE_BUFFERSIZE);
+	memset(cmd, 0, sizeof(struct scsi_cmnd));
+
+	cmd->request = rq;
+	cmd->device = sdev;
+	cmd->sense_buffer = sense_buf;
+
+	cmd->tag = rq->tag;
+	cmd->cmnd = rq->cmd;
+	cmd->prot_op = SCSI_PROT_NORMAL;
+
+	sg = (void *)cmd + sizeof(struct scsi_cmnd) + shost->hostt->cmd_size;
+
+	if (rq->nr_phys_segments) {
+		cmd->sdb.table.sgl = sg;
+		cmd->sdb.table.nents = rq->nr_phys_segments;
+		sg_init_table(cmd->sdb.table.sgl, rq->nr_phys_segments);
+	}
+
+	if (scsi_host_get_prot(shost)) {
+		cmd->prot_sdb = (void *)sg +
+			shost->sg_tablesize * sizeof(struct scatterlist);
+		memset(cmd->prot_sdb, 0, sizeof(struct scsi_data_buffer));
+
+		cmd->prot_sdb->table.sgl =
+			(struct scatterlist *)(cmd->prot_sdb + 1);
+	}
+
+	ret = scsi_mq_prep_fn(rq);
+	if (ret)
+		goto out_dec_host_busy;
+
+	scsi_init_cmd_errh(cmd);
+
+	reason = scsi_dispatch_cmd(cmd);
+	if (reason) {
+		scsi_set_blocked(cmd, reason);
+		goto out_uninit;
+	}
+
+	return BLK_MQ_RQ_QUEUE_OK;
+
+out_uninit:
+	if (rq->cmd_type == REQ_TYPE_FS)
+		scsi_cmd_to_driver(cmd)->uninit_command(cmd);
+out_dec_host_busy:
+	atomic_dec(&shost->host_busy);
+out_dec_target_busy:
+	atomic_dec(&scsi_target(sdev)->target_busy);
+out_dec_device_busy:
+	atomic_dec(&sdev->device_busy);
+	/* XXX: delay queue if device_busy == 0 */
+out_put_device:
+	put_device(&sdev->sdev_gendev);
+out:
+	return ret;
+}
+
  u64 scsi_calculate_bounce_limit(struct Scsi_Host *shost)
  {
  	struct device *host_dev;
@@ -1754,6 +1885,99 @@ struct request_queue *scsi_alloc_queue(struct scsi_device *sdev)
  	return q;
  }
  
+static struct blk_mq_ops scsi_mq_ops = {
+	.queue_rq	= scsi_mq_queue_rq,
+	.map_queue	= blk_mq_map_queue,
+	.alloc_hctx	= blk_mq_alloc_single_hw_queue,
+	.free_hctx	= blk_mq_free_single_hw_queue,
+};
+
+struct request_queue *scsi_mq_alloc_queue(struct scsi_device *sdev)
+{
+	struct Scsi_Host *shost = sdev->host;
+	struct blk_mq_hw_ctx *hctx;
+	struct request_queue *q;
+	struct request *rq;
+	struct scsi_cmnd *cmd;
+	struct blk_mq_reg reg;
+	int i, j, sgl_size;
+
+	memset(&reg, 0, sizeof(reg));
+	reg.ops = &scsi_mq_ops;
+	reg.queue_depth = shost->cmd_per_lun;
+	if (!reg.queue_depth)
+		reg.queue_depth = 1;
+
+	/* XXX: what to do about chained S/G lists? */
+	if (shost->hostt->sg_tablesize > SCSI_MAX_SG_SEGMENTS)
+		shost->sg_tablesize = SCSI_MAX_SG_SEGMENTS;
+	sgl_size = shost->sg_tablesize * sizeof(struct scatterlist);
+
+	reg.cmd_size = sizeof(struct scsi_cmnd) +
+			sgl_size +
+			shost->hostt->cmd_size;
+	if (scsi_host_get_prot(shost))
+		reg.cmd_size += sizeof(struct scsi_data_buffer) + sgl_size;
+	reg.numa_node = NUMA_NO_NODE;
+	reg.nr_hw_queues = 1;

Hey Christoph,

I just started to look at mq on Nic's WIP branch. I have a pretty basic 
question.

Both you and Nic offer a single HW queue per sdev.
I'm wondering whether that should be the LLD's decision (if it chooses
to use multiple queues)?

I'm trying to understand how LLDs will fit in, in a way that lets them
exploit multi-queue and actually maintain multiple queues. SRP/iSER, for
example, maintain a single queue per connection (or per session in iSCSI).
Now, with multi-queue, all requests of that shost will eventually boil down
to posting on a single queue, which might move the bottleneck to the LLDs.

I noticed that the virtio_scsi implementation chooses a queue per command
based on the current processor id, without any explicit mapping (unless I
missed it).

I guess my question is: where do (or should) LLDs plug in to this mq scheme?

Thanks,
Sagi.

+	reg.flags = BLK_MQ_F_SHOULD_MERGE;
+
+	q = blk_mq_init_queue(&reg, sdev);
+	if (IS_ERR(q)) {
+		printk("blk_mq_init_queue failed\n");
+		return NULL;
+	}
+
+	blk_queue_prep_rq(q, scsi_prep_fn);
+	sdev->request_queue = q;
+	q->queuedata = sdev;
+
+	__scsi_init_queue(shost, q);
+
+	/*
+	 * XXX: figure out if we can get alignment right to allocate the sense
+	 * buffer with the other chunks of memory.
+	 *
+	 * If not we'll need to find a way to have the blk-mq core call us to
+	 * allocate/free commands so that we can properly clean up the
+	 * allocation instead of leaking it.
+	 */
+	queue_for_each_hw_ctx(q, hctx, i) {
+		for (j = 0; j < hctx->queue_depth; j++) {
+			rq = hctx->rqs[j];
+			cmd = rq->special;
+
+			cmd->sense_buffer = kzalloc_node(SCSI_SENSE_BUFFERSIZE,
+					   GFP_KERNEL, reg.numa_node);
+			if (!cmd->sense_buffer)
+				goto out_free_sense_buffers;
+		}
+	}
+
+	rq = q->flush_rq;
+	cmd = blk_mq_rq_to_pdu(rq);
+
+	cmd->sense_buffer = kzalloc_node(SCSI_SENSE_BUFFERSIZE,
+					   GFP_KERNEL, reg.numa_node);
+	if (!cmd->sense_buffer)
+		goto out_free_sense_buffers;
+
+	return q;
+
+out_free_sense_buffers:
+	queue_for_each_hw_ctx(q, hctx, i) {
+		for (j = 0; j < hctx->queue_depth; j++) {
+			rq = hctx->rqs[j];
+			cmd = rq->special;
+
+			kfree(cmd->sense_buffer);
+		}
+	}
+
+	blk_cleanup_queue(q);
+	return NULL;
+}
+
  /*
   * Function:    scsi_block_requests()
   *
diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h
index f079a59..712cec2 100644
--- a/drivers/scsi/scsi_priv.h
+++ b/drivers/scsi/scsi_priv.h
@@ -88,8 +88,10 @@ extern void scsi_next_command(struct scsi_cmnd *cmd);
  extern void scsi_io_completion(struct scsi_cmnd *, unsigned int);
  extern void scsi_run_host_queues(struct Scsi_Host *shost);
  extern struct request_queue *scsi_alloc_queue(struct scsi_device *sdev);
+extern struct request_queue *scsi_mq_alloc_queue(struct scsi_device *sdev);
  extern int scsi_init_queue(void);
  extern void scsi_exit_queue(void);
+extern void scsi_softirq_done(struct request *rq);
  struct request_queue;
  struct request;
  extern struct kmem_cache *scsi_sdb_cache;
diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
index 307a811..c807bc2 100644
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -277,7 +277,10 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
  	 */
  	sdev->borken = 1;
  
-	sdev->request_queue = scsi_alloc_queue(sdev);
+	if (shost->hostt->use_blk_mq)
+		sdev->request_queue = scsi_mq_alloc_queue(sdev);
+	else
+		sdev->request_queue = scsi_alloc_queue(sdev);
  	if (!sdev->request_queue) {
  		/* release fn is set up in scsi_sysfs_device_initialise, so
  		 * have to free and put manually here */
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index c4e4875..d2661cb 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -531,6 +531,9 @@ struct scsi_host_template {
  	 */
  	unsigned int cmd_size;
  	struct scsi_host_cmd_pool *cmd_pool;
+
+	/* temporary flag to use blk-mq I/O path */
+	bool use_blk_mq;
  };
  
  /*

--
To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html