On 28/02/2020 09:33, Ming Lei wrote:
The scsi_host_template struct is quite big, and the following three
fields are needed in the SCSI IO path:
- queuecommand
- commit_rqs
- cmd_size
Would it have been nearly as good to reorganise the scsi_host_template
structure to ensure that these three fields are adjacent?
I say nearly, as caching them in the Scsi_Host also avoids the
shost->hostt dereference.
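Just to illustrate the idea - a minimal sketch of keeping the hot fields
adjacent at the front of the template (not the real header: the remaining
fields are omitted, the structs are only forward-declared, and u16 is
spelled as unsigned short to keep the sketch standalone):

struct Scsi_Host;
struct scsi_cmnd;

struct scsi_host_template {
	/* hot IO-path fields grouped so they can share a cacheline */
	int (*queuecommand)(struct Scsi_Host *, struct scsi_cmnd *);
	void (*commit_rqs)(struct Scsi_Host *, unsigned short hwq);
	unsigned int cmd_size;

	/* ... the remaining, rarely-touched template fields ... */
};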
Cache them in the scsi host instance, so that we can avoid fetching the
big scsi host template instance in the IO path.
A 40% IOPS boost can be observed in my scsi_debug performance test after
applying this change.
Cc: Sumanesh Samanta <sumanesh.samanta@xxxxxxxxxxxx>
Cc: Ewan D. Milne <emilne@xxxxxxxxxx>
Cc: Hannes Reinecke <hare@xxxxxxx>
Cc: Bart Van Assche <bart.vanassche@xxxxxxx>
Cc: Christoph Hellwig <hch@xxxxxx>
Signed-off-by: Ming Lei <ming.lei@xxxxxxxxxx>
---
drivers/scsi/hosts.c | 4 ++++
drivers/scsi/scsi_lib.c | 10 +++++-----
include/scsi/scsi_host.h | 11 ++++++++++-
3 files changed, 19 insertions(+), 6 deletions(-)
diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index 1d669e47b692..8012c1db092e 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -466,6 +466,10 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
if (sht->virt_boundary_mask)
shost->virt_boundary_mask = sht->virt_boundary_mask;
+ shost->cmd_size = sht->cmd_size;
+ shost->queuecommand = sht->queuecommand;
+ shost->commit_rqs = sht->commit_rqs;
+
device_initialize(&shost->shost_gendev);
dev_set_name(&shost->shost_gendev, "host%d", shost->host_no);
shost->shost_gendev.bus = &scsi_bus_type;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index e7fbf3a9a6aa..f4243ae1d4a9 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1148,7 +1148,7 @@ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd)
in_flight = test_bit(SCMD_STATE_INFLIGHT, &cmd->state);
/* zero out the cmd, except for the embedded scsi_request */
memset((char *)cmd + sizeof(cmd->req), 0,
- sizeof(*cmd) - sizeof(cmd->req) + dev->host->hostt->cmd_size);
+ sizeof(*cmd) - sizeof(cmd->req) + dev->host->cmd_size);
cmd->device = dev;
cmd->sense_buffer = buf;
@@ -1547,7 +1547,7 @@ static int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
}
trace_scsi_dispatch_cmd_start(cmd);
- rtn = host->hostt->queuecommand(host, cmd);
+ rtn = host->queuecommand(host, cmd);
if (rtn) {
trace_scsi_dispatch_cmd_error(cmd, rtn);
if (rtn != SCSI_MLQUEUE_DEVICE_BUSY &&
@@ -1584,7 +1584,7 @@ static blk_status_t scsi_mq_prep_fn(struct request *req)
cmd->tag = req->tag;
cmd->prot_op = SCSI_PROT_NORMAL;
- sg = (void *)cmd + sizeof(struct scsi_cmnd) + shost->hostt->cmd_size;
+ sg = (void *)cmd + sizeof(struct scsi_cmnd) + shost->cmd_size;
cmd->sdb.table.sgl = sg;
if (scsi_host_get_prot(shost)) {
@@ -1752,7 +1752,7 @@ static int scsi_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
if (scsi_host_get_prot(shost)) {
sg = (void *)cmd + sizeof(struct scsi_cmnd) +
- shost->hostt->cmd_size;
+ shost->cmd_size;
cmd->prot_sdb = (void *)sg + scsi_mq_inline_sgl_size(shost);
}
@@ -1844,7 +1844,7 @@ static void scsi_commit_rqs(struct blk_mq_hw_ctx *hctx)
struct scsi_device *sdev = q->queuedata;
struct Scsi_Host *shost = sdev->host;
- shost->hostt->commit_rqs(shost, hctx->queue_num);
+ shost->commit_rqs(shost, hctx->queue_num);
}
static const struct blk_mq_ops scsi_mq_ops = {
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index f577647bf5f2..ccd5b9a5de2a 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -522,7 +522,16 @@ struct Scsi_Host {
*/
struct list_head __devices;
struct list_head __targets;
-
+
+ /*
+ * Cache these three fields from scsi_host_template, so that
+ * the big host template instance doesn't need to be fetched
+ * in the IO path. A big IOPS boost can be observed this way.
+ */
+ unsigned int cmd_size;
Any reason not to add cacheline alignment? And I think the function
pointers will be at least the same size as unsigned int, so it may make
more sense to put cmd_size last - a rough sketch follows at the end of
this mail.
+ int (*queuecommand)(struct Scsi_Host *, struct scsi_cmnd *);
+ void (*commit_rqs)(struct Scsi_Host *, u16);
+
struct list_head starved_list;
spinlock_t default_lock;
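To be clear on what I meant above, a rough sketch of the ordering and
alignment suggestion (field names as in the patch; ____cacheline_aligned
is the usual helper from <linux/cache.h>; whether a dedicated cacheline
is worth the extra padding is an open question I have not measured):

struct Scsi_Host {
	/* ... earlier (cold) fields: __devices, __targets, ... */

	/*
	 * Fields cached from scsi_host_template for the IO path:
	 * function pointers first, the narrower cmd_size last, and
	 * the group aligned to start on its own cacheline.
	 */
	int (*queuecommand)(struct Scsi_Host *, struct scsi_cmnd *)
						____cacheline_aligned;
	void (*commit_rqs)(struct Scsi_Host *, u16);
	unsigned int cmd_size;

	struct list_head starved_list;
	spinlock_t default_lock;
	/* ... */
};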