scsi uses one global atomic variable to track queue depth for each LUN/request queue. This way doesn't scale well when there is lots of CPU cores and the disk is very fast. It has been observed that IOPS is affected a lot by tracking queue depth via sdev->device_busy in IO path. So replace the atomic variable sdev->device_busy with sbitmap for tracking scsi device queue depth. Test shows that IOPS difference is just ~1% compared with bypassing device queue depth on scsi_debug by applying patches [1]. Meantime IOPS is improved > 20% compared with linus tree. Follows test steps: 1) test machine(32 logical CPU cores) Thread(s) per core: 2 Core(s) per socket: 8 Socket(s): 2 NUMA node(s): 2 Model name: Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz 2) setup scsi_debug: modprobe scsi_debug virtual_gb=128 max_luns=1 submit_queues=32 delay=0 max_queue=256 3) fio script: fio --rw=randread --size=128G --direct=1 --ioengine=libaio --iodepth=2048 \ --numjobs=32 --bs=4k --group_reporting=1 --group_reporting=1 --runtime=60 \ --loops=10000 --name=job1 --filename=/dev/sdN [1] https://lore.kernel.org/linux-block/20200119071432.18558-6-ming.lei@xxxxxxxxxx/ Cc: Omar Sandoval <osandov@xxxxxx> Cc: Sathya Prakash <sathya.prakash@xxxxxxxxxxxx> Cc: Chaitra P B <chaitra.basappa@xxxxxxxxxxxx> Cc: Suganath Prabu Subramani <suganath-prabu.subramani@xxxxxxxxxxxx> Cc: Kashyap Desai <kashyap.desai@xxxxxxxxxxxx> Cc: Sumit Saxena <sumit.saxena@xxxxxxxxxxxx> Cc: Shivasharan S <shivasharan.srikanteshwara@xxxxxxxxxxxx> Cc: Ewan D. Milne <emilne@xxxxxxxxxx> Cc: Hannes Reinecke <hare@xxxxxxx> Cc: Bart Van Assche <bart.vanassche@xxxxxxx> Signed-off-by: Ming Lei <ming.lei@xxxxxxxxxx> --- drivers/scsi/scsi.c | 2 ++ drivers/scsi/scsi_lib.c | 37 ++++++++++++++++++------------------- drivers/scsi/scsi_priv.h | 1 + drivers/scsi/scsi_scan.c | 21 +++++++++++++++++++-- drivers/scsi/scsi_sysfs.c | 2 ++ include/scsi/scsi_cmnd.h | 2 ++ include/scsi/scsi_device.h | 5 +++-- 7 files changed, 47 insertions(+), 23 deletions(-) diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index 930e4803d888..5a97e0967e52 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -245,6 +245,8 @@ int scsi_change_queue_depth(struct scsi_device *sdev, int depth) if (sdev->request_queue) blk_set_queue_depth(sdev->request_queue, depth); + sbitmap_resize(&sdev->budget_map, sdev->queue_depth); + return sdev->queue_depth; } EXPORT_SYMBOL(scsi_change_queue_depth); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index a3474b418602..e7fbf3a9a6aa 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -354,7 +354,7 @@ void scsi_device_unbusy(struct scsi_device *sdev, struct scsi_cmnd *cmd) if (starget->can_queue > 0) atomic_dec(&starget->target_busy); - atomic_dec(&sdev->device_busy); + sbitmap_put(&sdev->budget_map, cmd->budget_token); } static void scsi_kick_queue(struct request_queue *q) @@ -1274,19 +1274,17 @@ scsi_prep_state_check(struct scsi_device *sdev, struct request *req) } /* - * scsi_dev_queue_ready: if we can send requests to sdev, return 1 else - * return 0. - * - * Called with the queue_lock held. + * scsi_dev_queue_ready: if we can send requests to sdev, assign one token + * and return the token else return -1. */ static inline int scsi_dev_queue_ready(struct request_queue *q, struct scsi_device *sdev) { - unsigned int busy; + int token; - busy = atomic_inc_return(&sdev->device_busy) - 1; + token = sbitmap_get(&sdev->budget_map); if (atomic_read(&sdev->device_blocked)) { - if (busy) + if (token >= 0) goto out_dec; /* @@ -1298,13 +1296,11 @@ static inline int scsi_dev_queue_ready(struct request_queue *q, "unblocking device at zero depth\n")); } - if (busy >= sdev->queue_depth) - goto out_dec; - - return 1; + return token; out_dec: - atomic_dec(&sdev->device_busy); - return 0; + if (token >= 0) + sbitmap_put(&sdev->budget_map, token); + return -1; } /* @@ -1624,16 +1620,17 @@ static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx, int budget_token) struct request_queue *q = hctx->queue; struct scsi_device *sdev = q->queuedata; - atomic_dec(&sdev->device_busy); + sbitmap_put(&sdev->budget_map, budget_token); } static int scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct scsi_device *sdev = q->queuedata; + int token = scsi_dev_queue_ready(q, sdev); - if (scsi_dev_queue_ready(q, sdev)) - return 0; + if (token >= 0) + return token; if (scsi_device_busy(sdev) == 0 && !scsi_device_blocked(sdev)) blk_mq_delay_run_hw_queue(hctx, SCSI_QUEUE_DELAY); @@ -1677,6 +1674,8 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(req); } + cmd->budget_token = bd->budget_token; + cmd->flags &= SCMD_PRESERVED_FLAGS; if (sdev->simple_tags) cmd->flags |= SCMD_TAGGED; @@ -1701,12 +1700,12 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, if (scsi_target(sdev)->can_queue > 0) atomic_dec(&scsi_target(sdev)->target_busy); out_put_budget: - scsi_mq_put_budget(hctx, 0); + scsi_mq_put_budget(hctx, bd->budget_token); switch (ret) { case BLK_STS_OK: break; case BLK_STS_RESOURCE: - if (atomic_read(&sdev->device_busy) || + if (sbitmap_any_bit_set(&sdev->budget_map) || scsi_device_blocked(sdev)) ret = BLK_STS_DEV_RESOURCE; break; diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h index 3bff9f7aa684..b934686aee32 100644 --- a/drivers/scsi/scsi_priv.h +++ b/drivers/scsi/scsi_priv.h @@ -5,6 +5,7 @@ #include <linux/device.h> #include <linux/async.h> #include <scsi/scsi_device.h> +#include <linux/sbitmap.h> struct request_queue; struct request; diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c index 058079f915f1..27b64c82d26b 100644 --- a/drivers/scsi/scsi_scan.c +++ b/drivers/scsi/scsi_scan.c @@ -215,6 +215,7 @@ static void scsi_unlock_floptical(struct scsi_device *sdev, static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget, u64 lun, void *hostdata) { + unsigned int depth; struct scsi_device *sdev; int display_failure_msg = 1, ret; struct Scsi_Host *shost = dev_to_shost(starget->dev.parent); @@ -277,8 +278,23 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget, WARN_ON_ONCE(!blk_get_queue(sdev->request_queue)); sdev->request_queue->queuedata = sdev; - scsi_change_queue_depth(sdev, sdev->host->cmd_per_lun ? - sdev->host->cmd_per_lun : 1); + depth = sdev->host->cmd_per_lun ?: 1; + + /* + * Use .can_queue as budget map's depth because we have to + * support adjusting queue depth from sysfs. Meantime use + * default device queue depth to figure out sbitmap shift + * since we use this queue depth most of times. + */ + if (sbitmap_init_node(&sdev->budget_map, sdev->host->can_queue, + sbitmap_calculate_shift(depth), + GFP_KERNEL, sdev->request_queue->node, + false, true)) { + put_device(&starget->dev); + kfree(sdev); + } + + scsi_change_queue_depth(sdev, depth); scsi_sysfs_device_initialize(sdev); @@ -980,6 +996,7 @@ static int scsi_add_lun(struct scsi_device *sdev, unsigned char *inq_result, scsi_attach_vpd(sdev); sdev->max_queue_depth = sdev->queue_depth; + WARN_ON_ONCE(sdev->max_queue_depth > sdev->budget_map.depth); sdev->sdev_bflags = *bflags; /* diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c index d6cb5a0a03f2..835566b805fb 100644 --- a/drivers/scsi/scsi_sysfs.c +++ b/drivers/scsi/scsi_sysfs.c @@ -466,6 +466,8 @@ static void scsi_device_dev_release_usercontext(struct work_struct *work) /* NULL queue means the device can't be used */ sdev->request_queue = NULL; + sbitmap_free(&sdev->budget_map); + mutex_lock(&sdev->inquiry_mutex); vpd_pg0 = rcu_replace_pointer(sdev->vpd_pg0, vpd_pg0, lockdep_is_held(&sdev->inquiry_mutex)); diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index a2849bb9cd19..e6f750f43889 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -76,6 +76,8 @@ struct scsi_cmnd { int eh_eflags; /* Used by error handlr */ + int budget_token; + /* * This is set to jiffies as it was when the command was first * allocated. It is used to time how long the command has diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 09caf6f2f528..cdc1f28ec57f 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -8,6 +8,7 @@ #include <linux/blkdev.h> #include <scsi/scsi.h> #include <linux/atomic.h> +#include <linux/sbitmap.h> struct device; struct request_queue; @@ -106,7 +107,7 @@ struct scsi_device { struct list_head siblings; /* list of all devices on this host */ struct list_head same_target_siblings; /* just the devices sharing same target id */ - atomic_t device_busy; /* commands actually active on LLDD */ + struct sbitmap budget_map; atomic_t device_blocked; /* Device returned QUEUE_FULL. */ spinlock_t list_lock; @@ -586,7 +587,7 @@ static inline int scsi_device_supports_vpd(struct scsi_device *sdev) static inline int scsi_device_busy(struct scsi_device *sdev) { - return atomic_read(&sdev->device_busy); + return sbitmap_weight(&sdev->budget_map); } #define MODULE_ALIAS_SCSI_DEVICE(type) \ -- 2.20.1