On 11/18/19 11:31 AM, Ming Lei wrote:
> The SCSI core uses the atomic variable sdev->device_busy to track
> in-flight IO requests dispatched to a SCSI device. IO requests may be
> submitted from any CPU, so the cost of maintaining this shared atomic
> counter can be very high on a big NUMA machine with many CPU cores.
>
> sdev->queue_depth is usually used for two purposes: 1) improving IO
> merging; 2) fairly distributing IO requests among all LUNs.
>
> blk-mq already provides fair request allocation among all active shared
> request queues (LUNs), see hctx_may_queue().
>
> NVMe doesn't have such a per-request-queue (namespace) queue depth, so
> it is reasonable to ignore the limit for SCSI SSDs too. Also, IO merging
> doesn't play a big role in reaching top SSD performance.
>
> With this patch, the big cost of tracking in-flight per-LUN requests via
> an atomic variable is saved.
>
> Given QUEUE_FLAG_NONROT is read in the IO path, we have to freeze the
> queue before changing this flag.
>
> Cc: Sathya Prakash <sathya.prakash@xxxxxxxxxxxx>
> Cc: Chaitra P B <chaitra.basappa@xxxxxxxxxxxx>
> Cc: Suganath Prabu Subramani <suganath-prabu.subramani@xxxxxxxxxxxx>
> Cc: Kashyap Desai <kashyap.desai@xxxxxxxxxxxx>
> Cc: Sumit Saxena <sumit.saxena@xxxxxxxxxxxx>
> Cc: Shivasharan S <shivasharan.srikanteshwara@xxxxxxxxxxxx>
> Cc: Ewan D. Milne <emilne@xxxxxxxxxx>
> Cc: Christoph Hellwig <hch@xxxxxx>
> Cc: Hannes Reinecke <hare@xxxxxxx>
> Cc: Bart Van Assche <bart.vanassche@xxxxxxx>
> Signed-off-by: Ming Lei <ming.lei@xxxxxxxxxx>
> ---
>  block/blk-sysfs.c       | 14 +++++++++++++-
>  drivers/scsi/scsi_lib.c | 24 ++++++++++++++++++------
>  drivers/scsi/sd.c       |  4 ++++
>  3 files changed, 35 insertions(+), 7 deletions(-)
>
> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> index fca9b158f4a0..9cc0dd5f88a1 100644
> --- a/block/blk-sysfs.c
> +++ b/block/blk-sysfs.c
> @@ -281,6 +281,18 @@ QUEUE_SYSFS_BIT_FNS(random, ADD_RANDOM, 0);
>  QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0);
>  #undef QUEUE_SYSFS_BIT_FNS
>
> +static ssize_t queue_store_nonrot_freeze(struct request_queue *q,
> +		const char *page, size_t count)
> +{
> +	size_t ret;
> +
> +	blk_mq_freeze_queue(q);
> +	ret = queue_store_nonrot(q, page, count);
> +	blk_mq_unfreeze_queue(q);
> +
> +	return ret;
> +}
> +
>  static ssize_t queue_zoned_show(struct request_queue *q, char *page)
>  {
>  	switch (blk_queue_zoned_model(q)) {
> @@ -642,7 +654,7 @@ static struct queue_sysfs_entry queue_write_zeroes_max_entry = {
>  static struct queue_sysfs_entry queue_nonrot_entry = {
>  	.attr = {.name = "rotational", .mode = 0644 },
>  	.show = queue_show_nonrot,
> -	.store = queue_store_nonrot,
> +	.store = queue_store_nonrot_freeze,
>  };
>
>  static struct queue_sysfs_entry queue_zoned_entry = {
> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> index 62a86a82c38d..72655a049efd 100644
> --- a/drivers/scsi/scsi_lib.c
> +++ b/drivers/scsi/scsi_lib.c
> @@ -354,7 +354,8 @@ void scsi_device_unbusy(struct scsi_device *sdev, struct scsi_cmnd *cmd)
>  	if (starget->can_queue > 0)
>  		atomic_dec(&starget->target_busy);
>
> -	atomic_dec(&sdev->device_busy);
> +	if (!blk_queue_nonrot(sdev->request_queue))
> +		atomic_dec(&sdev->device_busy);
>  }
>
>  static void scsi_kick_queue(struct request_queue *q)
> @@ -410,7 +411,8 @@ static void scsi_single_lun_run(struct scsi_device *current_sdev)
>
>  static inline bool scsi_device_is_busy(struct scsi_device *sdev)
>  {
> -	if (atomic_read(&sdev->device_busy) >= sdev->queue_depth)
> +	if (!blk_queue_nonrot(sdev->request_queue) &&
> +			atomic_read(&sdev->device_busy) >= sdev->queue_depth)
>  		return true;
>  	if (atomic_read(&sdev->device_blocked) > 0)
>  		return true;
> @@ -1283,8 +1285,12 @@ static inline int scsi_dev_queue_ready(struct request_queue *q,
>  				  struct scsi_device *sdev)
>  {
>  	unsigned int busy;
> +	bool bypass = blk_queue_nonrot(sdev->request_queue);
>
> -	busy = atomic_inc_return(&sdev->device_busy) - 1;
> +	if (!bypass)
> +		busy = atomic_inc_return(&sdev->device_busy) - 1;
> +	else
> +		busy = 0;
>  	if (atomic_read(&sdev->device_blocked)) {
>  		if (busy)
>  			goto out_dec;
> @@ -1298,12 +1304,16 @@ static inline int scsi_dev_queue_ready(struct request_queue *q,
>  					   "unblocking device at zero depth\n"));
>  	}
>
> +	if (bypass)
> +		return 1;
> +
>  	if (busy >= sdev->queue_depth)
>  		goto out_dec;
>
>  	return 1;
>  out_dec:
> -	atomic_dec(&sdev->device_busy);
> +	if (!bypass)
> +		atomic_dec(&sdev->device_busy);
>  	return 0;
>  }
>
> @@ -1624,7 +1634,8 @@ static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx)
>  	struct request_queue *q = hctx->queue;
>  	struct scsi_device *sdev = q->queuedata;
>
> -	atomic_dec(&sdev->device_busy);
> +	if (!blk_queue_nonrot(sdev->request_queue))
> +		atomic_dec(&sdev->device_busy);
>  }
>
>  static bool scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx)
> @@ -1731,7 +1742,8 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
>  	case BLK_STS_OK:
>  		break;
>  	case BLK_STS_RESOURCE:
> -		if (atomic_read(&sdev->device_busy) ||
> +		if ((!blk_queue_nonrot(sdev->request_queue) &&
> +		     atomic_read(&sdev->device_busy)) ||
>  		    scsi_device_blocked(sdev))
>  			ret = BLK_STS_DEV_RESOURCE;
>  		break;
> diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
> index 0744c34468e1..c3d47117700d 100644
> --- a/drivers/scsi/sd.c
> +++ b/drivers/scsi/sd.c
> @@ -2927,7 +2927,9 @@ static void sd_read_block_characteristics(struct scsi_disk *sdkp)
>  	rot = get_unaligned_be16(&buffer[4]);
>
>  	if (rot == 1) {
> +		blk_mq_freeze_queue(q);
>  		blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
> +		blk_mq_unfreeze_queue(q);
>  		blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q);
>  	}
>
> @@ -3123,7 +3125,9 @@ static int sd_revalidate_disk(struct gendisk *disk)
>  	 * cause this to be updated correctly and any device which
>  	 * doesn't support it should be treated as rotational.
>  	 */
> +	blk_mq_freeze_queue(q);
>  	blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
> +	blk_mq_unfreeze_queue(q);
>  	blk_queue_flag_set(QUEUE_FLAG_ADD_RANDOM, q);
>
>  	if (scsi_device_supports_vpd(sdp)) {
>

Hmm. I must admit I patently don't like this explicit dependency on
blk_queue_nonrot(). Having a conditional counter is just an open
invitation to getting things wrong...

I would far prefer it if we could delegate any queueing decision to the
elevators and drop the device_busy counter completely, for all devices.
But I do see that this might not be easy to do, nor necessarily
something which others agree on.

So, alternatively, can't we make this a per-host flag instead of keying
it off a per-device characteristic? I'm just thinking of a storage
array with a massive cache; it's anyone's guess whether 'nonrot' is set
there, but at the same time these things can be fast, so they might
profit from dropping the device_busy counter there, too.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                    Teamlead Storage & Networking
hare@xxxxxxx                                     +49 911 74053 688
SUSE Software Solutions Germany GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 247165 (AG München), GF: Felix Imendörffer
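
As a rough sketch only of the per-host alternative raised above (not part of
the posted patch): the idea could be centralized in one helper, assuming a
hypothetical no_device_busy field added to struct Scsi_Host that a driver for
a fast host could set independently of the per-device 'rotational' attribute.

	/*
	 * Sketch only -- illustrates the per-host suggestion.
	 * "no_device_busy" is a hypothetical new field in struct Scsi_Host,
	 * not an existing flag; everything else is existing kernel API.
	 */
	static inline bool scsi_bypass_device_busy(struct scsi_device *sdev)
	{
		return sdev->host->no_device_busy ||
		       blk_queue_nonrot(sdev->request_queue);
	}

With something like this, scsi_device_unbusy(), scsi_device_is_busy(),
scsi_dev_queue_ready(), scsi_mq_put_budget() and scsi_queue_rq() would all
test scsi_bypass_device_busy(sdev) at the sites the patch currently guards
with blk_queue_nonrot(), keeping the conditional-counter decision in a single
place rather than scattered across the hot path.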