Re: [PATCH 4/4] scsi: core: don't limit per-LUN queue depth for SSD

Ming Lei <ming.lei@xxxxxxxxxx> · Fri, 22 Nov 2019 16:09:59 +0800

On Thu, Nov 21, 2019 at 04:45:48PM +0100, Hannes Reinecke wrote:
> On 11/21/19 1:53 AM, Ming Lei wrote:
> > On Wed, Nov 20, 2019 at 11:05:24AM +0100, Hannes Reinecke wrote:
> >> On 11/18/19 11:31 AM, Ming Lei wrote:
> >>> SCSI core uses the atomic variable of sdev->device_busy to track
> >>> in-flight IO requests dispatched to this scsi device. IO request may be
> >>> submitted from any CPU, so the cost for maintaining the shared atomic
> >>> counter can be very big on big NUMA machine with lots of CPU cores.
> >>>
> >>> sdev->queue_depth is usually used for two purposes: 1) improve IO merge;
> >>> 2) fair IO request scattered among all LUNs.
> >>>
> >>> blk-mq already provides fair request allocation among all active shared
> >>> request queues(LUNs), see hctx_may_queue().
> >>>
> >>> NVMe doesn't have such per-request-queue(namespace) queue depth, so it
> >>> is reasonable to ignore the limit for SCSI SSD too. Also IO merge won't
> >>> play big role for reaching top SSD performance.
> >>>
> >>> With this patch, big cost for tracking in-flight per-LUN requests via
> >>> atomic variable can be saved.
> >>>
> >>> Given QUEUE_FLAG_NONROT is read in IO path, we have to freeze queue
> >>> before changing this flag.
> >>>
> >>> Cc: Sathya Prakash <sathya.prakash@xxxxxxxxxxxx>
> >>> Cc: Chaitra P B <chaitra.basappa@xxxxxxxxxxxx>
> >>> Cc: Suganath Prabu Subramani <suganath-prabu.subramani@xxxxxxxxxxxx>
> >>> Cc: Kashyap Desai <kashyap.desai@xxxxxxxxxxxx>
> >>> Cc: Sumit Saxena <sumit.saxena@xxxxxxxxxxxx>
> >>> Cc: Shivasharan S <shivasharan.srikanteshwara@xxxxxxxxxxxx>
> >>> Cc: Ewan D. Milne <emilne@xxxxxxxxxx>
> >>> Cc: Christoph Hellwig <hch@xxxxxx>,
> >>> Cc: Hannes Reinecke <hare@xxxxxxx>
> >>> Cc: Bart Van Assche <bart.vanassche@xxxxxxx>
> >>> Signed-off-by: Ming Lei <ming.lei@xxxxxxxxxx>
> >>> ---
> >>>  block/blk-sysfs.c       | 14 +++++++++++++-
> >>>  drivers/scsi/scsi_lib.c | 24 ++++++++++++++++++------
> >>>  drivers/scsi/sd.c       |  4 ++++
> >>>  3 files changed, 35 insertions(+), 7 deletions(-)
> >>>
> >>> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> >>> index fca9b158f4a0..9cc0dd5f88a1 100644
> >>> --- a/block/blk-sysfs.c
> >>> +++ b/block/blk-sysfs.c
> >>> @@ -281,6 +281,18 @@ QUEUE_SYSFS_BIT_FNS(random, ADD_RANDOM, 0);
> >>>  QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0);
> >>>  #undef QUEUE_SYSFS_BIT_FNS
> >>>  
> >>> +static ssize_t queue_store_nonrot_freeze(struct request_queue *q,
> >>> +		const char *page, size_t count)
> >>> +{
> >>> +	size_t ret;
> >>> +
> >>> +	blk_mq_freeze_queue(q);
> >>> +	ret = queue_store_nonrot(q, page, count);
> >>> +	blk_mq_unfreeze_queue(q);
> >>> +
> >>> +	return ret;
> >>> +}
> >>> +
> >>>  static ssize_t queue_zoned_show(struct request_queue *q, char *page)
> >>>  {
> >>>  	switch (blk_queue_zoned_model(q)) {
> >>> @@ -642,7 +654,7 @@ static struct queue_sysfs_entry queue_write_zeroes_max_entry = {
> >>>  static struct queue_sysfs_entry queue_nonrot_entry = {
> >>>  	.attr = {.name = "rotational", .mode = 0644 },
> >>>  	.show = queue_show_nonrot,
> >>> -	.store = queue_store_nonrot,
> >>> +	.store = queue_store_nonrot_freeze,
> >>>  };
> >>>  
> >>>  static struct queue_sysfs_entry queue_zoned_entry = {
> >>> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> >>> index 62a86a82c38d..72655a049efd 100644
> >>> --- a/drivers/scsi/scsi_lib.c
> >>> +++ b/drivers/scsi/scsi_lib.c
> >>> @@ -354,7 +354,8 @@ void scsi_device_unbusy(struct scsi_device *sdev, struct scsi_cmnd *cmd)
> >>>  	if (starget->can_queue > 0)
> >>>  		atomic_dec(&starget->target_busy);
> >>>  
> >>> -	atomic_dec(&sdev->device_busy);
> >>> +	if (!blk_queue_nonrot(sdev->request_queue))
> >>> +		atomic_dec(&sdev->device_busy);
> >>>  }
> >>>  
> >>>  static void scsi_kick_queue(struct request_queue *q)
> >>> @@ -410,7 +411,8 @@ static void scsi_single_lun_run(struct scsi_device *current_sdev)
> >>>  
> >>>  static inline bool scsi_device_is_busy(struct scsi_device *sdev)
> >>>  {
> >>> -	if (atomic_read(&sdev->device_busy) >= sdev->queue_depth)
> >>> +	if (!blk_queue_nonrot(sdev->request_queue) &&
> >>> +			atomic_read(&sdev->device_busy) >= sdev->queue_depth)
> >>>  		return true;
> >>>  	if (atomic_read(&sdev->device_blocked) > 0)
> >>>  		return true;
> >>> @@ -1283,8 +1285,12 @@ static inline int scsi_dev_queue_ready(struct request_queue *q,
> >>>  				  struct scsi_device *sdev)
> >>>  {
> >>>  	unsigned int busy;
> >>> +	bool bypass = blk_queue_nonrot(sdev->request_queue);
> >>>  
> >>> -	busy = atomic_inc_return(&sdev->device_busy) - 1;
> >>> +	if (!bypass)
> >>> +		busy = atomic_inc_return(&sdev->device_busy) - 1;
> >>> +	else
> >>> +		busy = 0;
> >>>  	if (atomic_read(&sdev->device_blocked)) {
> >>>  		if (busy)
> >>>  			goto out_dec;
> >>> @@ -1298,12 +1304,16 @@ static inline int scsi_dev_queue_ready(struct request_queue *q,
> >>>  				   "unblocking device at zero depth\n"));
> >>>  	}
> >>>  
> >>> +	if (bypass)
> >>> +		return 1;
> >>> +
> >>>  	if (busy >= sdev->queue_depth)
> >>>  		goto out_dec;
> >>>  
> >>>  	return 1;
> >>>  out_dec:
> >>> -	atomic_dec(&sdev->device_busy);
> >>> +	if (!bypass)
> >>> +		atomic_dec(&sdev->device_busy);
> >>>  	return 0;
> >>>  }
> >>>  
> >>> @@ -1624,7 +1634,8 @@ static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx)
> >>>  	struct request_queue *q = hctx->queue;
> >>>  	struct scsi_device *sdev = q->queuedata;
> >>>  
> >>> -	atomic_dec(&sdev->device_busy);
> >>> +	if (!blk_queue_nonrot(sdev->request_queue))
> >>> +		atomic_dec(&sdev->device_busy);
> >>>  }
> >>>  
> >>>  static bool scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx)
> >>> @@ -1731,7 +1742,8 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
> >>>  	case BLK_STS_OK:
> >>>  		break;
> >>>  	case BLK_STS_RESOURCE:
> >>> -		if (atomic_read(&sdev->device_busy) ||
> >>> +		if ((!blk_queue_nonrot(sdev->request_queue) &&
> >>> +		     atomic_read(&sdev->device_busy)) ||
> >>>  		    scsi_device_blocked(sdev))
> >>>  			ret = BLK_STS_DEV_RESOURCE;
> >>>  		break;
> >>> diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
> >>> index 0744c34468e1..c3d47117700d 100644
> >>> --- a/drivers/scsi/sd.c
> >>> +++ b/drivers/scsi/sd.c
> >>> @@ -2927,7 +2927,9 @@ static void sd_read_block_characteristics(struct scsi_disk *sdkp)
> >>>  	rot = get_unaligned_be16(&buffer[4]);
> >>>  
> >>>  	if (rot == 1) {
> >>> +		blk_mq_freeze_queue(q);
> >>>  		blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
> >>> +		blk_mq_unfreeze_queue(q);
> >>>  		blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q);
> >>>  	}
> >>>  
> >>> @@ -3123,7 +3125,9 @@ static int sd_revalidate_disk(struct gendisk *disk)
> >>>  		 * cause this to be updated correctly and any device which
> >>>  		 * doesn't support it should be treated as rotational.
> >>>  		 */
> >>> +		blk_mq_freeze_queue(q);
> >>>  		blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
> >>> +		blk_mq_unfreeze_queue(q);
> >>>  		blk_queue_flag_set(QUEUE_FLAG_ADD_RANDOM, q);
> >>>  
> >>>  		if (scsi_device_supports_vpd(sdp)) {
> >>>
> >> Hmm.
> >>
> >> I must admit I patently don't like this explicit dependency on
> >> blk_nonrot(). Having a conditional counter is just an open invitation to
> >> getting things wrong...
> > 
> > That won't be an issue given this patch freezes queue when the
> > NONROT queue flag is changed.
> > 
> That's not what I'm saying.
> 
> My issue is that we're turning the device_busy counter into a
> conditional counter, which only must be accessed when that condition is
> true.
> This not only puts a burden on the developer (as he has to remember to
> always check the condition), but also has possible issues when switching
> between modes.

The conditional check can be avoided only if we can kill device_busy
completely. However, as Martin commented, that isn't realistic so far.

> The latter you've addressed with ensuring that the queue must be frozen
> when modifying that flag, but the former is a conceptual problem.

It can't be perfect, but it can be good by the latter.

Someone said 'Perfect is the enemy of the good', :-)

> 
> And that was what I had been referring to.
> 
> >>
> >> I would far prefer if we could delegate any queueing decision to the
> >> elevators, and completely drop the device_busy flag for all devices.
> > 
> > If you drop it, you may create big sequential IO performance drop
> > on HDD., that is why this patch only bypasses sdev->queue_depth on
> > SSD. NVMe bypasses it because no one uses HDD. via NVMe.
> > 
> I still wonder how much performance drop we actually see; what seems to
> happen is that device_busy just arbitrary pushes back to the block
> layer, giving it more time to do merging.
> I do think we can do better then that...

For example, running the following script[1] on 4-core VM:

------------------------------------------
                    | QD:255    | QD: 32  | 
------------------------------------------
fio read throughput | 825MB/s   | 1432MB/s|   
------------------------------------------

note:

QD: scsi_device's queue depth, which can be passed to test.sh.
scsi debug's device queue depth is 255 at default, which is >= .can_queue

[1] test.sh

#!/bin/bash
QD=$1

modprobe scsi_debug ndelay=200000
sleep 2
DEVN=`ls -d /sys/bus/pseudo/drivers/scsi_debug/adapter*/host*/target*/*/block/* | head -1 | xargs basename`
DEV="/dev/"$DEVN

echo $QD > /sys/block/$DEVN/device/queue_depth
SQD=`cat /sys/block/$DEVN/device/queue_depth`
echo "scsi device queue depth: $QD"

fio --readwrite=read --filename=$DEV --runtime=20s --time_based --group_reporting=1 \
    --ioengine=libaio --direct=1 --iodepth=64 --bs=4k --numjobs=4 --name=seq_perf

Thanks,
Ming