Re: [PATCH 5/5] block/mq-deadline: Remove zone locking

Damien Le Moal <damien.lemoal@xxxxxxxxxxxxxxxxxx> · Wed, 15 Jun 2022 08:40:08 +0900

On 6/15/22 02:49, Bart Van Assche wrote:
> Measurements have shown that limiting the queue depth to one has a
> significant negative performance impact on zoned UFS devices. Hence this
> patch that removes zone locking from the mq-deadline scheduler. This
> patch is based on the following assumptions:
> - Applications submit write requests to sequential write required zones
>   in order.
> - If such write requests get reordered by the software or hardware queue
>   mechanism, nr_hw_queues * nr_requests - 1 retries are sufficient to
>   reorder the write requests.

As mentioned in my previous reply for patch 2, AHCI will not behave like
that, at all. Retrying will be useless most of the time because the
adapter sends commands to the drive randomly from the set of commands that
are marked ready in the ready register. So no. I am opposed to
unconditionally removing zone write locking.

If the UFS LLD can deliver commands in order, then use a queue flag to say
"zone write locking not needed" to disable it for that device class. There
probably are some SAS HBAs that could benefit from this too, but I have
seen so many reordering bugs with these (e.g. requeue at tail of a write
that got a TSF) that I would not want to remove zone write locking for these.

And I also do not want to start getting 10 calls a day from customers
complaining about very bad write performance due to all these retries,
which are likely going to be slower in the end than writing at QD=1 per zone.

> - It happens infrequently that zoned write requests are reordered by the
>   block layer.

What makes you say that? It only takes 2 contexts trying to dispatch
commands to different queues. Or a write process being rescheduled to a
different CPU/queue.

> - Either no I/O scheduler is used or an I/O scheduler is used that
>   submits write requests per zone in LBA order.

And unfortunately, as the AHCI example shows, having the scheduler
dispatch requests in LBA order is not enough.

> 
> DD_BE_PRIO is selected for sequential writes to preserve the LBA order.

So if the application wanted the writes to have RT policy so that these
commands get the high priority bit set on SATA disks, that will not be
honored. No to that too.

> 
> See also commit 5700f69178e9 ("mq-deadline: Introduce zone locking
> support").
> 
> Cc: Damien Le Moal <damien.lemoal@xxxxxxx>
> Signed-off-by: Bart Van Assche <bvanassche@xxxxxxx>
> ---
>  block/mq-deadline.c | 74 ++++-----------------------------------------
>  1 file changed, 6 insertions(+), 68 deletions(-)
> 
> diff --git a/block/mq-deadline.c b/block/mq-deadline.c
> index 6ed602b2f80a..e168fc9a980a 100644
> --- a/block/mq-deadline.c
> +++ b/block/mq-deadline.c
> @@ -104,7 +104,6 @@ struct deadline_data {
>  	int prio_aging_expire;
>  
>  	spinlock_t lock;
> -	spinlock_t zone_lock;
>  };
>  
>  /* Maps an I/O priority class to a deadline scheduler priority. */
> @@ -285,30 +284,10 @@ static struct request *
>  deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
>  		      enum dd_data_dir data_dir)
>  {
> -	struct request *rq;
> -	unsigned long flags;
> -
>  	if (list_empty(&per_prio->fifo_list[data_dir]))
>  		return NULL;
>  
> -	rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next);
> -	if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q))
> -		return rq;
> -
> -	/*
> -	 * Look for a write request that can be dispatched, that is one with
> -	 * an unlocked target zone.
> -	 */
> -	spin_lock_irqsave(&dd->zone_lock, flags);
> -	list_for_each_entry(rq, &per_prio->fifo_list[DD_WRITE], queuelist) {
> -		if (blk_req_can_dispatch_to_zone(rq))
> -			goto out;
> -	}
> -	rq = NULL;
> -out:
> -	spin_unlock_irqrestore(&dd->zone_lock, flags);
> -
> -	return rq;
> +	return rq_entry_fifo(per_prio->fifo_list[data_dir].next);
>  }
>  
>  /*
> @@ -319,29 +298,7 @@ static struct request *
>  deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
>  		      enum dd_data_dir data_dir)
>  {
> -	struct request *rq;
> -	unsigned long flags;
> -
> -	rq = per_prio->next_rq[data_dir];
> -	if (!rq)
> -		return NULL;
> -
> -	if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q))
> -		return rq;
> -
> -	/*
> -	 * Look for a write request that can be dispatched, that is one with
> -	 * an unlocked target zone.
> -	 */
> -	spin_lock_irqsave(&dd->zone_lock, flags);
> -	while (rq) {
> -		if (blk_req_can_dispatch_to_zone(rq))
> -			break;
> -		rq = deadline_latter_request(rq);
> -	}
> -	spin_unlock_irqrestore(&dd->zone_lock, flags);
> -
> -	return rq;
> +	return per_prio->next_rq[data_dir];
>  }
>  
>  /*
> @@ -467,10 +424,6 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
>  	ioprio_class = dd_rq_ioclass(rq);
>  	prio = ioprio_class_to_prio[ioprio_class];
>  	dd->per_prio[prio].stats.dispatched++;
> -	/*
> -	 * If the request needs its target zone locked, do it.
> -	 */
> -	blk_req_zone_write_lock(rq);
>  	rq->rq_flags |= RQF_STARTED;
>  	return rq;
>  }
> @@ -640,7 +593,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
>  	dd->fifo_batch = fifo_batch;
>  	dd->prio_aging_expire = prio_aging_expire;
>  	spin_lock_init(&dd->lock);
> -	spin_lock_init(&dd->zone_lock);
>  
>  	q->elevator = eq;
>  	return 0;
> @@ -716,17 +668,13 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
>  	u8 ioprio_class = IOPRIO_PRIO_CLASS(ioprio);
>  	struct dd_per_prio *per_prio;
>  	enum dd_prio prio;
> +	bool seq_write = blk_rq_is_seq_write(rq);
>  	LIST_HEAD(free);
>  
>  	lockdep_assert_held(&dd->lock);
>  
> -	/*
> -	 * This may be a requeue of a write request that has locked its
> -	 * target zone. If it is the case, this releases the zone lock.
> -	 */
> -	blk_req_zone_write_unlock(rq);
> -
> -	prio = ioprio_class_to_prio[ioprio_class];
> +	prio = seq_write ? DD_BE_PRIO :
> +		ioprio_class_to_prio[ioprio_class];
>  	per_prio = &dd->per_prio[prio];
>  	if (!rq->elv.priv[0]) {
>  		per_prio->stats.inserted++;
> @@ -740,7 +688,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
>  
>  	trace_block_rq_insert(rq);
>  
> -	if (at_head) {
> +	if (at_head && !seq_write) {
>  		list_add(&rq->queuelist, &per_prio->dispatch);
>  		rq->fifo_time = jiffies;
>  	} else {
> @@ -819,16 +767,6 @@ static void dd_finish_request(struct request *rq)
>  		return;
>  
>  	atomic_inc(&per_prio->stats.completed);
> -
> -	if (blk_queue_is_zoned(q)) {
> -		unsigned long flags;
> -
> -		spin_lock_irqsave(&dd->zone_lock, flags);
> -		blk_req_zone_write_unlock(rq);
> -		if (!list_empty(&per_prio->fifo_list[DD_WRITE]))
> -			blk_mq_sched_mark_restart_hctx(rq->mq_hctx);
> -		spin_unlock_irqrestore(&dd->zone_lock, flags);
> -	}
>  }
>  
>  static bool dd_has_work_for_prio(struct dd_per_prio *per_prio)

-- 
Damien Le Moal
Western Digital Research