Re: [PATCH 4/5] block: drain file system I/O on del_gendisk

Ming Lei <ming.lei@xxxxxxxxxx> · Wed, 29 Sep 2021 16:17:01 +0800

On Wed, Sep 29, 2021 at 09:12:40AM +0200, Christoph Hellwig wrote:
> Instead of delaying draining of file system I/O related items like the
> blk-qos queues, the integrity read workqueue and timeouts only when the
> request_queue is removed, do that when del_gendisk is called.  This is
> important for SCSI where the upper level drivers that control the gendisk
> are separate entities, and the disk can be freed much earlier than the
> request_queue, or can even be unbound without tearing down the queue.
> 
> Fixes: edb0872f44ec ("block: move the bdi from the request_queue to the gendisk")
> Reported-by: Ming Lei <ming.lei@xxxxxxxxxx>
> Signed-off-by: Christoph Hellwig <hch@xxxxxx>
> Tested-by: Darrick J. Wong <djwong@xxxxxxxxxx>
> ---
>  block/blk-core.c      | 27 ++++++++++++---------------
>  block/blk.h           |  1 +
>  block/genhd.c         | 21 +++++++++++++++++++++
>  include/linux/genhd.h |  1 +
>  4 files changed, 35 insertions(+), 15 deletions(-)
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 43f5da707d8e3..4d8f5fe915887 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -49,7 +49,6 @@
>  #include "blk-mq.h"
>  #include "blk-mq-sched.h"
>  #include "blk-pm.h"
> -#include "blk-rq-qos.h"
>  
>  struct dentry *blk_debugfs_root;
>  
> @@ -337,23 +336,25 @@ void blk_put_queue(struct request_queue *q)
>  }
>  EXPORT_SYMBOL(blk_put_queue);
>  
> -void blk_set_queue_dying(struct request_queue *q)
> +void blk_queue_start_drain(struct request_queue *q)
>  {
> -	blk_queue_flag_set(QUEUE_FLAG_DYING, q);
> -
>  	/*
>  	 * When queue DYING flag is set, we need to block new req
>  	 * entering queue, so we call blk_freeze_queue_start() to
>  	 * prevent I/O from crossing blk_queue_enter().
>  	 */
>  	blk_freeze_queue_start(q);
> -
>  	if (queue_is_mq(q))
>  		blk_mq_wake_waiters(q);
> -
>  	/* Make blk_queue_enter() reexamine the DYING flag. */
>  	wake_up_all(&q->mq_freeze_wq);
>  }
> +
> +void blk_set_queue_dying(struct request_queue *q)
> +{
> +	blk_queue_flag_set(QUEUE_FLAG_DYING, q);
> +	blk_queue_start_drain(q);
> +}
>  EXPORT_SYMBOL_GPL(blk_set_queue_dying);
>  
>  /**
> @@ -385,13 +386,8 @@ void blk_cleanup_queue(struct request_queue *q)
>  	 */
>  	blk_freeze_queue(q);
>  
> -	rq_qos_exit(q);
> -
>  	blk_queue_flag_set(QUEUE_FLAG_DEAD, q);
>  
> -	/* for synchronous bio-based driver finish in-flight integrity i/o */
> -	blk_flush_integrity();
> -
>  	blk_sync_queue(q);
>  	if (queue_is_mq(q))
>  		blk_mq_exit_queue(q);
> @@ -474,11 +470,12 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
>  
>  static inline int bio_queue_enter(struct bio *bio)
>  {
> -	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
> +	struct gendisk *disk = bio->bi_bdev->bd_disk;
> +	struct request_queue *q = disk->queue;
>  
>  	while (!blk_try_enter_queue(q, false)) {
>  		if (bio->bi_opf & REQ_NOWAIT) {
> -			if (blk_queue_dying(q))
> +			if (test_bit(GD_DEAD, &disk->state))
>  				goto dead;
>  			bio_wouldblock_error(bio);
>  			return -EBUSY;
> @@ -495,8 +492,8 @@ static inline int bio_queue_enter(struct bio *bio)
>  		wait_event(q->mq_freeze_wq,
>  			   (!q->mq_freeze_depth &&
>  			    blk_pm_resume_queue(false, q)) ||
> -			   blk_queue_dying(q));
> -		if (blk_queue_dying(q))
> +			   test_bit(GD_DEAD, &disk->state));
> +		if (test_bit(GD_DEAD, &disk->state))
>  			goto dead;
>  	}
>  
> diff --git a/block/blk.h b/block/blk.h
> index 7d2a0ba7ed21d..e2ed2257709ae 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -51,6 +51,7 @@ struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
>  void blk_free_flush_queue(struct blk_flush_queue *q);
>  
>  void blk_freeze_queue(struct request_queue *q);
> +void blk_queue_start_drain(struct request_queue *q);
>  
>  #define BIO_INLINE_VECS 4
>  struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
> diff --git a/block/genhd.c b/block/genhd.c
> index 7b6e5e1cf9564..b3c33495d7208 100644
> --- a/block/genhd.c
> +++ b/block/genhd.c
> @@ -26,6 +26,7 @@
>  #include <linux/badblocks.h>
>  
>  #include "blk.h"
> +#include "blk-rq-qos.h"
>  
>  static struct kobject *block_depr;
>  
> @@ -559,6 +560,8 @@ EXPORT_SYMBOL(device_add_disk);
>   */
>  void del_gendisk(struct gendisk *disk)
>  {
> +	struct request_queue *q = disk->queue;
> +
>  	might_sleep();
>  
>  	if (WARN_ON_ONCE(!disk_live(disk) && !(disk->flags & GENHD_FL_HIDDEN)))
> @@ -575,8 +578,26 @@ void del_gendisk(struct gendisk *disk)
>  	fsync_bdev(disk->part0);
>  	__invalidate_device(disk->part0, true);
>  
> +	/*
> +	 * Fail any new I/O.
> +	 */
> +	set_bit(GD_DEAD, &disk->state);
>  	set_capacity(disk, 0);
>  
> +	/*
> +	 * Prevent new I/O from crossing bio_queue_enter().
> +	 */
> +	blk_queue_start_drain(q);
> +	blk_mq_freeze_queue_wait(q);

Draining request won't fix the problem completely:

1) blk-mq dispatch code may still be in-progress after q_usage_counter
becomes zero, see the story in 662156641bc4 ("block: don't drain in-progress dispatch in
blk_cleanup_queue()")

2) elevator code / blkcg code may still be called after blk_cleanup_queue(), such
as kyber, trace_kyber_latency()(q->disk is referred) is called in kyber's timer
handler, and the timer is deleted via del_timer_sync() via kyber_exit_sched()
from blk_release_queue().

> +
> +	rq_qos_exit(q);
> +	blk_sync_queue(q);
> +	blk_flush_integrity();
> +	/*
> +	 * Allow using passthrough request again after the queue is torn down.
> +	 */
> +	blk_mq_unfreeze_queue(q);

Again, one FS bio is still possible to enter queue now: submit_bio_checks()
is done before set_capacity(0), and submitted after blk_mq_unfreeze_queue()
returns.

Thanks,
Ming