Re: [PATCH] md: use a separate bio_set for synchronous IO.

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wed, Jun 21, 2017 at 09:12:21AM +1000, Neil Brown wrote:
> 
> md devices allocate a bio_set and use it for two
> distinct purposes.
> mddev->bio_set is used to clone bios as part of sending
> upper level requests down to lower level devices,
> and it is also use for synchronous IO such as superblock
> and bitmap updates, and for correcting read errors.
> 
> This multiple usage can lead to deadlocks.  It is likely
> that cloned bios might be queued for write and to be
> waiting for a metadata update before the write can be permitted.
> If the cloning exhausted mddev->bio_set, the metadata update
> may not be able to proceed.
> 
> This scenario has been seen during heavy testing, with lots of IO and
> lots of memory pressure.
> 
> Address this by adding a new bio_set specifically for synchronous IO.
> All synchronous IO goes directly to the underlying device and is not
> queued at the md level, so request using entries from the new
> mddev->sync_set will complete in a timely fashion.
> Requests that use mddev->bio_set will sometimes need to wait
> for synchronous IO, but will no longer risk deadlocking that iO.
> 
> Also: small simplification in mddev_put(): there is no need to
> wait until the spinlock is released before calling bioset_free().

applied, thanks
 
> Signed-off-by: NeilBrown <neilb@xxxxxxxx>
> ---
>  drivers/md/md.c | 27 ++++++++++++++++++++-------
>  drivers/md/md.h |  3 +++
>  2 files changed, 23 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 87edc342ccb3..9d0eb50c458e 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -203,6 +203,14 @@ struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
>  }
>  EXPORT_SYMBOL_GPL(bio_alloc_mddev);
>  
> +static struct bio *md_bio_alloc_sync(struct mddev *mddev)
> +{
> +	if (!mddev->sync_set)
> +		return bio_alloc(GFP_NOIO, 1);
> +
> +	return bio_alloc_bioset(GFP_NOIO, 1, mddev->sync_set);
> +}
> +
>  /*
>   * We have a system wide 'event count' that is incremented
>   * on any 'interesting' event, and readers of /proc/mdstat
> @@ -462,8 +470,6 @@ static void mddev_delayed_delete(struct work_struct *ws);
>  
>  static void mddev_put(struct mddev *mddev)
>  {
> -	struct bio_set *bs = NULL;
> -
>  	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
>  		return;
>  	if (!mddev->raid_disks && list_empty(&mddev->disks) &&
> @@ -471,8 +477,12 @@ static void mddev_put(struct mddev *mddev)
>  		/* Array is not configured at all, and not held active,
>  		 * so destroy it */
>  		list_del_init(&mddev->all_mddevs);
> -		bs = mddev->bio_set;
> +		if (mddev->bio_set)
> +			bioset_free(mddev->bio_set);
> +		if (mddev->sync_set)
> +			bioset_free(mddev->sync_set);
>  		mddev->bio_set = NULL;
> +		mddev->sync_set = NULL;
>  		if (mddev->gendisk) {
>  			/* We did a probe so need to clean up.  Call
>  			 * queue_work inside the spinlock so that
> @@ -485,8 +495,6 @@ static void mddev_put(struct mddev *mddev)
>  			kfree(mddev);
>  	}
>  	spin_unlock(&all_mddevs_lock);
> -	if (bs)
> -		bioset_free(bs);
>  }
>  
>  static void md_safemode_timeout(unsigned long data);
> @@ -751,7 +759,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
>  	if (test_bit(Faulty, &rdev->flags))
>  		return;
>  
> -	bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
> +	bio = md_bio_alloc_sync(mddev);
>  
>  	atomic_inc(&rdev->nr_pending);
>  
> @@ -783,7 +791,7 @@ int md_super_wait(struct mddev *mddev)
>  int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
>  		 struct page *page, int op, int op_flags, bool metadata_op)
>  {
> -	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
> +	struct bio *bio = md_bio_alloc_sync(rdev->mddev);
>  	int ret;
>  
>  	bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
> @@ -5432,6 +5440,11 @@ int md_run(struct mddev *mddev)
>  		if (!mddev->bio_set)
>  			return -ENOMEM;
>  	}
> +	if (mddev->sync_set == NULL) {
> +		mddev->sync_set = bioset_create(BIO_POOL_SIZE, 0);
> +		if (!mddev->sync_set)
> +			return -ENOMEM;
> +	}
>  
>  	spin_lock(&pers_lock);
>  	pers = find_pers(mddev->level, mddev->clevel);
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index 0fa1de42c42b..a1c7aa89bb67 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -444,6 +444,9 @@ struct mddev {
>  	struct attribute_group		*to_remove;
>  
>  	struct bio_set			*bio_set;
> +	struct bio_set			*sync_set; /* for sync operations like
> +						   * metadata and bitmap writes
> +						   */
>  
>  	/* Generic flush handling.
>  	 * The last to finish preflush schedules a worker to submit
> -- 
> 2.12.2
> 


--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [Linux RAID Wiki]     [ATA RAID]     [Linux SCSI Target Infrastructure]     [Linux Block]     [Linux IDE]     [Linux SCSI]     [Linux Hams]     [Device Mapper]     [Device Mapper Cryptographics]     [Kernel]     [Linux Admin]     [Linux Net]     [GFS]     [RPM]     [git]     [Yosemite Forum]


  Powered by Linux