Re: [RFC PATCH V2] MD: fix lock contention for flush bios

Shaohua Li <shli@xxxxxxxxxx> · Sun, 25 Feb 2018 11:11:43 -0800

On Wed, Feb 21, 2018 at 12:56:48PM +0800, Xiao Ni wrote:
> There is a lock contention when there are many processes which send flush bios
> to md device. eg. Create many lvs on one raid device and mkfs.xfs on each lv.
> 
> Now it just can handle flush request sequentially. It needs to wait mddev->flush_bio
> to be NULL, otherwise get mddev->lock.
> 
> This patch remove mddev->flush_bio and handle flush bio asynchronously.
> I did a test with command dbench -s 128 -t 4800. This is the test result:
> 
> =================Without the patch============================
>  Operation      Count    AvgLat    MaxLat
>  ----------------------------------------
>  NTCreateX     129643     4.792 13249.555
>  Close         101064     0.884 12500.318
>  Rename          5498     2.364  8250.734
>  Unlink         21305     6.728  9749.328
>  Qpathinfo     118012     0.041   999.884
>  Qfileinfo      26676     0.004     0.261
>  Qfsinfo        19712     0.009     0.490
>  Sfileinfo      12247     0.014     0.614
>  Find           43408     0.025     0.528
>  WriteX         90174  6622.484 66249.763
>  ReadX         194811     0.022  2748.778
>  LockX            392     0.007     0.041
>  UnlockX          392     0.003     0.019
>  Flush           9970  1694.938 37749.263
> 
> Throughput 1.05396 MB/sec (sync open)  128 clients  128 procs  max_latency=66249.775 ms
> 
> =====================With the patch===========================
>  Operation      Count    AvgLat    MaxLat
>   ----------------------------------------
>   NTCreateX     903706     3.345  4500.722
>   Close         667882     0.128  4500.736
>   Rename         38387     0.566   750.312
>   Unlink        178731    13.415  5249.599
>   Qpathinfo     824087     1.129  4500.394
>   Qfileinfo     145055     0.004     1.010
>   Qfsinfo       147967     0.011     1.565
>   Sfileinfo      74359     0.023     1.439
>   Find          315728     0.096  2748.048
>   WriteX        447580  1303.313  4000.197
>   ReadX        1420065     0.010     3.587
>   LockX           3012     0.006     0.415
>   UnlockX         3012     0.004     0.067
>   Flush          62019   398.955  1251.283
> 
>  Throughput 5.8709 MB/sec (sync open)  128 clients  128 procs  max_latency=5249.611 ms
> 
> V2:
> Neil pointed out two problems. One is counting error problem and another is return value
> when allocat memory fails.
> 1. counting error problem
> This isn't safe.  It is only safe to call rdev_dec_pending() on rdevs
> that you previously called
>                           atomic_inc(&rdev->nr_pending);
> If an rdev was added to the list between the start and end of the flush,
> this will do something bad.
> 
> Now it doesn't use bio_chain. It uses specified call back function for each
> flush bio.
> 2. Returned on IO error when kmalloc fails is wrong.
> I use mempool suggested by Neil in V2
> 3. Fixed some places pointed by Guoqing
> 
>  The test was did on one raid10 with 4 SSDs.
> 
> Suggested-by: Ming Lei <ming.lei@xxxxxxxxxx>
> Signed-off-by: Xiao Ni <xni@xxxxxxxxxx>
> ---
>  drivers/md/md.c | 152 ++++++++++++++++++++++++++++++++++++--------------------
>  drivers/md/md.h |  22 +++++---
>  2 files changed, 112 insertions(+), 62 deletions(-)
> 
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 0a9ed51..60c2c17 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -130,6 +130,24 @@ static inline int speed_max(struct mddev *mddev)
>  		mddev->sync_speed_max : sysctl_speed_limit_max;
>  }
>  
> +static void * flush_info_alloc(gfp_t gfp_flags, void *data)
> +{
> +        return kzalloc(sizeof(struct flush_info), gfp_flags);
> +}
> +static void flush_info_free(void *flush_info, void *data)
> +{
> +        kfree(flush_info);
> +}
> +
> +static void * flush_bio_alloc(gfp_t gfp_flags, void *data)
> +{
> +	return kzalloc(sizeof(struct flush_bio), gfp_flags);
> +}
> +static void flush_bio_free(void *flush_bio, void *data)
> +{
> +	kfree(flush_bio);
> +}
> +

A mempool sounds an overkill here. You could simply statically allocate an
array of current data structure and let each flush bio hash to one of the
array. I bet that will only change a few lines of current code and fix the lock
contention.

>  static struct ctl_table_header *raid_table_header;
>  
>  static struct ctl_table raid_table[] = {
> @@ -412,30 +430,53 @@ static int md_congested(void *data, int bits)
>  /*
>   * Generic flush handling for md
>   */
> +static void submit_flushes(struct work_struct *ws)
> +{
> +	struct flush_info *fi = container_of(ws, struct flush_info, flush_work);
> +	struct mddev *mddev = fi->mddev;
> +	struct bio *bio = fi->bio;
> +
> +	bio->bi_opf &= ~REQ_PREFLUSH;
> +	md_handle_request(mddev, bio);
> +
> +	mempool_free(fi, mddev->flush_pool);
> +}
>  
> -static void md_end_flush(struct bio *bio)
> +static void md_end_flush(struct bio *fbio)
>  {
> -	struct md_rdev *rdev = bio->bi_private;
> -	struct mddev *mddev = rdev->mddev;
> +	struct flush_bio *fb = fbio->bi_private;
> +	struct md_rdev *rdev = fb->rdev;
> +	struct flush_info *fi = fb->fi;
> +	struct bio *bio = fi->bio;
> +	struct mddev *mddev = fi->mddev;
>  
>  	rdev_dec_pending(rdev, mddev);
>  
> -	if (atomic_dec_and_test(&mddev->flush_pending)) {
> -		/* The pre-request flush has finished */
> -		queue_work(md_wq, &mddev->flush_work);
> +	if (atomic_dec_and_test(&fi->flush_pending)) {
> +		if (bio->bi_iter.bi_size == 0)
> +			/* an empty barrier - all done */
> +			bio_endio(bio);
> +		else {
> +			INIT_WORK(&fi->flush_work, submit_flushes);
> +			queue_work(md_wq, &fi->flush_work);
> +		}
>  	}
> -	bio_put(bio);
> -}
>  
> -static void md_submit_flush_data(struct work_struct *ws);
> +	mempool_free(fb, mddev->flush_bio_pool);
> +	bio_put(fbio);
> +}
>  
> -static void submit_flushes(struct work_struct *ws)
> +void md_flush_request(struct mddev *mddev, struct bio *bio)
>  {
> -	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
>  	struct md_rdev *rdev;
> +	struct flush_info *fi;
> +
> +	fi = mempool_alloc(mddev->flush_pool, GFP_NOIO);
> +
> +	fi->bio = bio;
> +	fi->mddev = mddev;
> +	atomic_set(&fi->flush_pending, 0);
>  
> -	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
> -	atomic_set(&mddev->flush_pending, 1);
>  	rcu_read_lock();
>  	rdev_for_each_rcu(rdev, mddev)
>  		if (rdev->raid_disk >= 0 &&
> @@ -445,58 +486,28 @@ static void submit_flushes(struct work_struct *ws)
>  			 * we reclaim rcu_read_lock
>  			 */
>  			struct bio *bi;
> +			struct flush_bio *fb;
>  			atomic_inc(&rdev->nr_pending);
>  			atomic_inc(&rdev->nr_pending);
>  			rcu_read_unlock();
> +
> +			fb = mempool_alloc(mddev->flush_bio_pool, GFP_NOIO);
> +			fb->fi = fi;
> +			fb->rdev = rdev;
> +
>  			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
> -			bi->bi_end_io = md_end_flush;
> -			bi->bi_private = rdev;
>  			bio_set_dev(bi, rdev->bdev);
> +			bi->bi_end_io = md_end_flush;
> +			bi->bi_private = fb;
>  			bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
> -			atomic_inc(&mddev->flush_pending);
> +
> +			atomic_inc(&fi->flush_pending);
>  			submit_bio(bi);

you set flush_pending to 0, which is wrong. after submit_bio, md_end_flush
could be called immediately, before other rdevs have chance to this loop.

Thanks,
Shaohua
> +
>  			rcu_read_lock();
>  			rdev_dec_pending(rdev, mddev);
>  		}
>  	rcu_read_unlock();
> -	if (atomic_dec_and_test(&mddev->flush_pending))
> -		queue_work(md_wq, &mddev->flush_work);
> -}
> -
> -static void md_submit_flush_data(struct work_struct *ws)
> -{
> -	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
> -	struct bio *bio = mddev->flush_bio;
> -
> -	/*
> -	 * must reset flush_bio before calling into md_handle_request to avoid a
> -	 * deadlock, because other bios passed md_handle_request suspend check
> -	 * could wait for this and below md_handle_request could wait for those
> -	 * bios because of suspend check
> -	 */
> -	mddev->flush_bio = NULL;
> -	wake_up(&mddev->sb_wait);
> -
> -	if (bio->bi_iter.bi_size == 0)
> -		/* an empty barrier - all done */
> -		bio_endio(bio);
> -	else {
> -		bio->bi_opf &= ~REQ_PREFLUSH;
> -		md_handle_request(mddev, bio);
> -	}
> -}
> -
> -void md_flush_request(struct mddev *mddev, struct bio *bio)
> -{
> -	spin_lock_irq(&mddev->lock);
> -	wait_event_lock_irq(mddev->sb_wait,
> -			    !mddev->flush_bio,
> -			    mddev->lock);
> -	mddev->flush_bio = bio;
> -	spin_unlock_irq(&mddev->lock);
> -
> -	INIT_WORK(&mddev->flush_work, submit_flushes);
> -	queue_work(md_wq, &mddev->flush_work);
>  }
>  EXPORT_SYMBOL(md_flush_request);
>  
> @@ -555,7 +566,6 @@ void mddev_init(struct mddev *mddev)
>  	atomic_set(&mddev->openers, 0);
>  	atomic_set(&mddev->active_io, 0);
>  	spin_lock_init(&mddev->lock);
> -	atomic_set(&mddev->flush_pending, 0);
>  	init_waitqueue_head(&mddev->sb_wait);
>  	init_waitqueue_head(&mddev->recovery_wait);
>  	mddev->reshape_position = MaxSector;
> @@ -5501,6 +5511,22 @@ int md_run(struct mddev *mddev)
>  			goto abort;
>  		}
>  	}
> +	if (mddev->flush_pool == NULL) {
> +		mddev->flush_pool = mempool_create(NR_FLUSH_INFOS, flush_info_alloc,
> +						flush_info_free, mddev);
> +		if (!mddev->flush_pool) {
> +			err = -ENOMEM;
> +			goto abort;
> +		}
> +	}
> +	if (mddev->flush_bio_pool == NULL) {
> +		mddev->flush_bio_pool = mempool_create(NR_FLUSH_BIOS, flush_bio_alloc,
> +						flush_bio_free, mddev);
> +		if (!mddev->flush_bio_pool) {
> +			err = -ENOMEM;
> +			goto abort;
> +		}
> +	}
>  
>  	spin_lock(&pers_lock);
>  	pers = find_pers(mddev->level, mddev->clevel);
> @@ -5665,6 +5691,14 @@ int md_run(struct mddev *mddev)
>  	return 0;
>  
>  abort:
> +	if (mddev->flush_bio_pool) {
> +		mempool_destroy(mddev->flush_bio_pool);
> +		mddev->flush_bio_pool = NULL;
> +	}
> +	if (mddev->flush_pool){
> +		mempool_destroy(mddev->flush_pool);
> +		mddev->flush_pool = NULL;
> +	}
>  	if (mddev->bio_set) {
>  		bioset_free(mddev->bio_set);
>  		mddev->bio_set = NULL;
> @@ -5867,6 +5901,14 @@ void md_stop(struct mddev *mddev)
>  	 * This is called from dm-raid
>  	 */
>  	__md_stop(mddev);
> +	if (mddev->flush_bio_pool) {
> +		mempool_destroy(mddev->flush_bio_pool);
> +		mddev->flush_bio_pool = NULL;
> +	}
> +	if (mddev->flush_pool) {
> +		mempool_destroy(mddev->flush_pool);
> +		mddev->flush_pool = NULL;
> +	}
>  	if (mddev->bio_set) {
>  		bioset_free(mddev->bio_set);
>  		mddev->bio_set = NULL;
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index 7d6bcf0..1e40824 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -252,6 +252,19 @@ enum mddev_sb_flags {
>  	MD_SB_NEED_REWRITE,	/* metadata write needs to be repeated */
>  };
>  
> +#define NR_FLUSH_INFOS 8
> +#define NR_FLUSH_BIOS 64
> +struct flush_info {
> +	struct bio			*bio;
> +	struct mddev			*mddev;
> +	struct work_struct		flush_work;
> +	atomic_t			flush_pending;
> +};
> +struct flush_bio {
> +	struct flush_info *fi;
> +	struct md_rdev *rdev;
> +};
> +
>  struct mddev {
>  	void				*private;
>  	struct md_personality		*pers;
> @@ -457,13 +470,8 @@ struct mddev {
>  						   * metadata and bitmap writes
>  						   */
>  
> -	/* Generic flush handling.
> -	 * The last to finish preflush schedules a worker to submit
> -	 * the rest of the request (without the REQ_PREFLUSH flag).
> -	 */
> -	struct bio *flush_bio;
> -	atomic_t flush_pending;
> -	struct work_struct flush_work;
> +	mempool_t			*flush_pool;
> +	mempool_t			*flush_bio_pool;
>  	struct work_struct event_work;	/* used by dm to report failure event */
>  	void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
>  	struct md_cluster_info		*cluster_info;
> -- 
> 2.7.4
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html