----- Original Message -----
> From: "Shaohua Li" <shli@xxxxxxxxxx>
> To: "Xiao Ni" <xni@xxxxxxxxxx>
> Cc: linux-raid@xxxxxxxxxxxxxxx, "ming lei" <ming.lei@xxxxxxxxxx>, ncroxon@xxxxxxxxxx, neilb@xxxxxxxx
> Sent: Monday, April 9, 2018 11:52:20 PM
> Subject: Re: [PATCH V4 1/1] MD: fix lock contention for flush bios
>
> On Fri, Apr 06, 2018 at 02:56:11PM +0800, Xiao Ni wrote:
> > There is lock contention when many processes send flush bios to an md
> > device, e.g. when creating many LVs on one raid device and running
> > mkfs.xfs on each LV.
> >
> > Currently flush requests are handled sequentially: a new flush has to
> > take mddev->lock and wait for mddev->flush_bio to become NULL.
> >
> > This patch removes mddev->flush_bio and handles flush bios
> > asynchronously. I did a test with the command dbench -s 128 -t 300.
> > This is the test result:
> >
> > =================Without the patch============================
> > Operation                Count    AvgLat    MaxLat
> > --------------------------------------------------
> > Flush                   305893     0.648    49.970
> > Close                  3204015     0.013    36.673
> > LockX                    14210     0.003     0.235
> > Rename                  184914     0.038    25.821
> > ReadX                  6844043     0.004     3.561
> > WriteX                 2155550    17.461    84.646
> > Unlink                  882276     0.152    37.705
> > UnlockX                  14210     0.002     0.239
> > FIND_FIRST             1528913     0.015    21.999
> > SET_FILE_INFORMATION    355733     0.007     1.118
> > QUERY_FILE_INFORMATION  689672     0.002     0.376
> > QUERY_PATH_INFORMATION 3956994     0.006    49.292
> > QUERY_FS_INFORMATION    725497     0.004     0.533
> > NTCreateX              4363847     0.057    46.578
> >
> > Throughput 453.259 MB/sec (sync open) 128 clients 128 procs
> > max_latency=84.650 ms
> >
> > =====================With the patch===========================
> > Operation                Count    AvgLat    MaxLat
> > --------------------------------------------------
> > Flush                   390444     0.026     3.360
> > Close                  4089104     0.008    28.907
> > LockX                    18090     0.003     0.146
> > Rename                  235907     0.042    19.649
> > ReadX                  8728080     0.004     1.926
> > WriteX                 2750498    13.734    65.028
> > Unlink                 1126388     0.144    33.684
> > UnlockX                  18090     0.002     0.078
> > FIND_FIRST             1951009     0.015    24.111
> > SET_FILE_INFORMATION    454013     0.008     1.866
> > QUERY_FILE_INFORMATION  879798     0.002     0.746
> > QUERY_PATH_INFORMATION 5050540     0.006    41.920
> > QUERY_FS_INFORMATION    925775     0.004     0.322
> > NTCreateX              5569810     0.043    29.966
> >
> > Throughput 578.195 MB/sec (sync open) 128 clients 128 procs
> > max_latency=65.033 ms
> >
> > V4: use jhash to choose a free flush_info
> >
> > V3:
> > Shaohua suggested that a mempool is overkill. V3 allocates the memory
> > when the raid device is created and uses a simple bitmap to record
> > which resources are free.
> >
> > Fix a bug from v2: it should set flush_pending to 1 at first.
> >
> > V2:
> > Neil pointed out two problems. One is a counting error and the other
> > is the return value when allocating memory fails.
> > 1. counting error problem
> > This isn't safe. It is only safe to call rdev_dec_pending() on rdevs
> > for which you previously called
> >     atomic_inc(&rdev->nr_pending);
> > If an rdev was added to the list between the start and end of the
> > flush, this will do something bad.
> >
> > Now it doesn't use bio_chain. It uses a dedicated callback function
> > for each flush bio.
> > 2. Returning an IO error when kmalloc fails is wrong.
> > I use the mempool suggested by Neil in V2.
> > 3. Fixed some places pointed out by Guoqing.
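To make Neil's point 1 concrete: an end_io handler may only drop a reference
it took itself. In this version the pairing works roughly like this (a
condensed sketch of the code in the patch below, not new code):

	/* submit side: pin the rdev and remember which bio belongs
	 * to which slot (bi->bi_private points at the flush_info) */
	atomic_inc(&rdev->nr_pending);
	fi->bios[rdev->raid_disk] = bi;
	submit_bio(bi);

	/* end_io side: only decrement the rdev whose slot holds this
	 * exact bio, so a device hot-added mid-flush is never
	 * decremented without a matching increment */
	rdev_for_each_rcu(rdev, mddev)
		if (fi->bios[rdev->raid_disk] == bi) {
			fi->bios[rdev->raid_disk] = NULL;
			rdev_dec_pending(rdev, mddev);
			break;
		}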
> >
> > The test is done on one raid1 with two SSDs.
> >
> > Suggested-by: Ming Lei <ming.lei@xxxxxxxxxx>
> > Signed-off-by: Xiao Ni <xni@xxxxxxxxxx>
> > ---
> >  drivers/md/md.c | 189 +++++++++++++++++++++++++++++++++++++++-----------------
> >  drivers/md/md.h |  23 ++---
> >  2 files changed, 150 insertions(+), 62 deletions(-)
> >
> > diff --git a/drivers/md/md.c b/drivers/md/md.c
> > index 254e44e..96722fc 100644
> > --- a/drivers/md/md.c
> > +++ b/drivers/md/md.c
> > @@ -367,6 +367,7 @@ void mddev_suspend(struct mddev *mddev)
> >  	set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags);
> >  	smp_mb__after_atomic();
> >  	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
> > +	wait_event(mddev->sb_wait, atomic_read(&mddev->flush_io) == 0);
> >  	mddev->pers->quiesce(mddev, 1);
> >  	clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags);
> >  	wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));
> > @@ -412,30 +413,78 @@ static int md_congested(void *data, int bits)
> >  /*
> >   * Generic flush handling for md
> >   */
> > +static void submit_flushes(struct work_struct *ws)
> > +{
> > +	struct flush_info *fi = container_of(ws, struct flush_info, flush_work);
> > +	struct mddev *mddev = fi->mddev;
> > +	struct bio *fbio = fi->fbio;
> > +
> > +	fi->fbio = NULL;
> > +	atomic_dec(&mddev->flush_io);
> > +	wake_up(&fi->flush_queue);
> > +	wake_up(&mddev->sb_wait);
> >
> > -static void md_end_flush(struct bio *bio)
> > +	fbio->bi_opf &= ~REQ_PREFLUSH;
> > +	md_handle_request(mddev, fbio);
> > +}
> > +
> > +static void rdev_end_flush(struct bio *bi)
> >  {
> > -	struct md_rdev *rdev = bio->bi_private;
> > -	struct mddev *mddev = rdev->mddev;
> > +	struct flush_info *fi = bi->bi_private;
> > +	struct mddev *mddev = fi->mddev;
> > +	struct bio *fbio = fi->fbio;
> > +	struct md_rdev *rdev;
> >
> > -	rdev_dec_pending(rdev, mddev);
> > +	rcu_read_lock();
> > +	rdev_for_each_rcu(rdev, mddev)
> > +		if (fi->bios[rdev->raid_disk] == bi) {
> > +			fi->bios[rdev->raid_disk] = NULL;
> > +			rdev_dec_pending(rdev, mddev);
> > +			break;
> > +		}
> > +	rcu_read_unlock();
> >
> > -	if (atomic_dec_and_test(&mddev->flush_pending)) {
> > -		/* The pre-request flush has finished */
> > -		queue_work(md_wq, &mddev->flush_work);
> > +	if (atomic_dec_and_test(&fi->flush_pending)) {
> > +		if (fbio->bi_iter.bi_size == 0) {
> > +			/* an empty barrier - all done */
> > +			bio_endio(fbio);
> > +			fi->fbio = NULL;
> > +			atomic_dec(&mddev->flush_io);
> > +			wake_up(&fi->flush_queue);
> > +			wake_up(&mddev->sb_wait);
> > +		} else {
> > +			INIT_WORK(&fi->flush_work, submit_flushes);
> > +			queue_work(md_wq, &fi->flush_work);
> > +		}
> >  	}
> > -	bio_put(bio);
> > -}
> >
> > -static void md_submit_flush_data(struct work_struct *ws);
> > +	bio_put(bi);
> > +}
> >
> > -static void submit_flushes(struct work_struct *ws)
> > +void md_flush_request(struct mddev *mddev, struct bio *fbio)
> >  {
> > -	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
> >  	struct md_rdev *rdev;
> > +	struct flush_info *fi;
> > +	char *p = (char*)mddev->flush_info;
> > +	int index;
> > +
> > +	atomic_inc(&mddev->flush_io);
> > +
> > +	index = jhash((void*)&fbio->bi_iter.bi_sector,
> > +		sizeof(sector_t), 0) % NR_FLUSHS;
>
> If this is a pure flush request, the bi_sector is meaningless. So we could
> probably just use fbio to hash.

Ah yes! It's a good idea. I'll re-send the patch soon.
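Something like this, hashing the bio pointer value itself (an untested
sketch):

	/* a pure flush carries no meaningful bi_sector, but the bio
	 * address is unique while the request is in flight, so it
	 * spreads requests across the slots just as well */
	index = jhash(&fbio, sizeof(fbio), 0) % NR_FLUSHS;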
Regards
Xiao

> > +	fi = (struct flush_info *)(p + index * (sizeof(struct flush_info)
> > +		+ mddev->raid_disks * sizeof(struct bio*)));
> > +
> > +	spin_lock_irq(&fi->flush_lock);
> > +	wait_event_lock_irq(fi->flush_queue,
> > +			    !fi->fbio,
> > +			    fi->flush_lock);
> > +	fi->fbio = fbio;
> > +	spin_unlock_irq(&fi->flush_lock);
> > +
> > +	fi->mddev = mddev;
> > +	atomic_set(&fi->flush_pending, 1);
> >
> > -	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
> > -	atomic_set(&mddev->flush_pending, 1);
> >  	rcu_read_lock();
> >  	rdev_for_each_rcu(rdev, mddev)
> >  		if (rdev->raid_disk >= 0 &&
> > @@ -448,56 +497,36 @@ static void submit_flushes(struct work_struct *ws)
> >  			atomic_inc(&rdev->nr_pending);
> >  			atomic_inc(&rdev->nr_pending);
> >  			rcu_read_unlock();
> > +
> >  			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
> > -			bi->bi_end_io = md_end_flush;
> > -			bi->bi_private = rdev;
> > +			bi->bi_end_io = rdev_end_flush;
> > +			bi->bi_private = fi;
> >  			bio_set_dev(bi, rdev->bdev);
> >  			bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
> > -			atomic_inc(&mddev->flush_pending);
> > +
> > +			fi->bios[rdev->raid_disk] = bi;
> > +			atomic_inc(&fi->flush_pending);
> >  			submit_bio(bi);
> > +
> >  			rcu_read_lock();
> >  			rdev_dec_pending(rdev, mddev);
> >  		}
> >  	rcu_read_unlock();
> > -	if (atomic_dec_and_test(&mddev->flush_pending))
> > -		queue_work(md_wq, &mddev->flush_work);
> > -}
> >
> > -static void md_submit_flush_data(struct work_struct *ws)
> > -{
> > -	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
> > -	struct bio *bio = mddev->flush_bio;
> > -
> > -	/*
> > -	 * must reset flush_bio before calling into md_handle_request to avoid a
> > -	 * deadlock, because other bios passed md_handle_request suspend check
> > -	 * could wait for this and below md_handle_request could wait for those
> > -	 * bios because of suspend check
> > -	 */
> > -	mddev->flush_bio = NULL;
> > -	wake_up(&mddev->sb_wait);
> > -
> > -	if (bio->bi_iter.bi_size == 0)
> > -		/* an empty barrier - all done */
> > -		bio_endio(bio);
> > -	else {
> > -		bio->bi_opf &= ~REQ_PREFLUSH;
> > -		md_handle_request(mddev, bio);
> > +	if (atomic_dec_and_test(&fi->flush_pending)) {
> > +		if (fbio->bi_iter.bi_size == 0) {
> > +			/* an empty barrier - all done */
> > +			bio_endio(fbio);
> > +			fi->fbio = NULL;
> > +			atomic_dec(&mddev->flush_io);
> > +			wake_up(&fi->flush_queue);
> > +			wake_up(&mddev->sb_wait);
> > +		} else {
> > +			INIT_WORK(&fi->flush_work, submit_flushes);
> > +			queue_work(md_wq, &fi->flush_work);
> > +		}
> >  	}
> >  }
> > -
> > -void md_flush_request(struct mddev *mddev, struct bio *bio)
> > -{
> > -	spin_lock_irq(&mddev->lock);
> > -	wait_event_lock_irq(mddev->sb_wait,
> > -			    !mddev->flush_bio,
> > -			    mddev->lock);
> > -	mddev->flush_bio = bio;
> > -	spin_unlock_irq(&mddev->lock);
> > -
> > -	INIT_WORK(&mddev->flush_work, submit_flushes);
> > -	queue_work(md_wq, &mddev->flush_work);
> > -}
> >  EXPORT_SYMBOL(md_flush_request);
> >
> >  static inline struct mddev *mddev_get(struct mddev *mddev)
> > @@ -555,7 +584,6 @@ void mddev_init(struct mddev *mddev)
> >  	atomic_set(&mddev->openers, 0);
> >  	atomic_set(&mddev->active_io, 0);
> >  	spin_lock_init(&mddev->lock);
> > -	atomic_set(&mddev->flush_pending, 0);
> >  	init_waitqueue_head(&mddev->sb_wait);
> >  	init_waitqueue_head(&mddev->recovery_wait);
> >  	mddev->reshape_position = MaxSector;
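Side note: the slot-lookup arithmetic above (base pointer plus index times
the per-slot size) recurs in md_run() and update_raid_disks() below. A small
helper would keep the stride in one place; a hypothetical sketch, the name
nth_flush_info is invented and not part of the patch:

	static inline struct flush_info *nth_flush_info(struct flush_info *base,
							int raid_disks, int index)
	{
		/* each slot is one flush_info plus raid_disks trailing
		 * bio pointers backing the bios[] flexible array, so
		 * the stride is larger than sizeof(struct flush_info) */
		size_t stride = sizeof(struct flush_info) +
				raid_disks * sizeof(struct bio *);

		return (struct flush_info *)((char *)base + index * stride);
	}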
> > @@ -5509,6 +5537,27 @@ int md_run(struct mddev *mddev)
> >  			goto abort;
> >  		}
> >  	}
> > +	if (mddev->flush_info == NULL) {
> > +		int index = 0;
> > +		char *p;
> > +		struct flush_info *fi;
> > +		mddev->flush_info = kzalloc((sizeof(struct flush_info) +
> > +			sizeof(struct bio*) * mddev->raid_disks) *
> > +			NR_FLUSHS, GFP_KERNEL);
> > +		if (!mddev->flush_info) {
> > +			err = -ENOMEM;
> > +			goto abort;
> > +		}
> > +
> > +		p = (char*)mddev->flush_info;
> > +		while (index < NR_FLUSHS) {
> > +			fi = (struct flush_info *)(p + index * (sizeof(struct flush_info)
> > +				+ mddev->raid_disks * sizeof(struct bio*)));
> > +			spin_lock_init(&fi->flush_lock);
> > +			init_waitqueue_head(&fi->flush_queue);
> > +			index++;
> > +		}
> > +	}
> >
> >  	spin_lock(&pers_lock);
> >  	pers = find_pers(mddev->level, mddev->clevel);
> > @@ -5668,6 +5717,9 @@ int md_run(struct mddev *mddev)
> >  	return 0;
> >
> >  abort:
> > +	if (mddev->flush_info) {
> > +		kfree(mddev->flush_info);
> > +	}
> >  	if (mddev->bio_set) {
> >  		bioset_free(mddev->bio_set);
> >  		mddev->bio_set = NULL;
> > @@ -5888,6 +5940,10 @@ void md_stop(struct mddev *mddev)
> >  	 * This is called from dm-raid
> >  	 */
> >  	__md_stop(mddev);
> > +	if (mddev->flush_info) {
> > +		kfree(mddev->flush_info);
> > +		mddev->flush_info = NULL;
> > +	}
> >  	if (mddev->bio_set) {
> >  		bioset_free(mddev->bio_set);
> >  		mddev->bio_set = NULL;
> > @@ -6854,8 +6910,10 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
> >
> >  static int update_raid_disks(struct mddev *mddev, int raid_disks)
> >  {
> > -	int rv;
> > +	int rv, index;
> >  	struct md_rdev *rdev;
> > +	struct flush_info *new, *fi;
> > +	char *p;
> >  	/* change the number of raid disks */
> >  	if (mddev->pers->check_reshape == NULL)
> >  		return -EINVAL;
> > @@ -6884,10 +6942,31 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks)
> >  	else if (mddev->delta_disks > 0)
> >  		mddev->reshape_backwards = 0;
> >
> > +	new = kzalloc((sizeof(struct flush_info) + sizeof(struct bio*) *
> > +		raid_disks) * NR_FLUSHS, GFP_KERNEL);
> > +	if (!new)
> > +		return -ENOMEM;
> > +
> > +	p = (char*)new;
> > +	index = 0;
> > +	while (index < NR_FLUSHS) {
> > +		fi = (struct flush_info *)(p + index * (sizeof(struct flush_info)
> > +			+ raid_disks * sizeof(struct bio*)));
> > +		spin_lock_init(&fi->flush_lock);
> > +		init_waitqueue_head(&fi->flush_queue);
> > +		index++;
> > +	}
> > +
> >  	rv = mddev->pers->check_reshape(mddev);
> >  	if (rv < 0) {
> >  		mddev->delta_disks = 0;
> >  		mddev->reshape_backwards = 0;
> > +		kfree(new);
> > +	} else {
> > +		mddev_suspend(mddev);
> > +		kfree(mddev->flush_info);
> > +		mddev->flush_info = new;
> > +		mddev_resume(mddev);
> >  	}
> >  	return rv;
> >  }
> > diff --git a/drivers/md/md.h b/drivers/md/md.h
> > index fbc925c..24f22f5 100644
> > --- a/drivers/md/md.h
> > +++ b/drivers/md/md.h
> > @@ -252,6 +252,17 @@ enum mddev_sb_flags {
> >  	MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */
> >  };
> >
> > +#define NR_FLUSHS 16
> > +struct flush_info {
> > +	struct mddev		*mddev;
> > +	struct work_struct	flush_work;
> > +	atomic_t		flush_pending;
> > +	spinlock_t		flush_lock;
> > +	wait_queue_head_t	flush_queue;
> > +	struct bio		*fbio;
> > +	struct bio		*bios[0];
> > +};
> > +
> >  struct mddev {
> >  	void *private;
> >  	struct md_personality *pers;
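For reference, this is how the fields of one slot cooperate, condensed from
md_flush_request() and rdev_end_flush() above (a sketch, not new code):

	/* claim: flush_lock + flush_queue serialize slot ownership,
	 * and a non-NULL fbio marks the slot busy */
	spin_lock_irq(&fi->flush_lock);
	wait_event_lock_irq(fi->flush_queue, !fi->fbio, fi->flush_lock);
	fi->fbio = fbio;
	spin_unlock_irq(&fi->flush_lock);

	/* release: once flush_pending drops to zero, clear fbio and
	 * wake the next flush waiting for this slot */
	fi->fbio = NULL;
	wake_up(&fi->flush_queue);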
> > @@ -399,7 +410,6 @@ struct mddev {
> >  	struct work_struct del_work;	/* used for delayed sysfs removal */
> >
> >  	/* "lock" protects:
> > -	 *   flush_bio transition from NULL to !NULL
> >  	 *   rdev superblocks, events
> >  	 *   clearing MD_CHANGE_*
> >  	 *   in_sync - and related safemode and MD_CHANGE changes
> > @@ -457,13 +467,12 @@ struct mddev {
> >  						 * metadata and bitmap writes
> >  						 */
> >
> > -	/* Generic flush handling.
> > -	 * The last to finish preflush schedules a worker to submit
> > -	 * the rest of the request (without the REQ_PREFLUSH flag).
> > +	/*
> > +	 * Generic flush handling.
> >  	 */
> > -	struct bio *flush_bio;
> > -	atomic_t flush_pending;
> > -	struct work_struct flush_work;
> > +	struct flush_info *flush_info;
> > +	atomic_t flush_io;
> > +
> >  	struct work_struct event_work;	/* used by dm to report failure event */
> >  	void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
> >  	struct md_cluster_info *cluster_info;
> > --
> > 2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html