On Wed, Mar 21, 2018 at 02:47:22PM +0800, Xiao Ni wrote:
> There is lock contention when many processes send flush bios to an md
> device, e.g. when creating many LVs on one raid device and running
> mkfs.xfs on each LV.
>
> Currently flush requests can only be handled sequentially: each one has
> to wait, under mddev->lock, for mddev->flush_bio to become NULL.
>
> This patch removes mddev->flush_bio and handles flush bios
> asynchronously. I tested with the command dbench -s 128 -t 4800. These
> are the results:
>
> =================Without the patch============================
> Operation                Count    AvgLat    MaxLat
> --------------------------------------------------
> Flush                     5239   142.590  3972.034
> Close                    53114     0.176   498.236
> LockX                      208     0.066     0.907
> Rename                    2793     0.335     7.203
> ReadX                    98100     0.020     2.280
> WriteX                   67800   555.649  8238.498
> Unlink                    7985     1.742   446.503
> UnlockX                    208     0.058     1.013
> FIND_FIRST               21035     0.141     3.147
> SET_FILE_INFORMATION      6419     0.090     1.539
> QUERY_FILE_INFORMATION   18244     0.007     0.130
> QUERY_PATH_INFORMATION   55622     0.060     3.884
> QUERY_FS_INFORMATION      9451     0.040     1.148
> NTCreateX                63960     0.717   536.542
>
> Throughput 12.1782 MB/sec (sync open) 128 clients 128 procs
> max_latency=8238.513 ms
>
> =====================With the patch===========================
> Operation                Count    AvgLat    MaxLat
> --------------------------------------------------
> Flush                    34858    36.484   668.243
> Close                   379883     0.107   252.232
> LockX                     1792     0.048     1.070
> Rename                   21761     0.804   266.659
> ReadX                   817947     0.021    42.891
> WriteX                  254804   142.485   948.090
> Unlink                   99665     3.590   899.816
> UnlockX                   1792     0.056     1.240
> FIND_FIRST              178857     0.187    23.287
> SET_FILE_INFORMATION     41612     0.135    26.575
> QUERY_FILE_INFORMATION   83691     0.007     2.589
> QUERY_PATH_INFORMATION  470889     0.077    83.846
> QUERY_FS_INFORMATION     82764     0.056    10.368
> NTCreateX               512262     0.616   809.980
>
> Throughput 53.6545 MB/sec (sync open) 128 clients 128 procs
> max_latency=948.105 ms
>
> V3:
> Shaohua suggested that a mempool is overkill. In v3 the memory is
> allocated when the raid device is created, and a simple bitmap records
> which resources are free.

Sorry for the delay. The bitmap method is still too complicated. Can we
do something like this instead:

in mddev:
	struct flush_info flush_infos[8 or 16];

Maybe we don't even need a special struct; the info can include a lock.
Every time we handle a flush, we select a flush_info by

	flush_infos[jhash(flush_request_bio_address)];

then take its lock and do whatever the current code does against that
specific flush_info. Isn't this a simpler implementation? I think 8 or
16 locks should reduce most of the lock contention.
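Something along these lines (completely untested, just to sketch the
idea — the names below are made up, and hash_ptr() is one convenient way
to do the jhash step on a pointer):

	#define NR_FLUSH_INFOS 16

	struct flush_info {
		spinlock_t		lock;
		struct bio		*flush_bio;	/* same role as mddev->flush_bio today */
		struct work_struct	flush_work;
	};

	/* in struct mddev: struct flush_info flush_infos[NR_FLUSH_INFOS]; */

	void md_flush_request(struct mddev *mddev, struct bio *bio)
	{
		/* hash the bio address so concurrent flushes spread
		 * across the slots */
		struct flush_info *fi =
			&mddev->flush_infos[hash_ptr(bio, ilog2(NR_FLUSH_INFOS))];

		spin_lock_irq(&fi->lock);
		/* ... then exactly what the current code does against
		 * mddev->flush_bio and mddev->lock, but against
		 * fi->flush_bio and fi->lock ... */
		spin_unlock_irq(&fi->lock);
	}

Each flush then only contends with the flushes that hash to the same
slot, instead of with every other flush on the mddev.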
Thanks,
Shaohua

> Fix a bug from v2: flush_pending has to be set to 1 first.
>
> V2:
> Neil pointed out two problems: a reference-counting error, and the
> return value when memory allocation fails.
> 1. Counting error:
>    This isn't safe. It is only safe to call rdev_dec_pending() on
>    rdevs on which you previously called
>        atomic_inc(&rdev->nr_pending);
>    If an rdev was added to the list between the start and end of the
>    flush, this will do something bad.
>
>    It no longer uses bio_chain; it uses a dedicated callback function
>    for each flush bio.
> 2. Returning an IO error when kmalloc fails is wrong.
>    In v2 I used a mempool, as suggested by Neil.
> 3. Fixed some places pointed out by Guoqing.
>
> The test is done on one raid1 with 2 SSDs.
>
> Suggested-by: Ming Lei <ming.lei@xxxxxxxxxx>
> Signed-off-by: Xiao Ni <xni@xxxxxxxxxx>
> ---
>  drivers/md/md.c | 184 +++++++++++++++++++++++++++++++++++++++-----------------
>  drivers/md/md.h |  23 ++++---
>  2 files changed, 145 insertions(+), 62 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 254e44e..dd9038b 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -412,30 +412,84 @@ static int md_congested(void *data, int bits)
>  /*
>   * Generic flush handling for md
>   */
> +static void submit_flushes(struct work_struct *ws)
> +{
> +	struct flush_info *fi = container_of(ws, struct flush_info, flush_work);
> +	struct mddev *mddev = fi->mddev;
> +	struct bio *fbio = fi->fbio;
> +
> +	clear_bit(fi->flush_id, mddev->flush_ids);
> +	wake_up(&mddev->flush_queue);
> +
> +	fbio->bi_opf &= ~REQ_PREFLUSH;
> +	md_handle_request(mddev, fbio);
> +}
>
> -static void md_end_flush(struct bio *bio)
> +static void rdev_end_flush(struct bio *bi)
>  {
> -	struct md_rdev *rdev = bio->bi_private;
> -	struct mddev *mddev = rdev->mddev;
> +	struct flush_info *fi = bi->bi_private;
> +	struct mddev *mddev = fi->mddev;
> +	struct bio *fbio = fi->fbio;
> +	struct md_rdev *rdev;
>
> -	rdev_dec_pending(rdev, mddev);
> +	rcu_read_lock();
> +	rdev_for_each_rcu(rdev, mddev)
> +		if (fi->bios[rdev->raid_disk] == bi) {
> +			fi->bios[rdev->raid_disk] = NULL;
> +			rdev_dec_pending(rdev, mddev);
> +			break;
> +		}
> +	rcu_read_unlock();
>
> -	if (atomic_dec_and_test(&mddev->flush_pending)) {
> -		/* The pre-request flush has finished */
> -		queue_work(md_wq, &mddev->flush_work);
> +	if (atomic_dec_and_test(&fi->flush_pending)) {
> +		if (fbio->bi_iter.bi_size == 0) {
> +			/* an empty barrier - all done */
> +			bio_endio(fbio);
> +			clear_bit(fi->flush_id, mddev->flush_ids);
> +			wake_up(&mddev->flush_queue);
> +		} else {
> +			INIT_WORK(&fi->flush_work, submit_flushes);
> +			queue_work(md_wq, &fi->flush_work);
> +		}
>  	}
> -	bio_put(bio);
> -}
>
> -static void md_submit_flush_data(struct work_struct *ws);
> +	bio_put(bi);
> +}
>
> -static void submit_flushes(struct work_struct *ws)
> +void md_flush_request(struct mddev *mddev, struct bio *bio)
>  {
> -	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
>  	struct md_rdev *rdev;
> +	struct flush_info *fi;
> +	unsigned long fi_bit;
> +	char *p = (char*)mddev->flush_info;
> +
> +	spin_lock_irq(&mddev->flush_lock);
> +	fi_bit = find_first_zero_bit(mddev->flush_ids, NR_FLUSHS);
> +	if (fi_bit == NR_FLUSHS) {
> +		DEFINE_WAIT(w);
> +		for (;;) {
> +			prepare_to_wait(&mddev->flush_queue,
> +					&w, TASK_IDLE);
> +			fi_bit = find_first_zero_bit(mddev->flush_ids, NR_FLUSHS);
> +			if (fi_bit != NR_FLUSHS)
> +				break;
> +			spin_unlock_irq(&mddev->flush_lock);
> +			schedule();
> +			spin_lock_irq(&mddev->flush_lock);
> +		}
> +		finish_wait(&mddev->flush_queue, &w);
> +	}
> +
> +	set_bit(fi_bit, mddev->flush_ids);
> +	fi = (struct flush_info *)(p + fi_bit * (sizeof(struct flush_info)
> +			+ mddev->raid_disks * sizeof(struct bio*)));
> +	fi->flush_id = fi_bit;
> +	spin_unlock_irq(&mddev->flush_lock);
> +
> +	fi->mddev = mddev;
> +	fi->fbio = bio;
> +	atomic_set(&fi->flush_pending, 1);
>
> -	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
> -	atomic_set(&mddev->flush_pending, 1);
>  	rcu_read_lock();
>  	rdev_for_each_rcu(rdev, mddev)
>  		if (rdev->raid_disk >= 0 &&
> @@ -448,56 +502,34 @@ static void submit_flushes(struct work_struct *ws)
>  			atomic_inc(&rdev->nr_pending);
>  			atomic_inc(&rdev->nr_pending);
>  			rcu_read_unlock();
> +
>  			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
> -			bi->bi_end_io = md_end_flush;
> -			bi->bi_private = rdev;
>  			bio_set_dev(bi, rdev->bdev);
> +			bi->bi_end_io = rdev_end_flush;
> +			bi->bi_private = fi;
>  			bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
> -			atomic_inc(&mddev->flush_pending);
> +
> +			fi->bios[rdev->raid_disk] = bi;
> +			atomic_inc(&fi->flush_pending);
>  			submit_bio(bi);
> +
>  			rcu_read_lock();
>  			rdev_dec_pending(rdev, mddev);
>  		}
>  	rcu_read_unlock();
> -	if (atomic_dec_and_test(&mddev->flush_pending))
> -		queue_work(md_wq, &mddev->flush_work);
> -}
>
> -static void md_submit_flush_data(struct work_struct *ws)
> -{
> -	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
> -	struct bio *bio = mddev->flush_bio;
> -
> -	/*
> -	 * must reset flush_bio before calling into md_handle_request to avoid a
> -	 * deadlock, because other bios passed md_handle_request suspend check
> -	 * could wait for this and below md_handle_request could wait for those
> -	 * bios because of suspend check
> -	 */
> -	mddev->flush_bio = NULL;
> -	wake_up(&mddev->sb_wait);
> -
> -	if (bio->bi_iter.bi_size == 0)
> -		/* an empty barrier - all done */
> -		bio_endio(bio);
> -	else {
> -		bio->bi_opf &= ~REQ_PREFLUSH;
> -		md_handle_request(mddev, bio);
> +	if (atomic_dec_and_test(&fi->flush_pending)) {
> +		if (bio->bi_iter.bi_size == 0) {
> +			/* an empty barrier - all done */
> +			bio_endio(bio);
> +			clear_bit(fi->flush_id, mddev->flush_ids);
> +			wake_up(&mddev->flush_queue);
> +		} else {
> +			INIT_WORK(&fi->flush_work, submit_flushes);
> +			queue_work(md_wq, &fi->flush_work);
> +		}
>  	}
>  }
> -
> -void md_flush_request(struct mddev *mddev, struct bio *bio)
> -{
> -	spin_lock_irq(&mddev->lock);
> -	wait_event_lock_irq(mddev->sb_wait,
> -			    !mddev->flush_bio,
> -			    mddev->lock);
> -	mddev->flush_bio = bio;
> -	spin_unlock_irq(&mddev->lock);
> -
> -	INIT_WORK(&mddev->flush_work, submit_flushes);
> -	queue_work(md_wq, &mddev->flush_work);
> -}
>  EXPORT_SYMBOL(md_flush_request);
>
>  static inline struct mddev *mddev_get(struct mddev *mddev)
> @@ -555,7 +587,7 @@ void mddev_init(struct mddev *mddev)
>  	atomic_set(&mddev->openers, 0);
>  	atomic_set(&mddev->active_io, 0);
>  	spin_lock_init(&mddev->lock);
> -	atomic_set(&mddev->flush_pending, 0);
> +	spin_lock_init(&mddev->flush_lock);
>  	init_waitqueue_head(&mddev->sb_wait);
>  	init_waitqueue_head(&mddev->recovery_wait);
>  	mddev->reshape_position = MaxSector;
> @@ -5509,6 +5541,24 @@ int md_run(struct mddev *mddev)
>  			goto abort;
>  		}
>  	}
> +	if (mddev->flush_info == NULL) {
> +		mddev->flush_info = kzalloc((sizeof(struct flush_info) +
> +				sizeof(struct bio*) * mddev->raid_disks) *
> +				NR_FLUSHS, GFP_KERNEL);
> +		if (!mddev->flush_info) {
> +			err = -ENOMEM;
> +			goto abort;
> +		}
> +	}
> +	if (mddev->flush_ids == NULL) {
> +		mddev->flush_ids = kzalloc(BITS_TO_LONGS(NR_FLUSHS) * sizeof(unsigned long), GFP_KERNEL);
> +		if (!mddev->flush_ids) {
> +			err = -ENOMEM;
> +			goto abort;
> +		}
> +	}
> +
> +	init_waitqueue_head(&mddev->flush_queue);
>
>  	spin_lock(&pers_lock);
>  	pers = find_pers(mddev->level, mddev->clevel);
> @@ -5668,6 +5718,12 @@ int md_run(struct mddev *mddev)
>  	return 0;
>
>  abort:
> +	if (mddev->flush_ids) {
> +		kfree(mddev->flush_ids);
> +	}
> +	if (mddev->flush_info) {
> +		kfree(mddev->flush_info);
> +	}
>  	if (mddev->bio_set) {
>  		bioset_free(mddev->bio_set);
>  		mddev->bio_set = NULL;
> @@ -5888,6 +5944,14 @@ void md_stop(struct mddev *mddev)
>  	 * This is called from dm-raid
>  	 */
>  	__md_stop(mddev);
> +	if (mddev->flush_ids) {
> +		kfree(mddev->flush_ids);
> +		mddev->flush_ids = NULL;
> +	}
> +	if (mddev->flush_info) {
> +		kfree(mddev->flush_info);
> +		mddev->flush_info = NULL;
> +	}
>  	if (mddev->bio_set) {
>  		bioset_free(mddev->bio_set);
>  		mddev->bio_set = NULL;
> @@ -6856,6 +6920,7 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks)
>  {
>  	int rv;
>  	struct md_rdev *rdev;
> +	struct flush_info *new;
>  	/* change the number of raid disks */
>  	if (mddev->pers->check_reshape == NULL)
>  		return -EINVAL;
> @@ -6884,10 +6949,21 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks)
>  	else if (mddev->delta_disks > 0)
>  		mddev->reshape_backwards = 0;
>
> +	new = kzalloc((sizeof(struct flush_info) + sizeof(struct bio*) *
> +			raid_disks) * NR_FLUSHS, GFP_KERNEL);
> +	if (!new)
> +		return -ENOMEM;
> +
>  	rv = mddev->pers->check_reshape(mddev);
>  	if (rv < 0) {
>  		mddev->delta_disks = 0;
>  		mddev->reshape_backwards = 0;
> +		kfree(new);
> +	} else {
> +		spin_lock(&mddev->flush_lock);
> +		kfree(mddev->flush_info);
> +		mddev->flush_info = new;
> +		spin_unlock(&mddev->flush_lock);
>  	}
>  	return rv;
>  }
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index fbc925c..83bb62c 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -252,6 +252,16 @@ enum mddev_sb_flags {
>  	MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */
>  };
>
> +#define NR_FLUSHS 16
> +struct flush_info {
> +	struct mddev		*mddev;
> +	struct work_struct	flush_work;
> +	int			flush_id;
> +	atomic_t		flush_pending;
> +	struct bio		*fbio;
> +	struct bio		*bios[0];
> +};
> +
>  struct mddev {
>  	void				*private;
>  	struct md_personality		*pers;
> @@ -399,7 +409,6 @@ struct mddev {
>  	struct work_struct del_work;	/* used for delayed sysfs removal */
>
>  	/* "lock" protects:
> -	 *   flush_bio transition from NULL to !NULL
>  	 *   rdev superblocks, events
>  	 *   clearing MD_CHANGE_*
>  	 *   in_sync - and related safemode and MD_CHANGE changes
> @@ -457,13 +466,11 @@ struct mddev {
>  					 * metadata and bitmap writes
>  					 */
>
> -	/* Generic flush handling.
> -	 * The last to finish preflush schedules a worker to submit
> -	 * the rest of the request (without the REQ_PREFLUSH flag).
> -	 */
> -	struct bio *flush_bio;
> -	atomic_t flush_pending;
> -	struct work_struct flush_work;
> +	struct flush_info	*flush_info;
> +	unsigned long		*flush_ids;
> +	wait_queue_head_t	flush_queue;
> +	spinlock_t		flush_lock;
> +
>  	struct work_struct event_work;	/* used by dm to report failure event */
>  	void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
>  	struct md_cluster_info		*cluster_info;
> --
> 2.7.4