Re: [RFC PATCH V2] MD: fix lock contention for flush bios

On 02/26/2018 03:11 AM, Shaohua Li wrote:
On Wed, Feb 21, 2018 at 12:56:48PM +0800, Xiao Ni wrote:
There is lock contention when many processes send flush bios to an md
device, e.g. when creating many LVs on one RAID device and running mkfs.xfs
on each LV.

Currently flush requests can only be handled sequentially: each request must
take mddev->lock and wait for mddev->flush_bio to become NULL before it can
proceed.

This patch removes mddev->flush_bio and handles flush bios asynchronously.
I ran a test with the command dbench -s 128 -t 4800. This is the test result:

=================Without the patch============================
  Operation      Count    AvgLat    MaxLat
  ----------------------------------------
  NTCreateX     129643     4.792 13249.555
  Close         101064     0.884 12500.318
  Rename          5498     2.364  8250.734
  Unlink         21305     6.728  9749.328
  Qpathinfo     118012     0.041   999.884
  Qfileinfo      26676     0.004     0.261
  Qfsinfo        19712     0.009     0.490
  Sfileinfo      12247     0.014     0.614
  Find           43408     0.025     0.528
  WriteX         90174  6622.484 66249.763
  ReadX         194811     0.022  2748.778
  LockX            392     0.007     0.041
  UnlockX          392     0.003     0.019
  Flush           9970  1694.938 37749.263

Throughput 1.05396 MB/sec (sync open)  128 clients  128 procs  max_latency=66249.775 ms

=====================With the patch===========================
  Operation      Count    AvgLat    MaxLat
  ----------------------------------------
  NTCreateX     903706     3.345  4500.722
  Close         667882     0.128  4500.736
  Rename         38387     0.566   750.312
  Unlink        178731    13.415  5249.599
  Qpathinfo     824087     1.129  4500.394
  Qfileinfo     145055     0.004     1.010
  Qfsinfo       147967     0.011     1.565
  Sfileinfo      74359     0.023     1.439
  Find          315728     0.096  2748.048
  WriteX        447580  1303.313  4000.197
  ReadX        1420065     0.010     3.587
  LockX           3012     0.006     0.415
  UnlockX         3012     0.004     0.067
  Flush          62019   398.955  1251.283

Throughput 5.8709 MB/sec (sync open)  128 clients  128 procs  max_latency=5249.611 ms

V2:
Neil pointed out two problems: one is a counting error, and the other is the
return value when memory allocation fails.
1. Counting error:
This isn't safe.  It is only safe to call rdev_dec_pending() on rdevs
that you previously called
                           atomic_inc(&rdev->nr_pending);
If an rdev was added to the list between the start and end of the flush,
this will do something bad.
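
In other words, the reference pairing must always look like this (a two-line
sketch; this is the pairing the patch below uses):

	atomic_inc(&rdev->nr_pending);	/* taken before issuing the flush bio */
	...
	rdev_dec_pending(rdev, mddev);	/* dropped in that bio's completion path */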

Now it doesn't use bio_chain; it uses a dedicated callback function for each
flush bio.
2. Returning an I/O error when kmalloc fails is wrong.
I use a mempool in V2, as Neil suggested.
3. Fixed some places pointed out by Guoqing.

The test was done on one RAID10 array with 4 SSDs.

Suggested-by: Ming Lei <ming.lei@xxxxxxxxxx>
Signed-off-by: Xiao Ni <xni@xxxxxxxxxx>
---
  drivers/md/md.c | 152 ++++++++++++++++++++++++++++++++++++--------------------
  drivers/md/md.h |  22 +++++---
  2 files changed, 112 insertions(+), 62 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 0a9ed51..60c2c17 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -130,6 +130,24 @@ static inline int speed_max(struct mddev *mddev)
  		mddev->sync_speed_max : sysctl_speed_limit_max;
  }
+static void *flush_info_alloc(gfp_t gfp_flags, void *data)
+{
+	return kzalloc(sizeof(struct flush_info), gfp_flags);
+}
+static void flush_info_free(void *flush_info, void *data)
+{
+	kfree(flush_info);
+}
+
+static void *flush_bio_alloc(gfp_t gfp_flags, void *data)
+{
+	return kzalloc(sizeof(struct flush_bio), gfp_flags);
+}
+static void flush_bio_free(void *flush_bio, void *data)
+{
+	kfree(flush_bio);
+}
+

A mempool sounds like overkill here. You could simply statically allocate an
array of the current data structure and let each flush bio hash to one slot of
the array. I bet that would only change a few lines of the current code and fix
the lock contention.
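Something like this, I guess (a rough sketch of the idea only; the slot
structure, NR_FLUSH_SLOTS, and the hash choice are all made up here, not
existing md code):

#include <linux/hash.h>
#include <linux/log2.h>

#define NR_FLUSH_SLOTS 16		/* made-up array size */

struct flush_slot {
	struct bio		*flush_bio;	/* in-flight flush, or NULL */
	atomic_t		flush_pending;
	struct work_struct	flush_work;
};

static struct flush_slot flush_slots[NR_FLUSH_SLOTS];

/* Hash each flush bio to one slot: only bios that collide on the
 * same slot serialize against each other, instead of every flush
 * serializing on the single mddev->flush_bio. */
static struct flush_slot *flush_slot(struct bio *bio)
{
	return &flush_slots[hash_ptr(bio, ilog2(NR_FLUSH_SLOTS))];
}
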
Hi Shaohua

At first I thought about allocating that memory while creating the RAID device.
This patch needs to allocate a struct flush_bio for each rdev, so if the RAID
device is created with many disks (max_disks is currently 1920), it would need
max_disks * number-of-concurrent-flushes flush_bios preallocated.

For example, if the RAID device is created with 1000 disks and we want to allow
8 concurrent flushes, the amount of memory is:

sizeof(struct flush_bio) is 16 bytes
pages = 1000 * 16 * 8 / 4096 ≈ 31

Because of this I chose a memory pool. When the flush bios finish, most of the
memory can be freed.

  static struct ctl_table_header *raid_table_header;

  static struct ctl_table raid_table[] = {
@@ -412,30 +430,53 @@ static int md_congested(void *data, int bits)
  /*
   * Generic flush handling for md
   */
+static void submit_flushes(struct work_struct *ws)
+{
+	struct flush_info *fi = container_of(ws, struct flush_info, flush_work);
+	struct mddev *mddev = fi->mddev;
+	struct bio *bio = fi->bio;
+
+	bio->bi_opf &= ~REQ_PREFLUSH;
+	md_handle_request(mddev, bio);
+
+	mempool_free(fi, mddev->flush_pool);
+}
-static void md_end_flush(struct bio *bio)
+static void md_end_flush(struct bio *fbio)
  {
-	struct md_rdev *rdev = bio->bi_private;
-	struct mddev *mddev = rdev->mddev;
+	struct flush_bio *fb = fbio->bi_private;
+	struct md_rdev *rdev = fb->rdev;
+	struct flush_info *fi = fb->fi;
+	struct bio *bio = fi->bio;
+	struct mddev *mddev = fi->mddev;

  	rdev_dec_pending(rdev, mddev);

-	if (atomic_dec_and_test(&mddev->flush_pending)) {
-		/* The pre-request flush has finished */
-		queue_work(md_wq, &mddev->flush_work);
+	if (atomic_dec_and_test(&fi->flush_pending)) {
+		if (bio->bi_iter.bi_size == 0)
+			/* an empty barrier - all done */
+			bio_endio(bio);
+		else {
+			INIT_WORK(&fi->flush_work, submit_flushes);
+			queue_work(md_wq, &fi->flush_work);
+		}
  	}
-	bio_put(bio);
-}
-static void md_submit_flush_data(struct work_struct *ws);
+	mempool_free(fb, mddev->flush_bio_pool);
+	bio_put(fbio);
+}
-static void submit_flushes(struct work_struct *ws)
+void md_flush_request(struct mddev *mddev, struct bio *bio)
  {
-	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
  	struct md_rdev *rdev;
+	struct flush_info *fi;
+
+	fi = mempool_alloc(mddev->flush_pool, GFP_NOIO);
+
+	fi->bio = bio;
+	fi->mddev = mddev;
+	atomic_set(&fi->flush_pending, 0);

-	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
-	atomic_set(&mddev->flush_pending, 1);
  	rcu_read_lock();
  	rdev_for_each_rcu(rdev, mddev)
  		if (rdev->raid_disk >= 0 &&
@@ -445,58 +486,28 @@ static void submit_flushes(struct work_struct *ws)
  			 * we reclaim rcu_read_lock
  			 */
  			struct bio *bi;
+			struct flush_bio *fb;
  			atomic_inc(&rdev->nr_pending);
  			atomic_inc(&rdev->nr_pending);
  			rcu_read_unlock();
+
+			fb = mempool_alloc(mddev->flush_bio_pool, GFP_NOIO);
+			fb->fi = fi;
+			fb->rdev = rdev;
+
  			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
-			bi->bi_end_io = md_end_flush;
-			bi->bi_private = rdev;
  			bio_set_dev(bi, rdev->bdev);
+			bi->bi_end_io = md_end_flush;
+			bi->bi_private = fb;
  			bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
-			atomic_inc(&mddev->flush_pending);
+
+			atomic_inc(&fi->flush_pending);
  			submit_bio(bi);
You set flush_pending to 0, which is wrong: after submit_bio(), md_end_flush()
could be called immediately, before the other rdevs get a chance to go through
this loop.

Got it. I'll initialize it to 1.
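Roughly like this (a sketch of the corrected flow; the bio allocation and the
end-of-flush handling shown in the patch are elided here):

	atomic_set(&fi->flush_pending, 1);	/* bias held by the submitter */

	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev)
		if (rdev->raid_disk >= 0 &&
		    !test_bit(Faulty, &rdev->flags)) {
			/* one reference per flush bio, dropped in
			 * md_end_flush() */
			atomic_inc(&fi->flush_pending);
			submit_bio(bi);
		}
	rcu_read_unlock();

	/* drop the submitter's bias; if all flush bios have already
	 * completed, finish the flush from here instead */
	if (atomic_dec_and_test(&fi->flush_pending))
		queue_work(md_wq, &fi->flush_work);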

Best Regards
Xiao

Thanks,
Shaohua
+
  			rcu_read_lock();
  			rdev_dec_pending(rdev, mddev);
  		}
  	rcu_read_unlock();
-	if (atomic_dec_and_test(&mddev->flush_pending))
-		queue_work(md_wq, &mddev->flush_work);
-}
-
-static void md_submit_flush_data(struct work_struct *ws)
-{
-	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
-	struct bio *bio = mddev->flush_bio;
-
-	/*
-	 * must reset flush_bio before calling into md_handle_request to avoid a
-	 * deadlock, because other bios passed md_handle_request suspend check
-	 * could wait for this and below md_handle_request could wait for those
-	 * bios because of suspend check
-	 */
-	mddev->flush_bio = NULL;
-	wake_up(&mddev->sb_wait);
-
-	if (bio->bi_iter.bi_size == 0)
-		/* an empty barrier - all done */
-		bio_endio(bio);
-	else {
-		bio->bi_opf &= ~REQ_PREFLUSH;
-		md_handle_request(mddev, bio);
-	}
-}
-
-void md_flush_request(struct mddev *mddev, struct bio *bio)
-{
-	spin_lock_irq(&mddev->lock);
-	wait_event_lock_irq(mddev->sb_wait,
-			    !mddev->flush_bio,
-			    mddev->lock);
-	mddev->flush_bio = bio;
-	spin_unlock_irq(&mddev->lock);
-
-	INIT_WORK(&mddev->flush_work, submit_flushes);
-	queue_work(md_wq, &mddev->flush_work);
  }
  EXPORT_SYMBOL(md_flush_request);
@@ -555,7 +566,6 @@ void mddev_init(struct mddev *mddev)
  	atomic_set(&mddev->openers, 0);
  	atomic_set(&mddev->active_io, 0);
  	spin_lock_init(&mddev->lock);
-	atomic_set(&mddev->flush_pending, 0);
  	init_waitqueue_head(&mddev->sb_wait);
  	init_waitqueue_head(&mddev->recovery_wait);
  	mddev->reshape_position = MaxSector;
@@ -5501,6 +5511,22 @@ int md_run(struct mddev *mddev)
  			goto abort;
  		}
  	}
+	if (mddev->flush_pool == NULL) {
+		mddev->flush_pool = mempool_create(NR_FLUSH_INFOS, flush_info_alloc,
+						flush_info_free, mddev);
+		if (!mddev->flush_pool) {
+			err = -ENOMEM;
+			goto abort;
+		}
+	}
+	if (mddev->flush_bio_pool == NULL) {
+		mddev->flush_bio_pool = mempool_create(NR_FLUSH_BIOS, flush_bio_alloc,
+						flush_bio_free, mddev);
+		if (!mddev->flush_bio_pool) {
+			err = -ENOMEM;
+			goto abort;
+		}
+	}

  	spin_lock(&pers_lock);
  	pers = find_pers(mddev->level, mddev->clevel);
@@ -5665,6 +5691,14 @@ int md_run(struct mddev *mddev)
  	return 0;

  abort:
+	if (mddev->flush_bio_pool) {
+		mempool_destroy(mddev->flush_bio_pool);
+		mddev->flush_bio_pool = NULL;
+	}
+	if (mddev->flush_pool) {
+		mempool_destroy(mddev->flush_pool);
+		mddev->flush_pool = NULL;
+	}
  	if (mddev->bio_set) {
  		bioset_free(mddev->bio_set);
  		mddev->bio_set = NULL;
@@ -5867,6 +5901,14 @@ void md_stop(struct mddev *mddev)
  	 * This is called from dm-raid
  	 */
  	__md_stop(mddev);
+	if (mddev->flush_bio_pool) {
+		mempool_destroy(mddev->flush_bio_pool);
+		mddev->flush_bio_pool = NULL;
+	}
+	if (mddev->flush_pool) {
+		mempool_destroy(mddev->flush_pool);
+		mddev->flush_pool = NULL;
+	}
  	if (mddev->bio_set) {
  		bioset_free(mddev->bio_set);
  		mddev->bio_set = NULL;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 7d6bcf0..1e40824 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -252,6 +252,19 @@ enum mddev_sb_flags {
  	MD_SB_NEED_REWRITE,	/* metadata write needs to be repeated */
  };
+#define NR_FLUSH_INFOS 8
+#define NR_FLUSH_BIOS 64
+struct flush_info {
+	struct bio			*bio;
+	struct mddev			*mddev;
+	struct work_struct		flush_work;
+	atomic_t			flush_pending;
+};
+struct flush_bio {
+	struct flush_info *fi;
+	struct md_rdev *rdev;
+};
+
  struct mddev {
  	void				*private;
  	struct md_personality		*pers;
@@ -457,13 +470,8 @@ struct mddev {
  						   * metadata and bitmap writes
  						   */

-	/* Generic flush handling.
-	 * The last to finish preflush schedules a worker to submit
-	 * the rest of the request (without the REQ_PREFLUSH flag).
-	 */
-	struct bio *flush_bio;
-	atomic_t flush_pending;
-	struct work_struct flush_work;
+	mempool_t			*flush_pool;
+	mempool_t			*flush_bio_pool;
  	struct work_struct event_work;	/* used by dm to report failure event */
  	void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
  	struct md_cluster_info		*cluster_info;
--
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html
