[PATCH V4 1/1] MD: fix lock contention for flush bios

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



There is a lock contention when there are many processes which send flush bios
to md device. eg. Create many lvs on one raid device and mkfs.xfs on each lv.

Now it just can handle flush request sequentially. It needs to wait mddev->flush_bio
to be NULL, otherwise get mddev->lock.

This patch remove mddev->flush_bio and handle flush bio asynchronously.
I did a test with command dbench -s 128 -t 300. This is the test result:

=================Without the patch============================
 Operation                Count    AvgLat    MaxLat
 --------------------------------------------------
 Flush                   305893     0.648    49.970
 Close                  3204015     0.013    36.673
 LockX                    14210     0.003     0.235
 Rename                  184914     0.038    25.821
 ReadX                  6844043     0.004     3.561
 WriteX                 2155550    17.461    84.646
 Unlink                  882276     0.152    37.705
 UnlockX                  14210     0.002     0.239
 FIND_FIRST             1528913     0.015    21.999
 SET_FILE_INFORMATION    355733     0.007     1.118
 QUERY_FILE_INFORMATION  689672     0.002     0.376
 QUERY_PATH_INFORMATION 3956994     0.006    49.292
 QUERY_FS_INFORMATION    725497     0.004     0.533
 NTCreateX              4363847     0.057    46.578

Throughput 453.259 MB/sec (sync open)  128 clients  128 procs  max_latency=84.650 m

 Operation                Count    AvgLat    MaxLat
 --------------------------------------------------
 Flush                   390388     0.036     4.742
 Close                  4088634     0.008    26.147
 LockX                    18110     0.003     0.148
 Rename                  235946     0.042    16.783
 ReadX                  8732131     0.004     2.327
 WriteX                 2750698    13.728    67.157
 Unlink                 1126423     0.148    47.070
 UnlockX                  18110     0.002     0.151
 FIND_FIRST             1951482     0.015    32.534
 SET_FILE_INFORMATION    453812     0.008     1.433
 QUERY_FILE_INFORMATION  879847     0.002     0.540
 QUERY_PATH_INFORMATION 5050251     0.006    47.689
 QUERY_FS_INFORMATION    926105     0.004     0.548
 NTCreateX              5569313     0.044    46.724
Throughput 578.346 MB/sec (sync open)  128 clients  128 procs  max_latency=67.161 ms

V4: use address of fbio to do hash to choose free flush info.
V3:
Shaohua suggests mempool is overkill. In v3 it allocs memory during creating raid device
and uses a simple bitmap to record which resource is free.

Fix a bug from v2. It should set flush_pending to 1 at first.

V2:
Neil pointed out two problems. One is counting error problem and another is return value
when allocat memory fails.
1. counting error problem
This isn't safe.  It is only safe to call rdev_dec_pending() on rdevs
that you previously called
                          atomic_inc(&rdev->nr_pending);
If an rdev was added to the list between the start and end of the flush,
this will do something bad.

Now it doesn't use bio_chain. It uses specified call back function for each
flush bio.
2. Returned on IO error when kmalloc fails is wrong.
I use mempool suggested by Neil in V2
3. Fixed some places pointed by Guoqing

The test is done on one raid1 with 2 SSDS

Suggested-by: Ming Lei <ming.lei@xxxxxxxxxx>
Signed-off-by: Xiao Ni <xni@xxxxxxxxxx>
---
 drivers/md/md.c | 188 +++++++++++++++++++++++++++++++++++++++-----------------
 drivers/md/md.h |  23 ++++---
 2 files changed, 149 insertions(+), 62 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 254e44e..c5084c6 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -367,6 +367,7 @@ void mddev_suspend(struct mddev *mddev)
 	set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags);
 	smp_mb__after_atomic();
 	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
+	wait_event(mddev->sb_wait, atomic_read(&mddev->flush_io) == 0);
 	mddev->pers->quiesce(mddev, 1);
 	clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags);
 	wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));
@@ -412,30 +413,77 @@ static int md_congested(void *data, int bits)
 /*
  * Generic flush handling for md
  */
+static void submit_flushes(struct work_struct *ws)
+{
+	struct flush_info *fi = container_of(ws, struct flush_info, flush_work);
+	struct mddev *mddev = fi->mddev;
+	struct bio *fbio = fi->fbio;
+
+	fi->fbio = NULL;
+	atomic_dec(&mddev->flush_io);
+	wake_up(&fi->flush_queue);
+	wake_up(&mddev->sb_wait);
 
-static void md_end_flush(struct bio *bio)
+	fbio->bi_opf &= ~REQ_PREFLUSH;
+	md_handle_request(mddev, fbio);
+}
+
+static void rdev_end_flush(struct bio *bi)
 {
-	struct md_rdev *rdev = bio->bi_private;
-	struct mddev *mddev = rdev->mddev;
+	struct flush_info *fi = bi->bi_private;
+	struct mddev *mddev = fi->mddev;
+	struct bio *fbio = fi->fbio;
+	struct md_rdev *rdev;
 
-	rdev_dec_pending(rdev, mddev);
+	rcu_read_lock();
+	rdev_for_each_rcu(rdev, mddev)
+		if (fi->bios[rdev->raid_disk] == bi) {
+			fi->bios[rdev->raid_disk] = NULL;
+			rdev_dec_pending(rdev, mddev);
+			break;
+		}
+	rcu_read_unlock();
 
-	if (atomic_dec_and_test(&mddev->flush_pending)) {
-		/* The pre-request flush has finished */
-		queue_work(md_wq, &mddev->flush_work);
+	if (atomic_dec_and_test(&fi->flush_pending)) {
+		if (fbio->bi_iter.bi_size == 0) {
+			/* an empty barrier - all done */
+			bio_endio(fbio);
+			fi->fbio = NULL;
+			atomic_dec(&mddev->flush_io);
+			wake_up(&fi->flush_queue);
+			wake_up(&mddev->sb_wait);
+		} else {
+			INIT_WORK(&fi->flush_work, submit_flushes);
+			queue_work(md_wq, &fi->flush_work);
+		}
 	}
-	bio_put(bio);
-}
 
-static void md_submit_flush_data(struct work_struct *ws);
+	bio_put(bi);
+}
 
-static void submit_flushes(struct work_struct *ws)
+void md_flush_request(struct mddev *mddev, struct bio *fbio)
 {
-	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
 	struct md_rdev *rdev;
+	struct flush_info *fi;
+	char *p = (char*)mddev->flush_info;
+	int index;
+
+	atomic_inc(&mddev->flush_io);
+
+	index = jhash((void*)fbio, sizeof(fbio), 0) % NR_FLUSHS;
+	fi = (struct flush_info *)(p + index * (sizeof(struct flush_info)
+			+ mddev->raid_disks * sizeof(struct bio*)));
+
+	spin_lock_irq(&fi->flush_lock);
+	wait_event_lock_irq(fi->flush_queue,
+			    !fi->fbio,
+			    fi->flush_lock);
+	fi->fbio = fbio;
+	spin_unlock_irq(&fi->flush_lock);
+
+	fi->mddev = mddev;
+	atomic_set(&fi->flush_pending, 1);
 
-	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
-	atomic_set(&mddev->flush_pending, 1);
 	rcu_read_lock();
 	rdev_for_each_rcu(rdev, mddev)
 		if (rdev->raid_disk >= 0 &&
@@ -448,56 +496,36 @@ static void submit_flushes(struct work_struct *ws)
 			atomic_inc(&rdev->nr_pending);
 			atomic_inc(&rdev->nr_pending);
 			rcu_read_unlock();
+
 			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
-			bi->bi_end_io = md_end_flush;
-			bi->bi_private = rdev;
+			bi->bi_end_io = rdev_end_flush;
+			bi->bi_private = fi;
 			bio_set_dev(bi, rdev->bdev);
 			bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
-			atomic_inc(&mddev->flush_pending);
+
+			fi->bios[rdev->raid_disk] = bi;
+			atomic_inc(&fi->flush_pending);
 			submit_bio(bi);
+
 			rcu_read_lock();
 			rdev_dec_pending(rdev, mddev);
 		}
 	rcu_read_unlock();
-	if (atomic_dec_and_test(&mddev->flush_pending))
-		queue_work(md_wq, &mddev->flush_work);
-}
 
-static void md_submit_flush_data(struct work_struct *ws)
-{
-	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
-	struct bio *bio = mddev->flush_bio;
-
-	/*
-	 * must reset flush_bio before calling into md_handle_request to avoid a
-	 * deadlock, because other bios passed md_handle_request suspend check
-	 * could wait for this and below md_handle_request could wait for those
-	 * bios because of suspend check
-	 */
-	mddev->flush_bio = NULL;
-	wake_up(&mddev->sb_wait);
-
-	if (bio->bi_iter.bi_size == 0)
-		/* an empty barrier - all done */
-		bio_endio(bio);
-	else {
-		bio->bi_opf &= ~REQ_PREFLUSH;
-		md_handle_request(mddev, bio);
+	if (atomic_dec_and_test(&fi->flush_pending)) {
+		if (fbio->bi_iter.bi_size == 0) {
+			/* an empty barrier - all done */
+			bio_endio(fbio);
+			fi->fbio = NULL;
+			atomic_dec(&mddev->flush_io);
+			wake_up(&fi->flush_queue);
+			wake_up(&mddev->sb_wait);
+		} else {
+			INIT_WORK(&fi->flush_work, submit_flushes);
+			queue_work(md_wq, &fi->flush_work);
+		}
 	}
 }
-
-void md_flush_request(struct mddev *mddev, struct bio *bio)
-{
-	spin_lock_irq(&mddev->lock);
-	wait_event_lock_irq(mddev->sb_wait,
-			    !mddev->flush_bio,
-			    mddev->lock);
-	mddev->flush_bio = bio;
-	spin_unlock_irq(&mddev->lock);
-
-	INIT_WORK(&mddev->flush_work, submit_flushes);
-	queue_work(md_wq, &mddev->flush_work);
-}
 EXPORT_SYMBOL(md_flush_request);
 
 static inline struct mddev *mddev_get(struct mddev *mddev)
@@ -555,7 +583,6 @@ void mddev_init(struct mddev *mddev)
 	atomic_set(&mddev->openers, 0);
 	atomic_set(&mddev->active_io, 0);
 	spin_lock_init(&mddev->lock);
-	atomic_set(&mddev->flush_pending, 0);
 	init_waitqueue_head(&mddev->sb_wait);
 	init_waitqueue_head(&mddev->recovery_wait);
 	mddev->reshape_position = MaxSector;
@@ -5509,6 +5536,27 @@ int md_run(struct mddev *mddev)
 			goto abort;
 		}
 	}
+	if (mddev->flush_info == NULL) {
+		int index = 0;
+		char *p;
+		struct flush_info *fi;
+		mddev->flush_info = kzalloc((sizeof(struct flush_info) +
+						sizeof(struct bio*) * mddev->raid_disks) *
+						NR_FLUSHS, GFP_KERNEL);
+		if (!mddev->flush_info) {
+			err = -ENOMEM;
+			goto abort;
+		}
+
+		p = (char*)mddev->flush_info;
+		while (index < NR_FLUSHS) {
+			fi = (struct flush_info *)(p + index * (sizeof(struct flush_info)
+				+ mddev->raid_disks * sizeof(struct bio*)));
+			spin_lock_init(&fi->flush_lock);
+			init_waitqueue_head(&fi->flush_queue);
+			index++;
+		}
+	}
 
 	spin_lock(&pers_lock);
 	pers = find_pers(mddev->level, mddev->clevel);
@@ -5668,6 +5716,9 @@ int md_run(struct mddev *mddev)
 	return 0;
 
 abort:
+	if (mddev->flush_info) {
+		kfree(mddev->flush_info);
+	}
 	if (mddev->bio_set) {
 		bioset_free(mddev->bio_set);
 		mddev->bio_set = NULL;
@@ -5888,6 +5939,10 @@ void md_stop(struct mddev *mddev)
 	 * This is called from dm-raid
 	 */
 	__md_stop(mddev);
+	if (mddev->flush_info) {
+		kfree(mddev->flush_info);
+		mddev->flush_info = NULL;
+	}
 	if (mddev->bio_set) {
 		bioset_free(mddev->bio_set);
 		mddev->bio_set = NULL;
@@ -6854,8 +6909,10 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
 
 static int update_raid_disks(struct mddev *mddev, int raid_disks)
 {
-	int rv;
+	int rv, index;
 	struct md_rdev *rdev;
+	struct flush_info *new, *fi;
+	char *p;
 	/* change the number of raid disks */
 	if (mddev->pers->check_reshape == NULL)
 		return -EINVAL;
@@ -6884,10 +6941,31 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks)
 	else if (mddev->delta_disks > 0)
 		mddev->reshape_backwards = 0;
 
+	new = kzalloc((sizeof(struct flush_info) + sizeof(struct bio*) *
+				raid_disks) * NR_FLUSHS, GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
+
+	p = (char*)new;
+	index = 0;
+	while (index < NR_FLUSHS) {
+		fi = (struct flush_info *)(p + index * (sizeof(struct flush_info)
+				+ raid_disks * sizeof(struct bio*)));
+		spin_lock_init(&fi->flush_lock);
+		init_waitqueue_head(&fi->flush_queue);
+		index++;
+	}
+
 	rv = mddev->pers->check_reshape(mddev);
 	if (rv < 0) {
 		mddev->delta_disks = 0;
 		mddev->reshape_backwards = 0;
+		kfree(new);
+	} else {
+		mddev_suspend(mddev);
+		kfree(mddev->flush_info);
+		mddev->flush_info = new;
+		mddev_resume(mddev);
 	}
 	return rv;
 }
diff --git a/drivers/md/md.h b/drivers/md/md.h
index fbc925c..24f22f5 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -252,6 +252,17 @@ enum mddev_sb_flags {
 	MD_SB_NEED_REWRITE,	/* metadata write needs to be repeated */
 };
 
+#define NR_FLUSHS 16
+struct flush_info {
+	struct mddev			*mddev;
+	struct work_struct		flush_work;
+	atomic_t			flush_pending;
+	spinlock_t			flush_lock;
+	wait_queue_head_t		flush_queue;
+	struct bio			*fbio;
+	struct bio			*bios[0];
+};
+
 struct mddev {
 	void				*private;
 	struct md_personality		*pers;
@@ -399,7 +410,6 @@ struct mddev {
 	struct work_struct del_work;	/* used for delayed sysfs removal */
 
 	/* "lock" protects:
-	 *   flush_bio transition from NULL to !NULL
 	 *   rdev superblocks, events
 	 *   clearing MD_CHANGE_*
 	 *   in_sync - and related safemode and MD_CHANGE changes
@@ -457,13 +467,12 @@ struct mddev {
 						   * metadata and bitmap writes
 						   */
 
-	/* Generic flush handling.
-	 * The last to finish preflush schedules a worker to submit
-	 * the rest of the request (without the REQ_PREFLUSH flag).
+	/*
+	 * Generic flush handling.
 	 */
-	struct bio *flush_bio;
-	atomic_t flush_pending;
-	struct work_struct flush_work;
+	struct flush_info		*flush_info;
+	atomic_t			flush_io;
+
 	struct work_struct event_work;	/* used by dm to report failure event */
 	void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
 	struct md_cluster_info		*cluster_info;
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [Linux RAID Wiki]     [ATA RAID]     [Linux SCSI Target Infrastructure]     [Linux Block]     [Linux IDE]     [Linux SCSI]     [Linux Hams]     [Device Mapper]     [Device Mapper Cryptographics]     [Kernel]     [Linux Admin]     [Linux Net]     [GFS]     [RPM]     [git]     [Yosemite Forum]


  Powered by Linux