[RFC PATCH] raid10: barrier reimplementation

Most of this patch is based on Coly's md/raid1 patch
fd76863e37fe ("RAID1: a new I/O barrier implementation to remove resync
window") and several other raid1 barrier patches, such as:

07169fd478ed ("raid1: Replace raise_barrier/lower_barrier with
  freeze_array/unfreeze_array when reconfiguring the array")
43ac9b84a399 ("md/raid1: Use a new variable to count flighting sync
  requests")
f6eca2d43ed6 ("raid1: prevent freeze_array/wait_all_barriers deadlock")

I have done some basic tests with fio. The job file I used looks like this:

[global]
bsrange=4-16k
iodepth=2
size=1G
numjobs=1
[read]
ioengine=sync
rw=read
fsync=1
[write]
ioengine=psync
rw=write
fsync=1
[mixed]
ioengine=sync
rw=rw
fsync=1
[randmixed]
ioengine=libaio
rw=randrw
fsync=1

Normal I/O and disk plug/unplug while fio was running showed no
problems in my tests, but there are other scenarios that still need
to be tested. I am sending this version for comments first, before I
finish all the tests I have in mind.
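
For reviewers who want to poke at the bucket mapping in isolation,
here is a quick stand-alone user-space sketch of the sector_to_idx()
hashing used by this patch. It is only an illustration: it assumes
4KB pages and 4-byte atomic_t, and it open-codes a hash_64()-style
golden-ratio hash so the example builds outside the kernel tree; the
in-tree hash_long() and defines are the authoritative ones.

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT			12	/* assume 4KB pages */
#define BARRIER_UNIT_SECTOR_BITS	17	/* 64MB units of 512B sectors */
#define BARRIER_BUCKETS_NR_BITS		(PAGE_SHIFT - 2) /* ilog2(sizeof(atomic_t)) == 2 */
#define BARRIER_BUCKETS_NR		(1 << BARRIER_BUCKETS_NR_BITS)

/* hash_64()-style golden-ratio multiply, keep the top 'bits' bits */
static inline unsigned int hash64(uint64_t val, unsigned int bits)
{
	return (unsigned int)((val * 0x61C8864680B583EBULL) >> (64 - bits));
}

/* Same mapping as the sector_to_idx() added to raid1-10.c */
static inline int sector_to_idx(uint64_t sector)
{
	return hash64(sector >> BARRIER_UNIT_SECTOR_BITS,
		      BARRIER_BUCKETS_NR_BITS);
}

int main(void)
{
	/* Sectors inside the same 64MB barrier unit always share a bucket */
	uint64_t sectors[] = { 0, 8, 1 << 17, (1 << 17) + 8, 1 << 18, 1 << 30 };
	unsigned int i;

	for (i = 0; i < sizeof(sectors) / sizeof(sectors[0]); i++)
		printf("sector %10llu -> bucket %4d (of %d)\n",
		       (unsigned long long)sectors[i],
		       sector_to_idx(sectors[i]), BARRIER_BUCKETS_NR);
	return 0;
}

With 64MB barrier units hashed into BARRIER_BUCKETS_NR buckets, regular
I/O and resync only serialize when they land in the same bucket, rather
than against one array-wide barrier as before.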

Signed-off-by: Lidong Zhong <lzhong@xxxxxxxx>
---
 drivers/md/raid1-10.c |   7 ++
 drivers/md/raid1.h    |   5 -
 drivers/md/raid10.c   | 311 ++++++++++++++++++++++++++++++++++++--------------
 drivers/md/raid10.h   |  36 +++++-
 4 files changed, 262 insertions(+), 97 deletions(-)

diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
index 400001b815db..6d2ee38b0a15 100644
--- a/drivers/md/raid1-10.c
+++ b/drivers/md/raid1-10.c
@@ -80,3 +80,10 @@ static void md_bio_reset_resync_pages(struct bio *bio, struct resync_pages *rp,
 		size -= len;
 	} while (idx++ < RESYNC_PAGES && size > 0);
 }
+
+/* sector mapped to bucket */
+static inline int sector_to_idx(sector_t sector)
+{
+	return hash_long(sector >> BARRIER_UNIT_SECTOR_BITS,
+			BARRIER_BUCKETS_NR_BITS);
+}
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index eb84bc68e2fd..f8f5d2886dc1 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -210,9 +210,4 @@ enum r1bio_state {
 	R1BIO_FailFast,
 };
 
-static inline int sector_to_idx(sector_t sector)
-{
-	return hash_long(sector >> BARRIER_UNIT_SECTOR_BITS,
-			 BARRIER_BUCKETS_NR_BITS);
-}
 #endif
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index c5e6c60fc0d4..19b5d91f3f2e 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -97,8 +97,8 @@
  */
 static int max_queued_requests = 1024;
 
-static void allow_barrier(struct r10conf *conf);
-static void lower_barrier(struct r10conf *conf);
+static void allow_barrier(struct r10conf *conf, sector_t sector_nr);
+static void lower_barrier(struct r10conf *conf, sector_t sector_nr);
 static int _enough(struct r10conf *conf, int previous, int ignore);
 static int enough(struct r10conf *conf, int ignore);
 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
@@ -297,10 +297,11 @@ static void free_r10bio(struct r10bio *r10_bio)
 static void put_buf(struct r10bio *r10_bio)
 {
 	struct r10conf *conf = r10_bio->mddev->private;
+	sector_t sect = r10_bio->sector;
 
 	mempool_free(r10_bio, conf->r10buf_pool);
 
-	lower_barrier(conf);
+	lower_barrier(conf, sect);
 }
 
 static void reschedule_retry(struct r10bio *r10_bio)
@@ -308,10 +309,12 @@ static void reschedule_retry(struct r10bio *r10_bio)
 	unsigned long flags;
 	struct mddev *mddev = r10_bio->mddev;
 	struct r10conf *conf = mddev->private;
+	int idx;
 
+	idx = sector_to_idx(r10_bio->sector);
 	spin_lock_irqsave(&conf->device_lock, flags);
 	list_add(&r10_bio->retry_list, &conf->retry_list);
-	conf->nr_queued ++;
+	atomic_inc(&conf->nr_queued[idx]);
 	spin_unlock_irqrestore(&conf->device_lock, flags);
 
 	/* wake up frozen array... */
@@ -329,6 +332,7 @@ static void raid_end_bio_io(struct r10bio *r10_bio)
 {
 	struct bio *bio = r10_bio->master_bio;
 	struct r10conf *conf = r10_bio->mddev->private;
+	sector_t sect = bio->bi_iter.bi_sector;
 
 	if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
 		bio->bi_status = BLK_STS_IOERR;
@@ -338,7 +342,7 @@ static void raid_end_bio_io(struct r10bio *r10_bio)
 	 * Wake up any possible resync thread that waits for the device
 	 * to go idle.
 	 */
-	allow_barrier(conf);
+	allow_barrier(conf, sect);
 
 	free_r10bio(r10_bio);
 }
@@ -959,71 +963,177 @@ static void flush_pending_writes(struct r10conf *conf)
  *    there is no normal IO happeing.  It must arrange to call
  *    lower_barrier when the particular background IO completes.
  */
-
-static void raise_barrier(struct r10conf *conf, int force)
+static void raise_barrier(struct r10conf *conf, int force, sector_t sector_nr)
 {
-	BUG_ON(force && !conf->barrier);
+	int idx = sector_to_idx(sector_nr);
+
 	spin_lock_irq(&conf->resync_lock);
 
-	/* Wait until no block IO is waiting (unless 'force') */
-	wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
+	/* Wait until no block IO is waiting (unless 'force') */
+	wait_event_lock_irq(conf->wait_barrier,
+			    force || !atomic_read(&conf->nr_waiting[idx]),
 			    conf->resync_lock);
 
 	/* block any new IO from starting */
-	conf->barrier++;
-
-	/* Now wait for all pending IO to complete */
+	atomic_inc(&conf->barrier[idx]);
+	/*
+	 * In raise_barrier() we first increase conf->barrier[idx], then
+	 * check conf->nr_pending[idx]. In __wait_barrier() we first
+	 * increase conf->nr_pending[idx], then check conf->barrier[idx].
+	 * A memory barrier is needed here to make sure conf->nr_pending[idx]
+	 * won't be fetched before conf->barrier[idx] is increased. Otherwise
+	 * there will be a race between raise_barrier() and __wait_barrier().
+	 */
+	smp_mb__after_atomic();
+
+	/* We must wait while any of these conditions holds:
+	 * A: the array is in frozen state
+	 * B: conf->nr_pending[idx] is not 0, meaning regular I/O is still
+	 *    in flight in the corresponding I/O barrier bucket.
+	 * C: conf->barrier[idx] >= RESYNC_DEPTH, meaning the maximum resync
+	 *    count allowed on the current I/O barrier bucket is reached.
+	 */
 	wait_event_lock_irq(conf->wait_barrier,
-			    !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH,
+			    !conf->array_frozen &&
+			     !atomic_read(&conf->nr_pending[idx]) &&
+			     atomic_read(&conf->barrier[idx]) < RESYNC_DEPTH,
 			    conf->resync_lock);
 
+	atomic_inc(&conf->nr_sync_pending);
 	spin_unlock_irq(&conf->resync_lock);
 }
 
-static void lower_barrier(struct r10conf *conf)
+static void lower_barrier(struct r10conf *conf, sector_t sector_nr)
 {
-	unsigned long flags;
-	spin_lock_irqsave(&conf->resync_lock, flags);
-	conf->barrier--;
-	spin_unlock_irqrestore(&conf->resync_lock, flags);
+	int idx = sector_to_idx(sector_nr);
+
+	BUG_ON(atomic_read(&conf->barrier[idx]) <= 0);
+
+	atomic_dec(&conf->barrier[idx]);
+	atomic_dec(&conf->nr_sync_pending);
 	wake_up(&conf->wait_barrier);
 }
 
-static void wait_barrier(struct r10conf *conf)
+static void __wait_barrier(struct r10conf *conf, int idx)
 {
+	/*
+	 * We need to increase conf->nr_pending[idx] very early here,
+	 * then raise_barrier() can be blocked when it waits for
+	 * conf->nr_pending[idx] to be 0. Then we can avoid holding
+	 * conf->resync_lock when there is no barrier raised in same
+	 * barrier unit bucket. Also if the array is frozen, I/O
+	 * should be blocked until array is unfrozen.
+	 */
+	atomic_inc(&conf->nr_pending[idx]);
+	/*
+	 * In __wait_barrier() we first increase conf->nr_pending[idx], then
+	 * check conf->barrier[idx]. In raise_barrier() we first increase
+	 * conf->barrier[idx], then check conf->nr_pending[idx]. A memory
+	 * barrier is necessary here to make sure conf->barrier[idx] won't be
+	 * fetched before conf->nr_pending[idx] is increased. Otherwise there
+	 * will be a race between __wait_barrier() and raise_barrier().
+	 */
+	smp_mb__after_atomic();
+
+	/*
+	 * Don't worry about checking two atomic_t variables at the same
+	 * time here. If the array gets frozen (conf->array_frozen is 1)
+	 * while we check conf->barrier[idx], and conf->barrier[idx] is
+	 * 0, it is safe to return and let the I/O continue. Because the
+	 * array is frozen, all I/O returned here will eventually complete
+	 * or be queued, no race will happen. See code comment in
+	 * freeze_array().
+	 */
+	if (!READ_ONCE(conf->array_frozen) &&
+	    !atomic_read(&conf->barrier[idx]))
+		return;
+
+	/*
+	 * After holding conf->resync_lock, conf->nr_pending[idx]
+	 * should be decreased before waiting for barrier to drop.
+	 * Otherwise, we may encounter a race condition because
+	 * raise_barrier() might be waiting for conf->nr_pending[idx]
+	 * to be 0 at same time.
+	 */
 	spin_lock_irq(&conf->resync_lock);
-	if (conf->barrier) {
-		conf->nr_waiting++;
-		/* Wait for the barrier to drop.
-		 * However if there are already pending
-		 * requests (preventing the barrier from
-		 * rising completely), and the
-		 * pre-process bio queue isn't empty,
-		 * then don't wait, as we need to empty
-		 * that queue to get the nr_pending
-		 * count down.
-		 */
-		raid10_log(conf->mddev, "wait barrier");
-		wait_event_lock_irq(conf->wait_barrier,
-				    !conf->barrier ||
-				    (atomic_read(&conf->nr_pending) &&
-				     current->bio_list &&
-				     (!bio_list_empty(&current->bio_list[0]) ||
-				      !bio_list_empty(&current->bio_list[1]))),
-				    conf->resync_lock);
-		conf->nr_waiting--;
-		if (!conf->nr_waiting)
-			wake_up(&conf->wait_barrier);
-	}
-	atomic_inc(&conf->nr_pending);
+	atomic_inc(&conf->nr_waiting[idx]);
+	atomic_dec(&conf->nr_pending[idx]);
+	/*
+	 * In case freeze_array() is waiting for
+	 * get_unqueued_pending() == extra
+	 */
+	wake_up(&conf->wait_barrier);
+	/* Wait for the barrier in same barrier unit bucket to drop. */
+	wait_event_lock_irq(conf->wait_barrier,
+			    !conf->array_frozen &&
+			     !atomic_read(&conf->barrier[idx]),
+			    conf->resync_lock);
+	atomic_inc(&conf->nr_pending[idx]);
+	atomic_dec(&conf->nr_waiting[idx]);
 	spin_unlock_irq(&conf->resync_lock);
 }
 
-static void allow_barrier(struct r10conf *conf)
+static void wait_barrier(struct r10conf *conf, sector_t sector_nr)
 {
-	if ((atomic_dec_and_test(&conf->nr_pending)) ||
-			(conf->array_freeze_pending))
-		wake_up(&conf->wait_barrier);
+	int idx = sector_to_idx(sector_nr);
+	__wait_barrier(conf, idx);
+}
+
+static void wait_read_barrier(struct r10conf *conf, sector_t sector_nr)
+{
+	int idx = sector_to_idx(sector_nr);
+
+	/*
+	 * Very similar to __wait_barrier(). The difference is, for read
+	 * I/O we don't need to wait for sync I/O, but if the whole array
+	 * is frozen, the read I/O still has to wait until the array is
+	 * unfrozen. Since there is no ordering requirement on
+	 * conf->barrier[idx] here, a memory barrier is unnecessary as well.
+	 */
+	atomic_inc(&conf->nr_pending[idx]);
+
+	if (!READ_ONCE(conf->array_frozen))
+		return;
+
+	spin_lock_irq(&conf->resync_lock);
+	atomic_inc(&conf->nr_waiting[idx]);
+	atomic_dec(&conf->nr_pending[idx]);
+	/*
+	 * In case freeze_array() is waiting for
+	 * get_unqueued_pending() == extra
+	 */
+	wake_up(&conf->wait_barrier);
+	/* Wait for array to be unfrozen */
+	wait_event_lock_irq(conf->wait_barrier,
+			    !conf->array_frozen,
+			    conf->resync_lock);
+	atomic_inc(&conf->nr_pending[idx]);
+	atomic_dec(&conf->nr_waiting[idx]);
+	spin_unlock_irq(&conf->resync_lock);
+}
+
+static void __allow_barrier(struct r10conf *conf, int idx)
+{
+	atomic_dec(&conf->nr_pending[idx]);
+	wake_up(&conf->wait_barrier);
+}
+
+static void allow_barrier(struct r10conf *conf, sector_t sector_nr)
+{
+	int idx = sector_to_idx(sector_nr);
+	__allow_barrier(conf, idx);
+}
+
+static int get_unqueued_pending(struct r10conf *conf)
+{
+	int ret, idx;
+
+	ret = atomic_read(&conf->nr_sync_pending);
+	for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
+		ret += atomic_read(&conf->nr_pending[idx]) -
+			atomic_read(&conf->nr_queued[idx]);
+
+	return ret;
 }
 
 static void freeze_array(struct r10conf *conf, int extra)
@@ -1041,15 +1151,12 @@ static void freeze_array(struct r10conf *conf, int extra)
 	 * we continue.
 	 */
 	spin_lock_irq(&conf->resync_lock);
-	conf->array_freeze_pending++;
-	conf->barrier++;
-	conf->nr_waiting++;
+	conf->array_frozen = 1;
 	wait_event_lock_irq_cmd(conf->wait_barrier,
-				atomic_read(&conf->nr_pending) == conf->nr_queued+extra,
+				get_unqueued_pending(conf) == extra,
 				conf->resync_lock,
 				flush_pending_writes(conf));
 
-	conf->array_freeze_pending--;
 	spin_unlock_irq(&conf->resync_lock);
 }
 
@@ -1057,8 +1164,7 @@ static void unfreeze_array(struct r10conf *conf)
 {
 	/* reverse the effect of the freeze */
 	spin_lock_irq(&conf->resync_lock);
-	conf->barrier--;
-	conf->nr_waiting--;
+	conf->array_frozen = 0;
 	wake_up(&conf->wait_barrier);
 	spin_unlock_irq(&conf->resync_lock);
 }
@@ -1168,7 +1274,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
 	 * thread has put up a bar for new requests.
 	 * Continue immediately if no resync is active currently.
 	 */
-	wait_barrier(conf);
+	wait_read_barrier(conf, bio->bi_iter.bi_sector);
 
 	sectors = r10_bio->sectors;
 	while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
@@ -1179,12 +1285,12 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
 		 * pass
 		 */
 		raid10_log(conf->mddev, "wait reshape");
-		allow_barrier(conf);
+		allow_barrier(conf, bio->bi_iter.bi_sector);
 		wait_event(conf->wait_barrier,
 			   conf->reshape_progress <= bio->bi_iter.bi_sector ||
 			   conf->reshape_progress >= bio->bi_iter.bi_sector +
 			   sectors);
-		wait_barrier(conf);
+		wait_read_barrier(conf, bio->bi_iter.bi_sector);
 	}
 
 	rdev = read_balance(conf, r10_bio, &max_sectors);
@@ -1312,6 +1418,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 	struct md_rdev *blocked_rdev;
 	sector_t sectors;
 	int max_sectors;
+	sector_t sect = bio->bi_iter.bi_sector;
 
 	if ((mddev_is_clustered(mddev) &&
 	     md_cluster_ops->area_resyncing(mddev, WRITE,
@@ -1334,7 +1441,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 	 * thread has put up a bar for new requests.
 	 * Continue immediately if no resync is active currently.
 	 */
-	wait_barrier(conf);
+	wait_barrier(conf, sect);
 
 	sectors = r10_bio->sectors;
 	while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
@@ -1345,12 +1452,12 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 		 * pass
 		 */
 		raid10_log(conf->mddev, "wait reshape");
-		allow_barrier(conf);
+		allow_barrier(conf, sect);
 		wait_event(conf->wait_barrier,
 			   conf->reshape_progress <= bio->bi_iter.bi_sector ||
 			   conf->reshape_progress >= bio->bi_iter.bi_sector +
 			   sectors);
-		wait_barrier(conf);
+		wait_barrier(conf, sect);
 	}
 
 	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
@@ -1497,10 +1604,10 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 				rdev_dec_pending(rdev, mddev);
 			}
 		}
-		allow_barrier(conf);
+		allow_barrier(conf, sect);
 		raid10_log(conf->mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
 		md_wait_for_blocked_rdev(blocked_rdev, mddev);
-		wait_barrier(conf);
+		wait_barrier(conf, sect);
 		goto retry_write;
 	}
 
@@ -1729,8 +1836,11 @@ static void print_conf(struct r10conf *conf)
 
 static void close_sync(struct r10conf *conf)
 {
-	wait_barrier(conf);
-	allow_barrier(conf);
+	int idx;
+	for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) {
+		__wait_barrier(conf, idx);
+		__allow_barrier(conf, idx);
+	}
 
 	mempool_destroy(conf->r10buf_pool);
 	conf->r10buf_pool = NULL;
@@ -2634,7 +2744,7 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
 		md_error(mddev, rdev);
 
 	rdev_dec_pending(rdev, mddev);
-	allow_barrier(conf);
+	allow_barrier(conf, r10_bio->sector);
 	r10_bio->state = 0;
 	raid10_read_request(mddev, r10_bio->master_bio, r10_bio);
 }
@@ -2647,7 +2757,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 	 * Or possibly if failed and we need to record
 	 * a bad block.
 	 */
-	int m;
+	int m, idx;
 	struct md_rdev *rdev;
 
 	if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
@@ -2721,9 +2831,10 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 			}
 		}
 		if (fail) {
+			idx = sector_to_idx(r10_bio->sector);
 			spin_lock_irq(&conf->device_lock);
 			list_add(&r10_bio->retry_list, &conf->bio_end_io_list);
-			conf->nr_queued++;
+			atomic_inc(&conf->nr_queued[idx]);
 			spin_unlock_irq(&conf->device_lock);
 			/*
 			 * In case freeze_array() is waiting for condition
@@ -2748,6 +2859,7 @@ static void raid10d(struct md_thread *thread)
 	struct r10conf *conf = mddev->private;
 	struct list_head *head = &conf->retry_list;
 	struct blk_plug plug;
+	int idx;
 
 	md_check_recovery(mddev);
 
@@ -2755,17 +2867,15 @@ static void raid10d(struct md_thread *thread)
 	    !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
 		LIST_HEAD(tmp);
 		spin_lock_irqsave(&conf->device_lock, flags);
-		if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
-			while (!list_empty(&conf->bio_end_io_list)) {
-				list_move(conf->bio_end_io_list.prev, &tmp);
-				conf->nr_queued--;
-			}
-		}
+		if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
+			list_splice_init(&conf->bio_end_io_list, &tmp);
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 		while (!list_empty(&tmp)) {
 			r10_bio = list_first_entry(&tmp, struct r10bio,
 						   retry_list);
 			list_del(&r10_bio->retry_list);
+			idx = sector_to_idx(r10_bio->sector);
+			atomic_dec(&conf->nr_queued[idx]);
 			if (mddev->degraded)
 				set_bit(R10BIO_Degraded, &r10_bio->state);
 
@@ -2788,7 +2898,8 @@ static void raid10d(struct md_thread *thread)
 		}
 		r10_bio = list_entry(head->prev, struct r10bio, retry_list);
 		list_del(head->prev);
-		conf->nr_queued--;
+		idx = sector_to_idx(r10_bio->sector);
+		atomic_dec(&conf->nr_queued[idx]);
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 
 		mddev = r10_bio->mddev;
@@ -2937,7 +3048,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 	struct r10bio *r10_bio;
 	struct bio *biolist = NULL, *bio;
 	sector_t max_sector, nr_sectors;
-	int i;
+	int i, idx;
 	int max_sync;
 	sector_t sync_blocks;
 	sector_t sectors_skipped = 0;
@@ -3048,7 +3159,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 	 * If there is non-resync activity waiting for a turn, then let it
 	 * though before starting on this new sync request.
 	 */
-	if (conf->nr_waiting)
+	idx = sector_to_idx(sector_nr);
+	if (atomic_read(&conf->nr_waiting[idx]))
 		schedule_timeout_uninterruptible(1);
 
 	/* Again, very different code for resync and recovery.
@@ -3132,7 +3244,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 
 			r10_bio = raid10_alloc_init_r10buf(conf);
 			r10_bio->state = 0;
-			raise_barrier(conf, rb2 != NULL);
+			raise_barrier(conf, rb2 != NULL, sect);
 			atomic_set(&r10_bio->remaining, 0);
 
 			r10_bio->master_bio = (struct bio*)rb2;
@@ -3354,7 +3466,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 
 		r10_bio->mddev = mddev;
 		atomic_set(&r10_bio->remaining, 0);
-		raise_barrier(conf, 0);
+		raise_barrier(conf, 0, sector_nr);
 		conf->next_resync = sector_nr;
 
 		r10_bio->master_bio = NULL;
@@ -3693,6 +3805,27 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 	if (!conf->mirrors)
 		goto out;
 
+	/* I/O barrier buckets */
+	conf->nr_pending = kcalloc(BARRIER_BUCKETS_NR,
+				   sizeof(atomic_t), GFP_KERNEL);
+	if (!conf->nr_pending)
+		goto out;
+
+	conf->nr_waiting = kcalloc(BARRIER_BUCKETS_NR,
+				   sizeof(atomic_t), GFP_KERNEL);
+	if (!conf->nr_waiting)
+		goto out;
+
+	conf->nr_queued = kcalloc(BARRIER_BUCKETS_NR,
+				  sizeof(atomic_t), GFP_KERNEL);
+	if (!conf->nr_queued)
+		goto out;
+
+	conf->barrier = kcalloc(BARRIER_BUCKETS_NR,
+				sizeof(atomic_t), GFP_KERNEL);
+	if (!conf->barrier)
+		goto out;
+
 	conf->tmppage = alloc_page(GFP_KERNEL);
 	if (!conf->tmppage)
 		goto out;
@@ -3731,7 +3864,6 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 
 	spin_lock_init(&conf->resync_lock);
 	init_waitqueue_head(&conf->wait_barrier);
-	atomic_set(&conf->nr_pending, 0);
 
 	conf->thread = md_register_thread(raid10d, mddev, "raid10");
 	if (!conf->thread)
@@ -3745,6 +3877,10 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 		mempool_destroy(conf->r10bio_pool);
 		kfree(conf->mirrors);
 		safe_put_page(conf->tmppage);
+		kfree(conf->nr_pending);
+		kfree(conf->nr_waiting);
+		kfree(conf->nr_queued);
+		kfree(conf->barrier);
 		if (conf->bio_split)
 			bioset_free(conf->bio_split);
 		kfree(conf);
@@ -3981,9 +4117,9 @@ static void raid10_quiesce(struct mddev *mddev, int quiesce)
 	struct r10conf *conf = mddev->private;
 
 	if (quiesce)
-		raise_barrier(conf, 0);
+		freeze_array(conf, 0);
 	else
-		lower_barrier(conf);
+		unfreeze_array(conf);
 }
 
 static int raid10_resize(struct mddev *mddev, sector_t sectors)
@@ -4061,7 +4197,8 @@ static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
 				rdev->new_raid_disk = rdev->raid_disk * 2;
 				rdev->sectors = size;
 			}
-		conf->barrier = 1;
+		/* Array must appear to be quiesced */
+		conf->array_frozen = 1;
 	}
 
 	return conf;
@@ -4505,7 +4642,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 	if (need_flush ||
 	    time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
 		/* Need to update reshape_position in metadata */
-		wait_barrier(conf);
+		wait_barrier(conf, sector_nr);
 		mddev->reshape_position = conf->reshape_progress;
 		if (mddev->reshape_backwards)
 			mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
@@ -4518,18 +4655,18 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 		wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
 			   test_bit(MD_RECOVERY_INTR, &mddev->recovery));
 		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
-			allow_barrier(conf);
+			allow_barrier(conf, sector_nr);
 			return sectors_done;
 		}
 		conf->reshape_safe = mddev->reshape_position;
-		allow_barrier(conf);
+		allow_barrier(conf, sector_nr);
 	}
 
 read_more:
 	/* Now schedule reads for blocks from sector_nr to last */
 	r10_bio = raid10_alloc_init_r10buf(conf);
 	r10_bio->state = 0;
-	raise_barrier(conf, sectors_done != 0);
+	raise_barrier(conf, sectors_done != 0, sector_nr);
 	atomic_set(&r10_bio->remaining, 0);
 	r10_bio->mddev = mddev;
 	r10_bio->sector = sector_nr;
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index e2e8840de9bf..cac22e0bd14f 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -2,6 +2,31 @@
 #ifndef _RAID10_H
 #define _RAID10_H
 
+/*
+ * This part is copied from the implementation in raid1
+ * each barrier unit size is 64MB for now
+ * note: it must be larger than RESYNC_DEPTH
+ */
+#define BARRIER_UNIT_SECTOR_BITS	17
+#define BARRIER_UNIT_SECTOR_SIZE	(1<<17)
+/*
+ * In struct r10conf, the following members are related to I/O barrier
+ * buckets,
+ *	atomic_t	*nr_pending;
+ *	atomic_t	*nr_waiting;
+ *	atomic_t	*nr_queued;
+ *	atomic_t	*barrier;
+ * Each of them points to array of atomic_t variables, each array is
+ * designed to have BARRIER_BUCKETS_NR elements and occupy a single
+ * memory page. The data width of atomic_t variables is 4 bytes, equal
+ * to 1<<(ilog2(sizeof(atomic_t))), BARRIER_BUCKETS_NR_BITS is defined
+ * as (PAGE_SHIFT - ilog2(sizeof(atomic_t))) to make sure an array of
+ * atomic_t variables with BARRIER_BUCKETS_NR elements exactly
+ * occupies a single memory page.
+ */
+#define BARRIER_BUCKETS_NR_BITS		(PAGE_SHIFT - ilog2(sizeof(atomic_t)))
+#define BARRIER_BUCKETS_NR		(1<<BARRIER_BUCKETS_NR_BITS)
+
 /* Note: raid10_info.rdev can be set to NULL asynchronously by
  * raid10_remove_disk.
  * There are three safe ways to access raid10_info.rdev.
@@ -78,11 +103,12 @@ struct r10conf {
 	int			pending_count;
 
 	spinlock_t		resync_lock;
-	atomic_t		nr_pending;
-	int			nr_waiting;
-	int			nr_queued;
-	int			barrier;
-	int			array_freeze_pending;
+	atomic_t		nr_sync_pending;
+	atomic_t		*nr_pending;
+	atomic_t		*nr_waiting;
+	atomic_t		*nr_queued;
+	atomic_t		*barrier;
+	int			array_frozen;
 	sector_t		next_resync;
 	int			fullsync;  /* set to 1 if a full sync is needed,
 					    * (fresh device added).
-- 
2.13.6
