The I/O barrier in raid1 exists for two reasons: resync/recovery, and reconfiguration of the array. At present it suspends all normal I/O during resync/recovery. But normal I/O that falls outside the resync/recovery window does not need to be barriered, so I rewrote the I/O barrier. Because there are two reasons for the barrier, I use two different methods.

First, for resync/recovery there is a resync window. Its end position is 'next_resync'; because the resync depth is RESYNC_DEPTH (32), its start is 'next_resync - RESYNC_DEPTH * RESYNC_SECTORS'. Normal I/O is divided into three categories by location:

a: before the start of the resync window
b: inside the resync window
c: after the end of the resync window

For a, no barrier is needed. For b, the barrier is needed and the original method is used. For c, no barrier is needed, but the lowest such sector must be recorded; if the next resync position would reach it, the resync action waits, otherwise it proceeds. I use an rbtree to keep these I/Os ordered. (A small standalone sketch of this classification is appended after the patch.)

Second, for reconfiguration of the array, I introduce the concept of a "force_barrier". While a force_barrier is raised, all normal I/O must be suspended.

NOTE: This problem also exists for raid10, but I only handle raid1 here. This is posted mainly to make sure it is going in the right direction and to get helpful comments. If the method is accepted, I will send a patch for raid10.

Signed-off-by: Jianpeng Ma <majianpeng@xxxxxxxxx>
---
 drivers/md/raid1.c | 218 +++++++++++++++++++++++++++++++++++++++++-----------
 drivers/md/raid1.h |  11 +++
 2 files changed, 184 insertions(+), 45 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d5bddfc..3115fb8 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -37,6 +37,7 @@ #include <linux/module.h> #include <linux/seq_file.h> #include <linux/ratelimit.h> +#include <linux/rbtree.h> #include "md.h" #include "raid1.h" #include "bitmap.h" @@ -66,8 +67,8 @@ */ static int max_queued_requests = 1024; -static void allow_barrier(struct r1conf *conf); -static void lower_barrier(struct r1conf *conf); +static void allow_barrier(struct r1conf *conf, int position); +static void lower_barrier(struct r1conf *conf, int force); static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) { @@ -207,7 +208,7 @@ static void put_buf(struct r1bio *r1_bio) mempool_free(r1_bio, conf->r1buf_pool); - lower_barrier(conf); + lower_barrier(conf, 0); } static void reschedule_retry(struct r1bio *r1_bio) @@ -235,6 +236,12 @@ static void call_bio_endio(struct r1bio *r1_bio) struct bio *bio = r1_bio->master_bio; int done; struct r1conf *conf = r1_bio->mddev->private; + int position = 0; + + if (test_and_clear_bit(R1BIO_BEFORE_RESYNC, &r1_bio->state)) + position = 1; + else if (test_and_clear_bit(R1BIO_AFTER_RESYNC, &r1_bio->state)) + position = 2; if (bio->bi_phys_segments) { unsigned long flags; @@ -249,11 +256,18 @@ static void call_bio_endio(struct r1bio *r1_bio) clear_bit(BIO_UPTODATE, &bio->bi_flags); if (done) { bio_endio(bio, 0); + + if (position == 2) { + unsigned long flags; + spin_lock_irqsave(&conf->resync_lock, flags); + rb_erase(&r1_bio->node, &conf->rb_root); + spin_unlock_irqrestore(&conf->resync_lock, flags); + } /* * Wake up any possible resync thread that waits for the device * to go idle.
*/ - allow_barrier(conf); + allow_barrier(conf, position); } } @@ -816,7 +830,7 @@ static void flush_pending_writes(struct r1conf *conf) */ #define RESYNC_DEPTH 32 -static void raise_barrier(struct r1conf *conf) +static void raise_barrier(struct r1conf *conf, int force) { spin_lock_irq(&conf->resync_lock); @@ -826,56 +840,125 @@ static void raise_barrier(struct r1conf *conf) /* block any new IO from starting */ conf->barrier++; + if (force) + conf->force_barrier++; /* Now wait for all pending IO to complete */ - wait_event_lock_irq(conf->wait_barrier, - !conf->nr_pending && conf->barrier < RESYNC_DEPTH, + if (force) + wait_event_lock_irq(conf->wait_barrier, + !(conf->nr_pending + conf->nr_before + conf->nr_after) && + (conf->barrier + conf->force_barrier) < RESYNC_DEPTH, + conf->resync_lock); + else + wait_event_lock_irq(conf->wait_barrier, + !conf->nr_pending && + (conf->barrier + conf->force_barrier) < RESYNC_DEPTH + && !(conf->nr_after && + (conf->next_resync + RESYNC_SECTORS < conf->near_position)), conf->resync_lock); spin_unlock_irq(&conf->resync_lock); } -static void lower_barrier(struct r1conf *conf) +static void lower_barrier(struct r1conf *conf, int force) { unsigned long flags; BUG_ON(conf->barrier <= 0); spin_lock_irqsave(&conf->resync_lock, flags); conf->barrier--; + if (force) + conf->force_barrier--; spin_unlock_irqrestore(&conf->resync_lock, flags); wake_up(&conf->wait_barrier); } -static void wait_barrier(struct r1conf *conf) +/*max resync/recovery size*/ +#define RESYNC_WINDOWS (RESYNC_DEPTH * RESYNC_SECTORS) +static int wait_barrier(struct r1conf *conf, struct bio *bio) { + int position = 0; spin_lock_irq(&conf->resync_lock); - if (conf->barrier) { + + if (conf->force_barrier || (conf->barrier && bio == NULL)) { conf->nr_waiting++; - /* Wait for the barrier to drop. - * However if there are already pending - * requests (preventing the barrier from - * rising completely), and the - * pre-process bio queue isn't empty, - * then don't wait, as we need to empty - * that queue to get the nr_pending - * count down. - */ wait_event_lock_irq(conf->wait_barrier, + !(conf->force_barrier + conf->barrier) || + (conf->barrier && + (conf->nr_pending + conf->nr_before + conf->nr_after) && + current->bio_list && + !bio_list_empty(current->bio_list)), + conf->resync_lock); + conf->nr_waiting--; + conf->nr_pending++; + } else if (test_bit(MD_RECOVERY_RUNNING, &conf->mddev->recovery) + && unlikely(bio != NULL)) { + /*before the resync window*/ + if ((bio->bi_sector + bio_sectors(bio)) < + (conf->next_resync ? + (conf->next_resync - RESYNC_WINDOWS) : 0)) { + conf->nr_before++; + position = 1; + } else if (bio->bi_sector >= conf->next_resync) { + conf->nr_after++; + position = 2; + if (bio->bi_sector < conf->near_position) + conf->near_position = bio->bi_sector; + } else { + if (conf->barrier) { + conf->nr_waiting++; + /* Wait for the barrier to drop. + * However if there are already pending + * requests (preventing the barrier from + * rising completely), and the + * pre-process bio queue isn't empty, + * then don't wait, as we need to empty + * that queue to get the nr_pending + * count down. 
+ */ + wait_event_lock_irq(conf->wait_barrier, !conf->barrier || (conf->nr_pending && current->bio_list && !bio_list_empty(current->bio_list)), conf->resync_lock); - conf->nr_waiting--; - } - conf->nr_pending++; + conf->nr_waiting--; + } + conf->nr_pending++; + } + } else + conf->nr_pending++; spin_unlock_irq(&conf->resync_lock); + return position; } -static void allow_barrier(struct r1conf *conf) +static void allow_barrier(struct r1conf *conf, int position) { unsigned long flags; + struct rb_node *node; + spin_lock_irqsave(&conf->resync_lock, flags); - conf->nr_pending--; + switch (position) { + case 0: + conf->nr_pending--; + break; + case 1: + conf->nr_before--; + break; + case 2: + node = rb_first(&conf->rb_root); + conf->nr_after--; + if (node == NULL) { + BUG_ON(conf->nr_after); + conf->near_position = 0; + } else { + struct r1bio *r1_bio = + container_of(node, struct r1bio, node); + conf->near_position = r1_bio->sector; + } + break; + default: + BUG(); + } spin_unlock_irqrestore(&conf->resync_lock, flags); wake_up(&conf->wait_barrier); } @@ -895,19 +978,19 @@ static void freeze_array(struct r1conf *conf) * we continue. */ spin_lock_irq(&conf->resync_lock); - conf->barrier++; + conf->force_barrier++; conf->nr_waiting++; wait_event_lock_irq_cmd(conf->wait_barrier, - conf->nr_pending == conf->nr_queued+1, - conf->resync_lock, - flush_pending_writes(conf)); + (conf->nr_pending + conf->nr_before + conf->nr_after) + == conf->nr_queued+1, conf->resync_lock, + flush_pending_writes(conf)); spin_unlock_irq(&conf->resync_lock); } static void unfreeze_array(struct r1conf *conf) { /* reverse the effect of the freeze */ spin_lock_irq(&conf->resync_lock); - conf->barrier--; + conf->force_barrier--; conf->nr_waiting--; wake_up(&conf->wait_barrier); spin_unlock_irq(&conf->resync_lock); @@ -986,6 +1069,28 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) kfree(plug); } +static bool raid1_rb_insert(struct rb_root *root, struct r1bio *bio) +{ + struct rb_node **new = &(root->rb_node); + struct rb_node *parent = NULL; + while (*new) { + struct r1bio *r1bio = container_of(*new, struct r1bio, node); + + parent = *new; + + if (bio->sector < r1bio->sector) + new = &((*new)->rb_left); + else if (bio->sector > r1bio->sector) + new = &((*new)->rb_right); + else + return false; + } + rb_link_node(&bio->node, parent, new); + rb_insert_color(&bio->node, root); + + return true; + +} static void make_request(struct mddev *mddev, struct bio * bio) { struct r1conf *conf = mddev->private; @@ -1006,6 +1111,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) int first_clone; int sectors_handled; int max_sectors; + int position; /* * Register the new request and wait if the reconstruction @@ -1035,7 +1141,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) finish_wait(&conf->wait_barrier, &w); } - wait_barrier(conf); + position = wait_barrier(conf, bio); bitmap = mddev->bitmap; @@ -1052,6 +1158,18 @@ static void make_request(struct mddev *mddev, struct bio * bio) r1_bio->mddev = mddev; r1_bio->sector = bio->bi_sector; + if (position == 1) + set_bit(R1BIO_BEFORE_RESYNC, &r1_bio->state); + else if (position == 2) { + set_bit(R1BIO_AFTER_RESYNC, &r1_bio->state); + + spin_lock_irq(&conf->resync_lock); + if (!raid1_rb_insert(&conf->rb_root, r1_bio)) + BUG(); + spin_unlock_irq(&conf->resync_lock); + } + + /* We might need to issue multiple reads to different * devices if there are bad blocks around, so we keep * track of the number of reads in bio->bi_phys_segments. 
@@ -1229,9 +1347,16 @@ read_again: if (r1_bio->bios[j]) rdev_dec_pending(conf->mirrors[j].rdev, mddev); r1_bio->state = 0; - allow_barrier(conf); + allow_barrier(conf, position); + md_wait_for_blocked_rdev(blocked_rdev, mddev); - wait_barrier(conf); + + position = wait_barrier(conf, bio); + if (position == 1) + set_bit(R1BIO_BEFORE_RESYNC, &r1_bio->state); + else if (position == 2) + set_bit(R1BIO_AFTER_RESYNC, &r1_bio->state); + goto retry_write; } @@ -1434,11 +1559,12 @@ static void print_conf(struct r1conf *conf) static void close_sync(struct r1conf *conf) { - wait_barrier(conf); - allow_barrier(conf); + wait_barrier(conf, NULL); + allow_barrier(conf, 0); mempool_destroy(conf->r1buf_pool); conf->r1buf_pool = NULL; + conf->next_resync = 0; } static int raid1_spare_active(struct mddev *mddev) @@ -1550,8 +1676,8 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) * we wait for all outstanding requests to complete. */ synchronize_sched(); - raise_barrier(conf); - lower_barrier(conf); + raise_barrier(conf, 1); + lower_barrier(conf, 1); clear_bit(Unmerged, &rdev->flags); } md_integrity_add_rdev(rdev, mddev); @@ -1601,11 +1727,11 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) */ struct md_rdev *repl = conf->mirrors[conf->raid_disks + number].rdev; - raise_barrier(conf); + raise_barrier(conf, 1); clear_bit(Replacement, &repl->flags); p->rdev = repl; conf->mirrors[conf->raid_disks + number].rdev = NULL; - lower_barrier(conf); + lower_barrier(conf, 1); clear_bit(WantReplacement, &rdev->flags); } else clear_bit(WantReplacement, &rdev->flags); @@ -2434,7 +2560,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp bitmap_cond_end_sync(mddev->bitmap, sector_nr); r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); - raise_barrier(conf); + raise_barrier(conf, 0); conf->next_resync = sector_nr; @@ -2734,6 +2860,8 @@ static struct r1conf *setup_conf(struct mddev *mddev) conf->pending_count = 0; conf->recovery_disabled = mddev->recovery_disabled - 1; + conf->rb_root = RB_ROOT; + err = -EIO; for (i = 0; i < conf->raid_disks * 2; i++) { @@ -2888,8 +3016,8 @@ static int stop(struct mddev *mddev) atomic_read(&bitmap->behind_writes) == 0); } - raise_barrier(conf); - lower_barrier(conf); + raise_barrier(conf, 1); + lower_barrier(conf, 1); md_unregister_thread(&mddev->thread); if (conf->r1bio_pool) @@ -2998,7 +3126,7 @@ static int raid1_reshape(struct mddev *mddev) return -ENOMEM; } - raise_barrier(conf); + raise_barrier(conf, 1); /* ok, everything is stopped */ oldpool = conf->r1bio_pool; @@ -3029,7 +3157,7 @@ static int raid1_reshape(struct mddev *mddev) conf->raid_disks = mddev->raid_disks = raid_disks; mddev->delta_disks = 0; - lower_barrier(conf); + lower_barrier(conf, 1); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); @@ -3047,10 +3175,10 @@ static void raid1_quiesce(struct mddev *mddev, int state) wake_up(&conf->wait_barrier); break; case 1: - raise_barrier(conf); + raise_barrier(conf, 1); break; case 0: - lower_barrier(conf); + lower_barrier(conf, 1); break; } } diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 0ff3715..6611c17 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -1,6 +1,7 @@ #ifndef _RAID1_H #define _RAID1_H + struct raid1_info { struct md_rdev *rdev; sector_t head_position; @@ -62,9 +63,15 @@ struct r1conf { wait_queue_head_t wait_barrier; spinlock_t resync_lock; int nr_pending; + int nr_before; + int nr_after; int nr_waiting; int nr_queued; int barrier; + 
int force_barrier; + /*normal IO which the nearest resync window*/ + sector_t near_position; + struct rb_root rb_root; /* Set to 1 if a full sync is needed, (fresh device added). * Cleared when a sync completes. @@ -104,6 +111,7 @@ */ struct r1bio { + struct rb_node node; atomic_t remaining; /* 'have we finished' count, * used from IRQ handlers */ @@ -158,6 +166,9 @@ #define R1BIO_MadeGood 7 #define R1BIO_WriteError 8 +#define R1BIO_BEFORE_RESYNC 9 +#define R1BIO_AFTER_RESYNC 10 + extern int md_raid1_congested(struct mddev *mddev, int bits); #endif -- 1.7.9.5
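
For illustration only (this is not part of the patch), here is a small standalone userspace sketch of the window classification described in the cover letter. The names classify(), IO_BEFORE_WINDOW, IO_IN_WINDOW and IO_AFTER_WINDOW are made up for this example; only RESYNC_DEPTH, RESYNC_SECTORS and the window arithmetic RESYNC_WINDOWS = RESYNC_DEPTH * RESYNC_SECTORS follow the patch. The kernel structures (struct bio, struct r1conf) are replaced by plain integers, and the window start is clamped at 0 instead of using the ternary test from wait_barrier().

/*
 * Illustrative sketch only: models the three-way classification of normal
 * I/O against the resync window. sector_t is a stand-in for the kernel type.
 */
#include <stdio.h>

typedef unsigned long long sector_t;

#define RESYNC_SECTORS  (1 << 7)                        /* 64KB in 512-byte sectors, as in raid1.c */
#define RESYNC_DEPTH    32
#define RESYNC_WINDOWS  (RESYNC_DEPTH * RESYNC_SECTORS) /* size of the resync window */

enum io_position {
        IO_IN_WINDOW     = 0,   /* category b: must wait on the barrier   */
        IO_BEFORE_WINDOW = 1,   /* category a: no barrier needed          */
        IO_AFTER_WINDOW  = 2,   /* category c: no barrier, track sector   */
};

/*
 * Classify one request by its start sector and length against the resync
 * window whose end is 'next_resync' and whose start is
 * 'next_resync - RESYNC_WINDOWS' (clamped at 0).
 */
static enum io_position classify(sector_t start, sector_t sectors, sector_t next_resync)
{
        sector_t window_start = next_resync > RESYNC_WINDOWS ?
                                next_resync - RESYNC_WINDOWS : 0;

        if (start + sectors < window_start)
                return IO_BEFORE_WINDOW;        /* finishes below the window */
        if (start >= next_resync)
                return IO_AFTER_WINDOW;         /* starts above the window   */
        return IO_IN_WINDOW;                    /* overlaps the window       */
}

int main(void)
{
        sector_t next_resync = 100000;          /* pretend resync has reached here */
        sector_t reqs[][2] = {                  /* {start, length} test requests   */
                { 1000, 8 },                    /* well below the window -> a      */
                { 99000, 8 },                   /* inside the window     -> b      */
                { 200000, 8 },                  /* above the window      -> c      */
        };

        for (unsigned i = 0; i < sizeof(reqs) / sizeof(reqs[0]); i++)
                printf("sector %llu: category %d\n",
                       reqs[i][0], classify(reqs[i][0], reqs[i][1], next_resync));
        return 0;
}

In the patch itself the same comparison is done under conf->resync_lock in wait_barrier(), and category c requests are additionally inserted into the rbtree so that the lowest outstanding sector (near_position) is always known to raise_barrier().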