[PATCH V1] raid5: Correct some failed-stripe because the badsector.

majianpeng <majianpeng@xxxxxxxxx> · Mon, 14 Jan 2013 16:44:49 +0800

At present for failed-stripes,the write/read operations will only return error.But there
are some failed-stripes which can correct by write operation.

For example:
1:Create a raid5 by four disks missing/sdb/sdc/sdd.
2:If readed  data from sdb and met error,so the stripe is failed.Because
the stripe was degraded,there is no chance to correct the read-error.
3:If later write-stripe contains sdb which overwrite.The stripe has a
chance to correct.
4:If step3 successed, add a spare disk to raid,the raid can be active
from degraded.

There are two methods to compute raid5 parity,rcw and rmw.
In this situation,it only used rcw.
There are some conditions for correcting failed-stripe
A: in a stripe there are only one or two baddisks(maybe faulty/removed/read-error/write-error).
B: in a stripe for baddisks exclude parity disks,they must overwrite.
C:After writing data,for read-error disks it must reread to check.


Tested-by: Qi Zhou <qi.g.zhou@xxxxxxxxx>
Signed-off-by: Yu Bian <ycbzzjlbyby@xxxxxxxxx>
Signed-off-by: Jianpeng Ma <majianpeng@xxxxxxxxx>
---
 drivers/md/raid5.c |  133 +++++++++++++++++++++++++++++++++++++++++++++-------
 drivers/md/raid5.h |    9 ++++
 2 files changed, 125 insertions(+), 17 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 19d77a0..fd8fe58 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -224,6 +224,7 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
 			    < IO_THRESHOLD)
 				md_wakeup_thread(conf->mddev->thread);
 		atomic_dec(&conf->active_stripes);
+		clear_bit(STRIPE_FAILED_REREAD, &sh->state);
 		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
 			list_add_tail(&sh->lru, &conf->inactive_list);
 			wake_up(&conf->wait_for_stripe);
@@ -1800,6 +1801,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
 			atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
 			clear_bit(R5_ReadError, &sh->dev[i].flags);
 			clear_bit(R5_ReWrite, &sh->dev[i].flags);
+			clear_bit(R5_FailedReread, &sh->dev[i].flags);
 		} else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
 			clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
 
@@ -1855,6 +1857,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
 		else {
 			clear_bit(R5_ReadError, &sh->dev[i].flags);
 			clear_bit(R5_ReWrite, &sh->dev[i].flags);
+			clear_bit(R5_FailedReread, &sh->dev[i].flags);
 			if (!(set_bad
 			      && test_bit(In_sync, &rdev->flags)
 			      && rdev_set_badblocks(
@@ -1916,6 +1919,7 @@ static void raid5_end_write_request(struct bio *bi, int error)
 		if (!uptodate) {
 			set_bit(WriteErrorSeen, &rdev->flags);
 			set_bit(R5_WriteError, &sh->dev[i].flags);
+			clear_bit(R5_FailedOverwrite, &sh->dev[i].flags);
 			if (!test_and_set_bit(WantReplacement, &rdev->flags))
 				set_bit(MD_RECOVERY_NEEDED,
 					&rdev->mddev->recovery);
@@ -2523,6 +2527,10 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 		if (bi)
 			bitmap_end = 1;
 
+		if (test_and_clear_bit(R5_FailedOverwrite, &sh->dev[i].flags))
+			s->failed_overwrite--;
+		clear_bit(R5_BadParity, &sh->dev[i].flags);
+
 		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
 			wake_up(&conf->wait_for_overlap);
 
@@ -2838,6 +2846,10 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 		pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n",
 			 conf->max_degraded, (unsigned long long)recovery_cp,
 			 (unsigned long long)sh->sector);
+	} else if (s->failed > conf->max_degraded) {
+		/*For write failed-stripe, only use rcw */
+		rcw = 1;
+		rmw = 2;
 	} else for (i = disks; i--; ) {
 		/* would I have to read this buffer for read_modify_write */
 		struct r5dev *dev = &sh->dev[i];
@@ -2900,8 +2912,13 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 			    !(test_bit(R5_UPTODATE, &dev->flags) ||
 			      test_bit(R5_Wantcompute, &dev->flags))) {
 				rcw++;
-				if (!test_bit(R5_Insync, &dev->flags))
+				if (!test_bit(R5_Insync, &dev->flags)) {
+				 /* Only for all data-disks !R5_Insync */
+					if (s->failed > conf->max_degraded &&
+						!test_bit(STRIPE_FAILED_WRITE, &sh->state))
+						set_bit(STRIPE_DELAYED, &sh->state);
 					continue; /* it's a failed drive */
+				}
 				if (
 				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
 					pr_debug("Read_old block "
@@ -3347,9 +3364,14 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 			}
 		}
 		clear_bit(R5_Insync, &dev->flags);
-		if (!rdev)
-			/* Not in-sync */;
-		else if (is_bad) {
+		if (!rdev) {
+			s->noworked++;
+			if ((dev->towrite && test_bit(R5_OVERWRITE, &dev->flags)) ||
+				test_bit(R5_FailedOverwrite, &dev->flags)) {
+				s->failed_overwrite++;
+				set_bit(R5_FailedOverwrite, &dev->flags);
+			}
+		} else if (is_bad) {
 			/* also not in-sync */
 			if (!test_bit(WriteErrorSeen, &rdev->flags) &&
 			    test_bit(R5_UPTODATE, &dev->flags)) {
@@ -3359,6 +3381,13 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 				set_bit(R5_Insync, &dev->flags);
 				set_bit(R5_ReadError, &dev->flags);
 			}
+			if (!test_bit(WriteErrorSeen, &rdev->flags) &&
+				((dev->towrite && test_bit(R5_OVERWRITE, &dev->flags)) ||
+				test_bit(R5_FailedOverwrite, &dev->flags))) {
+				s->failed_overwrite++;
+				set_bit(R5_FailedOverwrite, &dev->flags);
+			} else if (i == sh->pd_idx || i == sh->qd_idx)
+				set_bit(R5_BadParity, &dev->flags);
 		} else if (test_bit(In_sync, &rdev->flags))
 			set_bit(R5_Insync, &dev->flags);
 		else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
@@ -3410,14 +3439,36 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 			clear_bit(R5_ReadError, &dev->flags);
 			clear_bit(R5_ReWrite, &dev->flags);
 		}
-		if (test_bit(R5_ReadError, &dev->flags))
-			clear_bit(R5_Insync, &dev->flags);
+		if (test_bit(R5_ReadError, &dev->flags)) {
+			/*
+			 * For failed-stripe which contained badsectors,after writing
+			 * it should reread to check.
+			 */
+			struct md_rdev *rdev2 = rcu_dereference(conf->disks[i].rdev);
+			if (!is_bad && rdev2 && !test_bit(Faulty, &rdev2->flags) &&
+				test_bit(R5_UPTODATE, &dev->flags) &&
+				(test_bit(R5_FailedOverwrite, &dev->flags) ||
+				 test_bit(R5_BadParity, &dev->flags) ||
+				 test_bit(R5_FailedReread, &dev->flags))) {
+				set_bit(STRIPE_FAILED_REREAD, &sh->state);
+				set_bit(R5_FailedReread, &dev->flags);
+				clear_bit(R5_FailedOverwrite, &dev->flags);
+				clear_bit(R5_BadParity, &dev->flags);
+			} else
+				clear_bit(R5_Insync, &dev->flags);
+		}
+
 		if (!test_bit(R5_Insync, &dev->flags)) {
 			if (s->failed < 2)
 				s->failed_num[s->failed] = i;
 			s->failed++;
 			if (rdev && !test_bit(Faulty, &rdev->flags))
 				do_recovery = 1;
+			if (i == sh->pd_idx)
+				s->p_failed = 1;
+			else if (i == sh->qd_idx)
+				s->q_failed = 1;
+
 		}
 	}
 	if (test_bit(STRIPE_SYNCING, &sh->state)) {
@@ -3492,19 +3543,46 @@ static void handle_stripe(struct stripe_head *sh)
 	}
 
 	pr_debug("locked=%d uptodate=%d to_read=%d"
-	       " to_write=%d failed=%d failed_num=%d,%d\n",
+		" to_write=%d failed=%d failed_num=%d,%d"
+		" failed_overwrite=%d p_failed=%d q_failed=%d\n",
 	       s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
-	       s.failed_num[0], s.failed_num[1]);
+	       s.failed_num[0], s.failed_num[1], s.failed_overwrite,
+	       s.p_failed, s.q_failed);
+
 	/* check if the array has lost more than max_degraded devices and,
 	 * if so, some requests might need to be failed.
 	 */
 	if (s.failed > conf->max_degraded) {
-		sh->check_state = 0;
-		sh->reconstruct_state = 0;
-		if (s.to_read+s.to_write+s.written)
-			handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
-		if (s.syncing + s.replacing)
-			handle_failed_sync(conf, sh, &s);
+		/*
+		 *Disks which removed or faulty must less or equal max_degraded
+		 */
+		 if (s.noworked <= conf->max_degraded &&
+			((conf->level < 6 &&
+			(s.failed - s.failed_overwrite <= s.p_failed)) ||
+			 (conf->level == 6 &&
+			(s.failed - s.failed_overwrite <= s.p_failed + s.q_failed)))) {
+			set_bit(STRIPE_FAILED_WRITE, &sh->state);
+			pr_debug("stripe(%llu) failed, try to recorrect\n",
+				(unsigned long long)sh->sector);
+		} else if ((s.noworked <= conf->max_degraded &&
+			s.failed_overwrite > 0 &&
+			!test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)))
+			pr_debug("stripe(%lld) delay for failed-write\n",
+			(unsigned long long)sh->sector);
+		 else {
+			sh->check_state = 0;
+			sh->reconstruct_state = 0;
+			clear_bit(STRIPE_FAILED_WRITE, &sh->state);
+			clear_bit(STRIPE_FAILED_REREAD, &sh->state);
+			if (s.to_read+s.to_write+s.written) {
+				handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
+				pr_debug("handle_failed_stripe:%lld\n",
+					(unsigned long long)sh->sector);
+			}
+			if (s.syncing + s.replacing)
+				 handle_failed_sync(conf, sh, &s);
+		}
+
 	}
 
 	/* Now we check to see if any write operations have recently
@@ -3525,6 +3603,7 @@ static void handle_stripe(struct stripe_head *sh)
 		BUG_ON(sh->qd_idx >= 0 &&
 		       !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
 		       !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
+		clear_bit(STRIPE_FAILED_WRITE, &sh->state);
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
 			if (test_bit(R5_LOCKED, &dev->flags) &&
@@ -3620,10 +3699,30 @@ static void handle_stripe(struct stripe_head *sh)
 		clear_bit(STRIPE_SYNCING, &sh->state);
 	}
 
-	/* If the failed drives are just a ReadError, then we might need
-	 * to progress the repair/check process
+	/*
+	 * If stripe failed because badsector and
+	 * can write to correct the badsector
+	 * we used readerr routine which rewrite and reread
 	 */
-	if (s.failed <= conf->max_degraded && !conf->mddev->ro)
+	if (test_bit(STRIPE_FAILED_REREAD, &sh->state)) {
+		for (i = disks; i--; ) {
+			struct r5dev *dev = &sh->dev[i];
+			if (test_bit(R5_FailedReread, &dev->flags)
+				&& !test_bit(R5_LOCKED, &dev->flags)
+				&& test_bit(R5_UPTODATE, &dev->flags)) {
+				/* it just wrote and sucessed,so not
+				 * necessary to rewrite,only set flag
+				 */
+				set_bit(R5_ReWrite, &dev->flags);
+				set_bit(R5_Wantread, &dev->flags);
+				set_bit(R5_LOCKED, &dev->flags);
+				s.locked++;
+				}
+		}
+	} else if (s.failed <= conf->max_degraded && !conf->mddev->ro)
+		/* If the failed drives are just a ReadError, then we might need
+		 * to progress the repair/check process
+		 */
 		for (i = 0; i < s.failed; i++) {
 			struct r5dev *dev = &sh->dev[s.failed_num[i]];
 			if (test_bit(R5_ReadError, &dev->flags)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 18b2c4a..72fd934 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -251,6 +251,10 @@ struct stripe_head_state {
 	 */
 	int syncing, expanding, expanded, replacing;
 	int locked, uptodate, to_read, to_write, failed, written;
+	/* Overwrite disks which faulty or removed or contained badsector */
+	int failed_overwrite;
+	int noworked; /*disk faulty or removed*/
+
 	int to_fill, compute, req_compute, non_overwrite;
 	int failed_num[2];
 	int p_failed, q_failed;
@@ -299,6 +303,9 @@ enum r5dev_flags {
 			 * data in, and now is a good time to write it out.
 			 */
 	R5_Discard,	/* Discard the stripe */
+	R5_FailedOverwrite, /*overwrite on disk which removed or faulty or had badsectors*/
+	R5_FailedReread,
+	R5_BadParity,
 };
 
 /*
@@ -323,6 +330,8 @@ enum {
 	STRIPE_COMPUTE_RUN,
 	STRIPE_OPS_REQ_PENDING,
 	STRIPE_ON_UNPLUG_LIST,
+	STRIPE_FAILED_WRITE,
+	STRIPE_FAILED_REREAD,
 };
 
 /*
-- 
1.7.9.5
?韬{.n?????%??檩??w?{.n???{炳盯w???塄}?财??j:+v??????2??璀??摺?囤??z夸z罐?+?????w棹f