Re: Re: [PATCH V1] raid5: Correct some failed-stripe because the badsector.

majianpeng <majianpeng@xxxxxxxxx> · Tue, 19 Feb 2013 14:54:19 +0800

>On Mon, 14 Jan 2013 16:44:49 +0800 majianpeng <majianpeng@xxxxxxxxx> wrote:
>
>> At present for failed-stripes,the write/read operations will only return error.But there
>> are some failed-stripes which can correct by write operation.
>> 
>> For example:
>> 1:Create a raid5 by four disks missing/sdb/sdc/sdd.
>> 2:If readed  data from sdb and met error,so the stripe is failed.Because
>> the stripe was degraded,there is no chance to correct the read-error.
>> 3:If later write-stripe contains sdb which overwrite.The stripe has a
>> chance to correct.
>> 4:If step3 successed, add a spare disk to raid,the raid can be active
>> from degraded.
>> 
>> There are two methods to compute raid5 parity,rcw and rmw.
>> In this situation,it only used rcw.
>> There are some conditions for correcting failed-stripe
>> A: in a stripe there are only one or two baddisks(maybe faulty/removed/read-error/write-error).
>> B: in a stripe for baddisks exclude parity disks,they must overwrite.
>> C:After writing data,for read-error disks it must reread to check.
>> 
>> 
>> Tested-by: Qi Zhou <qi.g.zhou@xxxxxxxxx>
>> Signed-off-by: Yu Bian <ycbzzjlbyby@xxxxxxxxx>
>> Signed-off-by: Jianpeng Ma <majianpeng@xxxxxxxxx>
>> ---
>>  drivers/md/raid5.c |  133 +++++++++++++++++++++++++++++++++++++++++++++-------
>>  drivers/md/raid5.h |    9 ++++
>>  2 files changed, 125 insertions(+), 17 deletions(-)
>> 
>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>> index 19d77a0..fd8fe58 100644
>> --- a/drivers/md/raid5.c
>> +++ b/drivers/md/raid5.c
>> @@ -224,6 +224,7 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
>>  			    < IO_THRESHOLD)
>>  				md_wakeup_thread(conf->mddev->thread);
>>  		atomic_dec(&conf->active_stripes);
>> +		clear_bit(STRIPE_FAILED_REREAD, &sh->state);
>>  		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
>>  			list_add_tail(&sh->lru, &conf->inactive_list);
>>  			wake_up(&conf->wait_for_stripe);
>> @@ -1800,6 +1801,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
>>  			atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
>>  			clear_bit(R5_ReadError, &sh->dev[i].flags);
>>  			clear_bit(R5_ReWrite, &sh->dev[i].flags);
>> +			clear_bit(R5_FailedReread, &sh->dev[i].flags);
>>  		} else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
>>  			clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
>>  
>> @@ -1855,6 +1857,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
>>  		else {
>>  			clear_bit(R5_ReadError, &sh->dev[i].flags);
>>  			clear_bit(R5_ReWrite, &sh->dev[i].flags);
>> +			clear_bit(R5_FailedReread, &sh->dev[i].flags);
>>  			if (!(set_bad
>>  			      && test_bit(In_sync, &rdev->flags)
>>  			      && rdev_set_badblocks(
>> @@ -1916,6 +1919,7 @@ static void raid5_end_write_request(struct bio *bi, int error)
>>  		if (!uptodate) {
>>  			set_bit(WriteErrorSeen, &rdev->flags);
>>  			set_bit(R5_WriteError, &sh->dev[i].flags);
>> +			clear_bit(R5_FailedOverwrite, &sh->dev[i].flags);
>>  			if (!test_and_set_bit(WantReplacement, &rdev->flags))
>>  				set_bit(MD_RECOVERY_NEEDED,
>>  					&rdev->mddev->recovery);
>> @@ -2523,6 +2527,10 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
>>  		if (bi)
>>  			bitmap_end = 1;
>>  
>> +		if (test_and_clear_bit(R5_FailedOverwrite, &sh->dev[i].flags))
>> +			s->failed_overwrite--;
>> +		clear_bit(R5_BadParity, &sh->dev[i].flags);
>> +
>>  		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
>>  			wake_up(&conf->wait_for_overlap);
>>  
>> @@ -2838,6 +2846,10 @@ static void handle_stripe_dirtying(struct r5conf *conf,
>>  		pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n",
>>  			 conf->max_degraded, (unsigned long long)recovery_cp,
>>  			 (unsigned long long)sh->sector);
>> +	} else if (s->failed > conf->max_degraded) {
>> +		/*For write failed-stripe, only use rcw */
>> +		rcw = 1;
>> +		rmw = 2;
>>  	} else for (i = disks; i--; ) {
>>  		/* would I have to read this buffer for read_modify_write */
>>  		struct r5dev *dev = &sh->dev[i];
>> @@ -2900,8 +2912,13 @@ static void handle_stripe_dirtying(struct r5conf *conf,
>>  			    !(test_bit(R5_UPTODATE, &dev->flags) ||
>>  			      test_bit(R5_Wantcompute, &dev->flags))) {
>>  				rcw++;
>> -				if (!test_bit(R5_Insync, &dev->flags))
>> +				if (!test_bit(R5_Insync, &dev->flags)) {
>> +				 /* Only for all data-disks !R5_Insync */
>> +					if (s->failed > conf->max_degraded &&
>> +						!test_bit(STRIPE_FAILED_WRITE, &sh->state))
>> +						set_bit(STRIPE_DELAYED, &sh->state);
>>  					continue; /* it's a failed drive */
>> +				}
>>  				if (
>>  				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
>>  					pr_debug("Read_old block "
>> @@ -3347,9 +3364,14 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
>>  			}
>>  		}
>>  		clear_bit(R5_Insync, &dev->flags);
>> -		if (!rdev)
>> -			/* Not in-sync */;
>> -		else if (is_bad) {
>> +		if (!rdev) {
>> +			s->noworked++;
>> +			if ((dev->towrite && test_bit(R5_OVERWRITE, &dev->flags)) ||
>> +				test_bit(R5_FailedOverwrite, &dev->flags)) {
>> +				s->failed_overwrite++;
>> +				set_bit(R5_FailedOverwrite, &dev->flags);
>> +			}
>> +		} else if (is_bad) {
>>  			/* also not in-sync */
>>  			if (!test_bit(WriteErrorSeen, &rdev->flags) &&
>>  			    test_bit(R5_UPTODATE, &dev->flags)) {
>> @@ -3359,6 +3381,13 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
>>  				set_bit(R5_Insync, &dev->flags);
>>  				set_bit(R5_ReadError, &dev->flags);
>>  			}
>> +			if (!test_bit(WriteErrorSeen, &rdev->flags) &&
>> +				((dev->towrite && test_bit(R5_OVERWRITE, &dev->flags)) ||
>> +				test_bit(R5_FailedOverwrite, &dev->flags))) {
>> +				s->failed_overwrite++;
>> +				set_bit(R5_FailedOverwrite, &dev->flags);
>> +			} else if (i == sh->pd_idx || i == sh->qd_idx)
>> +				set_bit(R5_BadParity, &dev->flags);
>>  		} else if (test_bit(In_sync, &rdev->flags))
>>  			set_bit(R5_Insync, &dev->flags);
>>  		else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
>> @@ -3410,14 +3439,36 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
>>  			clear_bit(R5_ReadError, &dev->flags);
>>  			clear_bit(R5_ReWrite, &dev->flags);
>>  		}
>> -		if (test_bit(R5_ReadError, &dev->flags))
>> -			clear_bit(R5_Insync, &dev->flags);
>> +		if (test_bit(R5_ReadError, &dev->flags)) {
>> +			/*
>> +			 * For failed-stripe which contained badsectors,after writing
>> +			 * it should reread to check.
>> +			 */
>> +			struct md_rdev *rdev2 = rcu_dereference(conf->disks[i].rdev);
>> +			if (!is_bad && rdev2 && !test_bit(Faulty, &rdev2->flags) &&
>> +				test_bit(R5_UPTODATE, &dev->flags) &&
>> +				(test_bit(R5_FailedOverwrite, &dev->flags) ||
>> +				 test_bit(R5_BadParity, &dev->flags) ||
>> +				 test_bit(R5_FailedReread, &dev->flags))) {
>> +				set_bit(STRIPE_FAILED_REREAD, &sh->state);
>> +				set_bit(R5_FailedReread, &dev->flags);
>> +				clear_bit(R5_FailedOverwrite, &dev->flags);
>> +				clear_bit(R5_BadParity, &dev->flags);
>> +			} else
>> +				clear_bit(R5_Insync, &dev->flags);
>> +		}
>> +
>>  		if (!test_bit(R5_Insync, &dev->flags)) {
>>  			if (s->failed < 2)
>>  				s->failed_num[s->failed] = i;
>>  			s->failed++;
>>  			if (rdev && !test_bit(Faulty, &rdev->flags))
>>  				do_recovery = 1;
>> +			if (i == sh->pd_idx)
>> +				s->p_failed = 1;
>> +			else if (i == sh->qd_idx)
>> +				s->q_failed = 1;
>> +
>>  		}
>>  	}
>>  	if (test_bit(STRIPE_SYNCING, &sh->state)) {
>> @@ -3492,19 +3543,46 @@ static void handle_stripe(struct stripe_head *sh)
>>  	}
>>  
>>  	pr_debug("locked=%d uptodate=%d to_read=%d"
>> -	       " to_write=%d failed=%d failed_num=%d,%d\n",
>> +		" to_write=%d failed=%d failed_num=%d,%d"
>> +		" failed_overwrite=%d p_failed=%d q_failed=%d\n",
>>  	       s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
>> -	       s.failed_num[0], s.failed_num[1]);
>> +	       s.failed_num[0], s.failed_num[1], s.failed_overwrite,
>> +	       s.p_failed, s.q_failed);
>> +
>>  	/* check if the array has lost more than max_degraded devices and,
>>  	 * if so, some requests might need to be failed.
>>  	 */
>>  	if (s.failed > conf->max_degraded) {
>> -		sh->check_state = 0;
>> -		sh->reconstruct_state = 0;
>> -		if (s.to_read+s.to_write+s.written)
>> -			handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
>> -		if (s.syncing + s.replacing)
>> -			handle_failed_sync(conf, sh, &s);
>> +		/*
>> +		 *Disks which removed or faulty must less or equal max_degraded
>> +		 */
>> +		 if (s.noworked <= conf->max_degraded &&
>> +			((conf->level < 6 &&
>> +			(s.failed - s.failed_overwrite <= s.p_failed)) ||
>> +			 (conf->level == 6 &&
>> +			(s.failed - s.failed_overwrite <= s.p_failed + s.q_failed)))) {
>> +			set_bit(STRIPE_FAILED_WRITE, &sh->state);
>> +			pr_debug("stripe(%llu) failed, try to recorrect\n",
>> +				(unsigned long long)sh->sector);
>> +		} else if ((s.noworked <= conf->max_degraded &&
>> +			s.failed_overwrite > 0 &&
>> +			!test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)))
>> +			pr_debug("stripe(%lld) delay for failed-write\n",
>> +			(unsigned long long)sh->sector);
>> +		 else {
>> +			sh->check_state = 0;
>> +			sh->reconstruct_state = 0;
>> +			clear_bit(STRIPE_FAILED_WRITE, &sh->state);
>> +			clear_bit(STRIPE_FAILED_REREAD, &sh->state);
>> +			if (s.to_read+s.to_write+s.written) {
>> +				handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
>> +				pr_debug("handle_failed_stripe:%lld\n",
>> +					(unsigned long long)sh->sector);
>> +			}
>> +			if (s.syncing + s.replacing)
>> +				 handle_failed_sync(conf, sh, &s);
>> +		}
>> +
>>  	}
>>  
>>  	/* Now we check to see if any write operations have recently
>> @@ -3525,6 +3603,7 @@ static void handle_stripe(struct stripe_head *sh)
>>  		BUG_ON(sh->qd_idx >= 0 &&
>>  		       !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
>>  		       !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
>> +		clear_bit(STRIPE_FAILED_WRITE, &sh->state);
>>  		for (i = disks; i--; ) {
>>  			struct r5dev *dev = &sh->dev[i];
>>  			if (test_bit(R5_LOCKED, &dev->flags) &&
>> @@ -3620,10 +3699,30 @@ static void handle_stripe(struct stripe_head *sh)
>>  		clear_bit(STRIPE_SYNCING, &sh->state);
>>  	}
>>  
>> -	/* If the failed drives are just a ReadError, then we might need
>> -	 * to progress the repair/check process
>> +	/*
>> +	 * If stripe failed because badsector and
>> +	 * can write to correct the badsector
>> +	 * we used readerr routine which rewrite and reread
>>  	 */
>> -	if (s.failed <= conf->max_degraded && !conf->mddev->ro)
>> +	if (test_bit(STRIPE_FAILED_REREAD, &sh->state)) {
>> +		for (i = disks; i--; ) {
>> +			struct r5dev *dev = &sh->dev[i];
>> +			if (test_bit(R5_FailedReread, &dev->flags)
>> +				&& !test_bit(R5_LOCKED, &dev->flags)
>> +				&& test_bit(R5_UPTODATE, &dev->flags)) {
>> +				/* it just wrote and sucessed,so not
>> +				 * necessary to rewrite,only set flag
>> +				 */
>> +				set_bit(R5_ReWrite, &dev->flags);
>> +				set_bit(R5_Wantread, &dev->flags);
>> +				set_bit(R5_LOCKED, &dev->flags);
>> +				s.locked++;
>> +				}
>> +		}
>> +	} else if (s.failed <= conf->max_degraded && !conf->mddev->ro)
>> +		/* If the failed drives are just a ReadError, then we might need
>> +		 * to progress the repair/check process
>> +		 */
>>  		for (i = 0; i < s.failed; i++) {
>>  			struct r5dev *dev = &sh->dev[s.failed_num[i]];
>>  			if (test_bit(R5_ReadError, &dev->flags)
>> diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
>> index 18b2c4a..72fd934 100644
>> --- a/drivers/md/raid5.h
>> +++ b/drivers/md/raid5.h
>> @@ -251,6 +251,10 @@ struct stripe_head_state {
>>  	 */
>>  	int syncing, expanding, expanded, replacing;
>>  	int locked, uptodate, to_read, to_write, failed, written;
>> +	/* Overwrite disks which faulty or removed or contained badsector */
>> +	int failed_overwrite;
>> +	int noworked; /*disk faulty or removed*/
>> +
>>  	int to_fill, compute, req_compute, non_overwrite;
>>  	int failed_num[2];
>>  	int p_failed, q_failed;
>> @@ -299,6 +303,9 @@ enum r5dev_flags {
>>  			 * data in, and now is a good time to write it out.
>>  			 */
>>  	R5_Discard,	/* Discard the stripe */
>> +	R5_FailedOverwrite, /*overwrite on disk which removed or faulty or had badsectors*/
>> +	R5_FailedReread,
>> +	R5_BadParity,
>>  };
>>  
>>  /*
>> @@ -323,6 +330,8 @@ enum {
>>  	STRIPE_COMPUTE_RUN,
>>  	STRIPE_OPS_REQ_PENDING,
>>  	STRIPE_ON_UNPLUG_LIST,
>> +	STRIPE_FAILED_WRITE,
>> +	STRIPE_FAILED_REREAD,
>>  };
>>  
>>  /*
>
>
>Thanks for resending with the irrelevant bits removed.
>
>Unfortunately I'm having a hard time following the logic, which means that if
>I apply the patch it will make the code (even more) unmaintainable.
>
Sorry for delay in replying.
>What I need is :
> - a clear description of what each flag really means
R5_FailedOverwrite:There are two conditions which can used this flag
a)disk which removed or had Faulty flag;  b)disk had badsectors on this stripe.
For those disks it can't read the data.So only overwrite can had a change to do continue.

R5_FailedReread: If disk had badsector and this stripe is failed,if it can do write success.Then it should to reread to check.
etc: raid5 four disks.A/B had badsector.
A B C D(p)
Without this patch, the write must faild.But if A/B are overwrite, so it read C to computer parity. And write A/B/D.
After the write, it should to reread A/B to check.

R5_BadParity: This flag only for parity disk when it had badsectors.When reread the code, i think this flag can remove.
>test_bit(R5_BadParity, &dev->flags)
it can be replaced by (i == sh->pd_idx || i == sh->qd_idx).

> - a clear description of what each new field in stripe_head_state means.
STRIPE_FAILED_WRITE: This flag means this operation is write on a failed stripe.
STRIPE_FAILED_REREAD:After the stripe-failed-write operation, it should reread the disks which had badsector to check the data.

> - a clear description of the sequencing of operations - when and why it
>   decided to take a new path to fix things, how it might decide that it
>   cannot after all and fail, etc.
For failed-stripe-write operation, there are at most two steps. One write,the other is reread if disks had badsectors.
For failed-stripe,there are three cases after analysing stripe state.
a): it can do write; b):now it can't,but it had a chance to do write.so it wait; c)it failed immediately.
So the case a) is the most important.
>        if (s.failed > conf->max_degraded) {
	1:First stripe was failed
>                /*   
>                 * Disks which removed or faulty must less or equal max_degraded
>                 */
>                if (s.noworked <= conf->max_degraded && 
  2:disks which removed or faulty must less or equal max_degraded. The aim of writing failed-stripe is to reuse this stripe.So if this condition was false, it makes no sense.
	This condition is also useful the case b.
>                        ((conf->level < 6 && 
>                         (s.failed - s.failed_overwrite <= s.p_failed)) || 
  3:For raid5,all failed disks which removed or had faulty flag or had badsectos must be overwrite,excepted parity disk. Only this condition is true, it can do rcw write.
>                         (conf->level == 6 && 
>                          (s.failed - s.failed_overwrite <= s.p_failed + s.q_failed)))) { 
4: For raid6, the difference is that it had two parity disks p and q.
>                        set_bit(STRIPE_FAILED_WRITE, &sh->state);
>                        pr_debug("stripe(%llu) failed, try to recorrect\n",
>                                (unsigned long long)sh->sector);
>                } else if ((s.noworked <= conf->max_degraded &&
>                        s.failed_overwrite > 0 && 
>                        !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)))
>                        pr_debug("stripe(%lld) delay for failed-write\n",
>                                        (unsigned long long)sh->sector);
	This is for case b which waitting for chance to do.
>			else {
>                        sh->check_state = 0; 
>                        sh->reconstruct_state = 0; 
>                        clear_bit(STRIPE_FAILED_WRITE, &sh->state);
>                        clear_bit(STRIPE_FAILED_REREAD, &sh->state);
>                        if (s.to_read+s.to_write+s.written) {
>                                handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
>                                pr_debug("handle_failed_stripe:%lld\n",
>                                        (unsigned long long)sh->sector);
>                        }    
>                        if (s.syncing + s.replacing)
>                                handle_failed_sync(conf, sh, &s); 
>                }
	There, it only failed.
>
>I find some constructs rather odd.  e.g.
>
>+		if (!rdev) {
>+			s->noworked++;
>+			if ((dev->towrite && test_bit(R5_OVERWRITE, &dev->flags)) ||
>+			    test_bit(R5_FailedOverwrite, &dev->flags)) {
>+				s->failed_overwrite++;
>+				set_bit(R5_FailedOverwrite, &dev->flags);
>+			}
>
>I don't follow why the "test_bit(xxx)" is in the if clause.  when would it be
>set, but ->towrite and R5_OVERWRITE aren't set?
A stripe can be called multiple times by func handle_stripe.
The first it check the condition "if ((dev->towrite && test_bit(R5_OVERWRITE, &dev->flags))"
But after the successfuley write, ->towrite will be set null.So if there is no test_bit(xxx), it will be exec handle_failed_stripe/sync.
There are many places like those.How to different the first-check and successfuly write-check? 
Specifically, if there are some badsectors, it must to  reread.

>
>Maybe I'll try again another week, but for the moment, I'm just not making
>any headway in understanding you code - sorry.
>
>NeilBrown
>
>?韬{.n?????%??檩??w?{.n???{炳盯w???塄}?财??j:+v??????2??璀??摺?囤??z夸z罐?+?????w棹f