Schedule and process the RAID-6 block computations asynchronously. handle_stripe will compute a block when a backing disk has failed. Since RAID-5 and RAID-6 share the same ops_complete_compute(), the second computation target must be set to -1 (no target) in the RAID-5 path. Signed-off-by: Yuri Tikhonov <yur@xxxxxxxxxxx> Signed-off-by: Mikhail Cherkashin <mike@xxxxxxxxxxx> -- diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 3e8f896..f0f8d7f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2770,6 +2770,7 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); set_bit(R5_Wantcompute, &dev->flags); sh->ops.target = disk_idx; + sh->ops.target2 = -1; /* no second target */ s->req_compute = 1; sh->ops.count++; /* Careful: from this point on 'uptodate' is in the eye @@ -2830,63 +2831,138 @@ static void handle_issuing_new_read_requests5(struct stripe_head *sh, set_bit(STRIPE_HANDLE, &sh->state); } -static void handle_issuing_new_read_requests6(struct stripe_head *sh, +/* __handle_issuing_new_read_requests6 - returns 0 if there are no more disks + * to process + */ +static int __handle_issuing_new_read_requests6(struct stripe_head *sh, struct stripe_head_state *s, struct r6_state *r6s, - int disks) + int disk_idx, int disks) { - int i; struct stripe_queue *sq = sh->sq; + struct r5dev *dev = &sh->dev[disk_idx]; + struct r5_queue_dev *dev_q = &sq->dev[disk_idx]; + struct r5dev *failed_dev[2] = { &sh->dev[r6s->failed_num[0]], + &sh->dev[r6s->failed_num[1]]}; + struct r5_queue_dev *failed_dev_q[2] = { &sq->dev[r6s->failed_num[0]], + &sq->dev[r6s->failed_num[1]]}; - for (i = disks; i--; ) { - struct r5dev *dev = &sh->dev[i]; - struct r5_queue_dev *dev_q = &sq->dev[i]; + /* don't schedule compute operations or reads on + * the parity blocks while a check is in flight + */ + if ((disk_idx == sq->pd_idx || disk_idx == r6s->qd_idx) && + test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) + return ~0; - if 
(!test_bit(R5_LOCKED, &dev->flags) && - !test_bit(R5_UPTODATE, &dev->flags) && - (dev_q->toread || (dev_q->towrite && - !test_bit(R5_OVERWRITE, &dev->flags)) || - s->syncing || s->expanding || - (s->failed >= 1 && - (sq->dev[r6s->failed_num[0]].toread || - s->to_write)) || - (s->failed >= 2 && - (sq->dev[r6s->failed_num[1]].toread || - s->to_write)))) { - /* we would like to get this block, possibly - * by computing it, but we might not be able to + /* is the data in this block needed, and can we get it? */ + if (!test_bit(R5_LOCKED, &dev->flags) && + !test_bit(R5_UPTODATE, &dev->flags) && (dev_q->toread || + (dev_q->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || + s->syncing || s->expanding || + (s->failed >= 1 && (failed_dev_q[0]->toread || + (failed_dev_q[0]->towrite && + !test_bit(R5_OVERWRITE,&failed_dev[0]->flags)))) || + (s->failed >= 2 && (failed_dev_q[1]->toread || + (failed_dev_q[1]->towrite && + !test_bit(R5_OVERWRITE,&failed_dev[1]->flags)))) + )) { + /* 1/ We would like to get this block, possibly + * by computing it, but we might not be able to. + * + * 2/ Since parity check operations potentially + * make the parity block !uptodate it will need + * to be refreshed before any compute operations + * on data disks are scheduled. + * + * 3/ We hold off parity blocks re-reads until check + * operations have quiesced. + */ + if ((s->uptodate == disks-1) && + !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { + pr_debug("Computing stripe %llu block %d\n", + (unsigned long long)sh->sector, disk_idx); + set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); + set_bit(R5_Wantcompute, &dev->flags); + sh->ops.target = disk_idx; + sh->ops.target2 = -1; /* no second target */ + s->req_compute = 1; + sh->ops.count++; + /* Careful: from this point on 'uptodate' is in the eye of + * raid_run_ops which services 'compute' operations before + * writes. R5_Wantcompute flags a block that will be R5_UPTODATE + * by the time it is needed for a subsequent operation. 
*/ - if (s->uptodate == disks-1) { - pr_debug("Computing stripe %llu block %d\n", - (unsigned long long)sh->sector, i); - compute_block_1(sh, i, 0); - s->uptodate++; - } else if ( s->uptodate == disks-2 && s->failed >= 2 ) { - /* Computing 2-failure is *very* expensive; only - * do it if failed >= 2 - */ - int other; - for (other = disks; other--; ) { - if (other == i) - continue; - if (!test_bit(R5_UPTODATE, - &sh->dev[other].flags)) - break; - } - BUG_ON(other < 0); - pr_debug("Computing stripe %llu blocks %d,%d\n", - (unsigned long long)sh->sector, - i, other); - compute_block_2(sh, i, other); - s->uptodate += 2; - } else if (test_bit(R5_Insync, &dev->flags)) { - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantread, &dev->flags); - s->locked++; - pr_debug("Reading block %d (sync=%d)\n", - i, s->syncing); + s->uptodate++; + return 0; /* s->uptodate + s->compute == disks */ + } else if ( s->uptodate == disks-2 && s->failed >= 2 ) { + /* Computing 2-failure is *very* expensive; only + * do it if failed >= 2 + */ + int other; + for (other = disks; other--; ) { + if (other == disk_idx) + continue; + if (!test_bit(R5_UPTODATE, &sh->dev[other].flags)) + break; } + BUG_ON(other < 0); + pr_debug("Computing stripe %llu blocks %d,%d\n", + (unsigned long long)sh->sector, + disk_idx, other); + set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); + set_bit(R5_Wantcompute, &dev->flags); + set_bit(R5_Wantcompute, &sh->dev[other].flags); + sh->ops.target = disk_idx; + sh->ops.target2 = other; + s->req_compute = 1; + sh->ops.count++; + s->uptodate += 2; + } else if ((s->uptodate < disks-2) && + test_bit(R5_Insync, &dev->flags)) { + /* Note: we hold off compute operations while checks + * are in flight, but we still prefer 'compute' over 'read' + * hence we only read if (uptodate < disks-1) FIXME + */ + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantread, &dev->flags); + if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) + sh->ops.count++; + s->locked++; + pr_debug("Reading 
block %d (sync=%d)\n", disk_idx, + s->syncing); } } + + return ~0; +} + +static void handle_issuing_new_read_requests6(struct stripe_head *sh, + struct stripe_head_state *s, struct r6_state *r6s, + int disks) +{ + int i; + + /* Clear completed compute operations. Parity recovery + * (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled + * later on in this routine + */ + if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && + !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { + clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); + clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); + clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); + } + + /* look for blocks to read/compute, skip this if a compute + * is already in flight, or if the stripe contents are in the + * midst of changing due to a write + */ + if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && + !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { + for (i = disks; i--;) + if (!__handle_issuing_new_read_requests6(sh, s, r6s, + i, disks)) + break; + } set_bit(STRIPE_HANDLE, &sh->state); } @@ -3079,11 +3155,11 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf, for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; /* Would I have to read this buffer for reconstruct_write */ - if (!test_bit(R5_OVERWRITE, &dev->flags) - && i != pd_idx && i != qd_idx - && (!test_bit(R5_LOCKED, &dev->flags) - ) && - !test_bit(R5_UPTODATE, &dev->flags)) { + if (!test_bit(R5_OVERWRITE, &dev->flags) && + i != pd_idx && i != qd_idx && + !test_bit(R5_LOCKED, &dev->flags) && + !test_bit(R5_UPTODATE, &dev->flags) && + !test_bit(R5_Wantcompute, &dev->flags)) { if (test_bit(R5_Insync, &dev->flags)) rcw++; else { pr_debug("raid6: must_compute: " @@ -3100,18 +3176,19 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf, /* want reconstruct write, but need to get some data */ for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if (!test_bit(R5_OVERWRITE, &dev->flags) - && 
!(s->failed == 0 && (i == pd_idx || i == qd_idx)) - && !test_bit(R5_LOCKED, &dev->flags) && + if (!(!test_bit(R5_OVERWRITE, &dev->flags) && + !(s->failed == 0 && (i == pd_idx || i == qd_idx)) && + !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && - test_bit(R5_Insync, &dev->flags)) { - pr_debug("Read_old stripe %llu " - "block %d for Reconstruct\n", - (unsigned long long)sh->sector, i); - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantread, &dev->flags); - s->locked++; - } + !test_bit(R5_Wantcompute, &dev->flags) && + test_bit(R5_Insync, &dev->flags))) + continue; + pr_debug("Read_old stripe %llu " + "block %d for Reconstruct\n", + (unsigned long long)sh->sector, i); + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantread, &dev->flags); + s->locked++; } /* now if nothing is locked, and if we have enough data, we can start a * write request @@ -3131,13 +3208,26 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf, case 0: BUG(); case 1: - compute_block_1(sh, r6s->failed_num[0], 0); + set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); + set_bit(R5_Wantcompute, + &sh->dev[r6s->failed_num[0]].flags); + sh->ops.target = r6s->failed_num[0]; + sh->ops.target2 = -1; /* no second target */ + s->req_compute = 1; + sh->ops.count++; break; case 2: - compute_block_2(sh, r6s->failed_num[0], - r6s->failed_num[1]); + set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); + set_bit(R5_Wantcompute, + &sh->dev[r6s->failed_num[0]].flags); + set_bit(R5_Wantcompute, + &sh->dev[r6s->failed_num[1]].flags); + sh->ops.target = r6s->failed_num[0]; + sh->ops.target2 = r6s->failed_num[1]; + s->req_compute = 1; + sh->ops.count++; break; - default: /* This request should have been failed? 
*/ + default: BUG(); } } @@ -3737,6 +3827,8 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; + if (test_bit(R5_Wantcompute, &dev->flags)) + BUG_ON(++s.compute > 2); if (dev_q->toread) s.to_read++; @@ -3803,7 +3895,8 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) * or to load a block that is being partially written. */ if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || - (s.syncing && (s.uptodate < disks)) || s.expanding) + (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding || + test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) handle_issuing_new_read_requests6(sh, &s, &r6s, disks); /* Now we check to see if any write operations have recently -- Yuri Tikhonov, Senior Software Engineer Emcraft Systems, www.emcraft.com - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html