This patch adds 'compute block' capabilities to the work queue. Here are a few notes about the new flags R5_ComputeReq and STRIPE_OP_COMPUTE_Recover: Previously, when handle_stripe5 found a block that needed to be computed, it updated it in the same step. Now that these operations are separated (across multiple calls to handle_stripe5), an R5_ComputeReq flag is needed to tell other parts of handle_stripe5 to treat the block under computation as if it were up to date. The order of events in the work queue ensures that the block is indeed up to date before performing further operations. STRIPE_OP_COMPUTE_Recover was added to track when the parity block is being computed due to a failed parity check. This allows the code in handle_stripe5 that produces requests for check_parity and compute_block operations to be separate from the code that consumes the result. Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx> drivers/md/raid5.c | 147 +++++++++++++++++++++++++++++++++++++-------- include/linux/raid/raid5.h | 7 +- 2 files changed, 129 insertions(+), 25 deletions(-) =================================================================== Index: linux-2.6-raid/drivers/md/raid5.c =================================================================== --- linux-2.6-raid.orig/drivers/md/raid5.c 2006-06-28 10:47:43.000000000 -0700 +++ linux-2.6-raid/drivers/md/raid5.c 2006-06-28 11:06:06.000000000 -0700 @@ -1263,7 +1263,9 @@ } } else { /* enter stage 1 of read modify write operation */ - BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags)); + BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || + test_bit(R5_ComputeReq, &sh->dev[pd_idx].flags))); + set_bit(STRIPE_OP_RMW, &sh->state); set_bit(STRIPE_OP_RMW_ParityPre, &sh->ops.state); for (i=disks ; i-- ;) { @@ -1272,7 +1274,8 @@ continue; if (dev->towrite && - test_bit(R5_UPTODATE, &dev->flags)) { + (test_bit(R5_UPTODATE, &dev->flags) || + test_bit(R5_ComputeReq, &dev->flags))) { set_bit(R5_LOCKED, &dev->flags); locked++; } 
@@ -1331,6 +1334,30 @@ return work_queued; } +static int handle_compute_operations5(struct stripe_head *sh, int dd_idx) +{ + int work_queued = -EBUSY; + + if (test_bit(STRIPE_OP_COMPUTE, &sh->state) && + test_bit(STRIPE_OP_COMPUTE_Done, &sh->ops.state)) { + clear_bit(STRIPE_OP_COMPUTE, &sh->state); + clear_bit(STRIPE_OP_COMPUTE_Done, &sh->ops.state); + clear_bit(R5_ComputeReq, &sh->dev[dd_idx].flags); + work_queued = 0; + } else if (!test_bit(STRIPE_OP_COMPUTE, &sh->state)) { + set_bit(STRIPE_OP_COMPUTE, &sh->state); + set_bit(STRIPE_OP_COMPUTE_Prep, &sh->ops.state); + set_bit(R5_ComputeReq, &sh->dev[dd_idx].flags); + work_queued = 1; + sh->ops.pending++; + } + + PRINTK("%s: stripe %llu work_queued: %d op_state: %lx dev[%d].flags: %lx\n", + __FUNCTION__, (unsigned long long)sh->sector, + work_queued, sh->ops.state, dd_idx, sh->dev[dd_idx].flags); + + return work_queued; +} /* * Each stripe/dev can have one or more bion attached. @@ -1454,7 +1481,7 @@ int i, pd_idx = sh->pd_idx, disks = sh->disks, count = 1; void *ptr[MAX_XOR_BLOCKS]; struct bio *chosen; - int overlap=0, work=0, written=0; + int overlap=0, work=0, written=0, compute=0, dd_idx=0; unsigned long state, ops_state, ops_state_orig; /* take a snapshot of what needs to be done at this point in time */ @@ -1463,6 +1490,51 @@ ops_state_orig = ops_state = sh->ops.state; spin_unlock(&sh->lock); + if (test_bit(STRIPE_OP_COMPUTE, &state)) { + for (i=disks ; i-- ;) { + struct r5dev *dev = &sh->dev[i]; + if (test_bit(R5_ComputeReq, &dev->flags)) { + dd_idx = i; + i = -1; + break; + } + } + BUG_ON(i >= 0); + PRINTK("%s: stripe %llu STRIPE_OP_COMPUTE op_state: %lx block: %d\n", + __FUNCTION__, (unsigned long long)sh->sector, + ops_state, dd_idx); + ptr[0] = page_address(sh->dev[dd_idx].page); + + if (test_and_clear_bit(STRIPE_OP_COMPUTE_Prep, &ops_state)) { + memset(ptr[0], 0, STRIPE_SIZE); + set_bit(STRIPE_OP_COMPUTE_Parity, &ops_state); + } + + if (test_and_clear_bit(STRIPE_OP_COMPUTE_Parity, &ops_state)) { + for 
(i = disks ; i--; ) { + struct r5dev *dev = &sh->dev[i]; + void *p; + if (i == dd_idx) + continue; + p = page_address(dev->page); + if (test_bit(R5_UPTODATE, &dev->flags)) + ptr[count++] = p; + else + printk(KERN_ERR "STRIPE_OP_COMPUTE %d, stripe %llu, %d" + " not present\n", dd_idx, + (unsigned long long)sh->sector, i); + + check_xor(); + } + if (count != 1) + xor_block(count, STRIPE_SIZE, ptr); + + work++; + compute++; + set_bit(STRIPE_OP_COMPUTE_Done, &ops_state); + } + } + if (test_bit(STRIPE_OP_RMW, &state)) { BUG_ON(test_bit(STRIPE_OP_RCW, &state)); PRINTK("%s: stripe %llu STRIPE_OP_RMW op_state: %lx\n", @@ -1615,6 +1687,9 @@ wake_up(&sh->raid_conf->wait_for_overlap); } + if (compute) + set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); + sh->ops.pending -= work; clear_bit(STRIPE_OP_QUEUED, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); @@ -1857,25 +1932,32 @@ * parity, or to satisfy requests * or to load a block that is being partially written. */ - if (to_read || non_overwrite || (syncing && (uptodate < disks)) || expanding) { + if (to_read || non_overwrite || (syncing && (uptodate < disks)) || expanding || + test_bit(STRIPE_OP_COMPUTE, &sh->state)) { for (i=disks; i--;) { dev = &sh->dev[i]; - if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && - (dev->toread || + if (test_bit(R5_ComputeReq, &dev->flags) || + (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && + (dev->toread || (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || syncing || expanding || (failed && (sh->dev[failed_num].toread || (sh->dev[failed_num].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags)))) - ) + )) ) { /* we would like to get this block, possibly * by computing it, but we might not be able to */ - if (uptodate == disks-1) { - PRINTK("Computing block %d\n", i); - compute_block(sh, i); - uptodate++; + if ((uptodate == disks-1) || test_bit(STRIPE_OP_COMPUTE, &sh->state)) { + handle_compute_operations5(sh, i); + if 
(uptodate == disks-1) + uptodate++; + /* Careful: from this point on 'uptodate' is in the eye of the + * workqueue which services 'compute' operations before writes + * and parity checks. R5_ComputeReq flags blocks that will be + * R5_UPTODATE in the work queue. + */ } else if (test_bit(R5_Insync, &dev->flags)) { set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); @@ -1898,8 +1980,7 @@ } /* now to consider writing and what else, if anything should be read */ - if (to_write || test_bit(STRIPE_OP_RCW, &sh->state) || - test_bit(STRIPE_OP_RMW, &sh->state)) { + if (to_write || test_bit(STRIPE_OP_RCW, &sh->state) || test_bit(STRIPE_OP_RMW, &sh->state)) { int rmw=0, rcw=0; for (i=disks ; i--;) { /* would I have to read this buffer for read_modify_write */ @@ -1910,7 +1991,7 @@ || sh->bh_page[i]!=bh->b_page #endif ) && - !test_bit(R5_UPTODATE, &dev->flags)) { + !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_ComputeReq, &dev->flags))) { if (test_bit(R5_Insync, &dev->flags) /* && !(!mddev->insync && i == sh->pd_idx) */ ) @@ -1924,7 +2005,7 @@ || sh->bh_page[i] != bh->b_page #endif ) && - !test_bit(R5_UPTODATE, &dev->flags)) { + !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_ComputeReq, &dev->flags))) { if (test_bit(R5_Insync, &dev->flags)) rcw++; else rcw += 2*disks; } @@ -1937,7 +2018,8 @@ for (i=disks; i--;) { dev = &sh->dev[i]; if ((dev->towrite || i == sh->pd_idx) && - !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && + !test_bit(R5_LOCKED, &dev->flags) && + !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_ComputeReq, &dev->flags)) && test_bit(R5_Insync, &dev->flags)) { if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { @@ -1956,7 +2038,8 @@ for (i=disks; i--;) { dev = &sh->dev[i]; if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && - !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && + !test_bit(R5_LOCKED, &dev->flags) && + !(test_bit(R5_UPTODATE, &dev->flags) || 
test_bit(R5_ComputeReq, &dev->flags)) && test_bit(R5_Insync, &dev->flags)) { if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { @@ -2007,12 +2090,20 @@ int work_queued = 0, result = 0; set_bit(STRIPE_HANDLE, &sh->state); - if (failed == 0) { + /* Take one of the following actions: + * 1/ start a check parity operation if (uptodate == disks) + * 2/ finish a check parity operation and act on the result + * 3/ skip this section if we previously initiated a recovery + * operation + */ + if (failed == 0 && !test_bit(STRIPE_OP_COMPUTE_Recover, &sh->ops.state)) { BUG_ON(!test_bit(STRIPE_OP_CHECK, &sh->state) && (uptodate != disks)); work_queued = handle_check_operations5(sh, uptodate == disks); - result = test_and_clear_bit(STRIPE_OP_CHECK_IsZero, &sh->ops.state); + if (work_queued == 0) + result = test_and_clear_bit(STRIPE_OP_CHECK_IsZero, + &sh->ops.state); if (work_queued > 0) { uptodate--; } else if (result && work_queued == 0) { @@ -2024,15 +2115,25 @@ /* don't try to repair!! */ set_bit(STRIPE_INSYNC, &sh->state); else { - compute_block(sh, sh->pd_idx); - uptodate++; + set_bit(STRIPE_OP_COMPUTE_Recover, &sh->ops.state); + handle_compute_operations5(sh, sh->pd_idx); + if (uptodate == disks-1) + uptodate++; } } } - if (!test_bit(STRIPE_INSYNC, &sh->state) && work_queued == 0) { + /* Wait for check parity and compute block operations to complete + * before write-back + */ + if (!test_bit(STRIPE_INSYNC, &sh->state) && + !test_bit(STRIPE_OP_CHECK, &sh->state) && + !test_bit(STRIPE_OP_COMPUTE, &sh->state)) { /* either failed parity check, or recovery is happening */ - if (failed==0) + if (failed==0) { + BUG_ON(!test_and_clear_bit( + STRIPE_OP_COMPUTE_Recover, &sh->ops.state)); failed_num = sh->pd_idx; + } dev = &sh->dev[failed_num]; BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); BUG_ON(uptodate != disks); Index: linux-2.6-raid/include/linux/raid/raid5.h =================================================================== --- linux-2.6-raid.orig/include/linux/raid/raid5.h 
2006-06-28 10:47:43.000000000 -0700 +++ linux-2.6-raid/include/linux/raid/raid5.h 2006-06-28 11:05:38.000000000 -0700 @@ -179,6 +179,7 @@ #define R5_ReWrite 9 /* have tried to over-write the readerror */ #define R5_Expanded 10 /* This block now has post-expand data */ #define R5_Consistent 11 /* Block is HW DMA-able without a cache flush */ +#define R5_ComputeReq 12 /* compute_block in progress treat as uptodate */ /* * Write method @@ -238,15 +239,17 @@ #define STRIPE_OP_COMPUTE_Parity 15 #define STRIPE_OP_COMPUTE_End 16 #define STRIPE_OP_COMPUTE_Done 17 +#define STRIPE_OP_COMPUTE_Recover 18 /* - * Bit mask for status bits set by the work queue thread + * Bit mask for status bits not to be cleared by the work queue thread */ #define STRIPE_OP_COMPLETION_MASK (1 << STRIPE_OP_RCW_Done |\ 1 << STRIPE_OP_RMW_Done |\ 1 << STRIPE_OP_CHECK_Done |\ 1 << STRIPE_OP_CHECK_IsZero |\ - 1 << STRIPE_OP_COMPUTE_Done) + 1 << STRIPE_OP_COMPUTE_Done |\ + 1 << STRIPE_OP_COMPUTE_Recover) /* * Plugging: * - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html