For a full stripe write, the ideal operation order is handle_stripe_dirtying(), raid_run_ops(), set R5_Wantwrite bit, and ops_run_io(). In this way, one handle_stripe() will dispatch IO for the stripe, otherwise there are more extra rounds of handle_stripe(). In a high speed raid5 array, handle_stripe() consumes considered cpu time. Reducing its overhead has around 10% performance boost. This patch makes handle_stripe() operations follow the ideal order. It also moves handle_stripe_clean_event() up, as it handles completed stripe. And if I don't change handle_stripe_clean_event() order, I saw some states confused with other changes. Signed-off-by: Shaohua Li <shli@xxxxxxxxxxxx> --- drivers/md/raid5.c | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) Index: linux/drivers/md/raid5.c =================================================================== --- linux.orig/drivers/md/raid5.c 2014-05-06 17:19:13.868225752 +0800 +++ linux/drivers/md/raid5.c 2014-05-06 17:20:25.367326852 +0800 @@ -1633,7 +1633,7 @@ static void ops_run_check_pq(struct stri &sh->ops.zero_sum_result, percpu->spare_page, &submit); } -static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) +static void raid_run_ops(struct stripe_head *sh, unsigned long *ops_request) { int overlap_clear = 0, i, disks = sh->disks; struct dma_async_tx_descriptor *tx = NULL; @@ -1644,12 +1644,12 @@ static void raid_run_ops(struct stripe_h cpu = get_cpu(); percpu = per_cpu_ptr(conf->percpu, cpu); - if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { + if (test_and_clear_bit(STRIPE_OP_BIOFILL, ops_request)) { ops_run_biofill(sh); overlap_clear++; } - if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { + if (test_and_clear_bit(STRIPE_OP_COMPUTE_BLK, ops_request)) { if (level < 6) tx = ops_run_compute5(sh, percpu); else { @@ -1659,26 +1659,26 @@ static void raid_run_ops(struct stripe_h tx = ops_run_compute6_2(sh, percpu); } /* terminate the chain if reconstruct is not set to be run */ - if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) + if (tx && !test_and_clear_bit(STRIPE_OP_RECONSTRUCT, ops_request)) async_tx_ack(tx); } - if (test_bit(STRIPE_OP_PREXOR, &ops_request)) + if (test_and_clear_bit(STRIPE_OP_PREXOR, ops_request)) tx = ops_run_prexor(sh, percpu, tx); - if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { + if (test_and_clear_bit(STRIPE_OP_BIODRAIN, ops_request)) { tx = ops_run_biodrain(sh, tx); overlap_clear++; } - if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { + if (test_and_clear_bit(STRIPE_OP_RECONSTRUCT, ops_request)) { if (level < 6) ops_run_reconstruct5(sh, percpu, tx); else ops_run_reconstruct6(sh, percpu, tx); } - if (test_bit(STRIPE_OP_CHECK, &ops_request)) { + if (test_and_clear_bit(STRIPE_OP_CHECK, ops_request)) { if (sh->check_state == check_state_run) ops_run_check_p(sh, percpu); else if (sh->check_state == check_state_run_q) @@ -3780,6 +3780,41 @@ static void handle_stripe(struct stripe_ handle_failed_sync(conf, sh, &s); } + /* + * might be able to return some write requests if the parity blocks + * are safe, or on a failed drive + */ + pdev = &sh->dev[sh->pd_idx]; + s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) + || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); + qdev = &sh->dev[sh->qd_idx]; + s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) + || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) + || conf->level < 6; + + if (s.written && + (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) + && !test_bit(R5_LOCKED, &pdev->flags) + && (test_bit(R5_UPTODATE, &pdev->flags) || + test_bit(R5_Discard, &pdev->flags))))) && + (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) + && !test_bit(R5_LOCKED, &qdev->flags) + && (test_bit(R5_UPTODATE, &qdev->flags) || + test_bit(R5_Discard, &qdev->flags)))))) + handle_stripe_clean_event(conf, sh, disks, &s.return_bi); + + /* Now to consider new write requests and what else, if anything + * should be read. We do not handle new writes when: + * 1/ A 'write' operation (copy+xor) is already in flight. + * 2/ A 'check' operation is in flight, as it may clobber the parity + * block. + */ + if (s.to_write && !sh->reconstruct_state && !sh->check_state) + handle_stripe_dirtying(conf, sh, &s, disks); + + if (s.ops_request) + raid_run_ops(sh, &s.ops_request); + /* Now we check to see if any write operations have recently * completed */ @@ -3817,29 +3852,6 @@ static void handle_stripe(struct stripe_ s.dec_preread_active = 1; } - /* - * might be able to return some write requests if the parity blocks - * are safe, or on a failed drive - */ - pdev = &sh->dev[sh->pd_idx]; - s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) - || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); - qdev = &sh->dev[sh->qd_idx]; - s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) - || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) - || conf->level < 6; - - if (s.written && - (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) - && !test_bit(R5_LOCKED, &pdev->flags) - && (test_bit(R5_UPTODATE, &pdev->flags) || - test_bit(R5_Discard, &pdev->flags))))) && - (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) - && !test_bit(R5_LOCKED, &qdev->flags) - && (test_bit(R5_UPTODATE, &qdev->flags) || - test_bit(R5_Discard, &qdev->flags)))))) - handle_stripe_clean_event(conf, sh, disks, &s.return_bi); - /* Now we might consider reading some blocks, either to check/generate * parity, or to satisfy requests * or to load a block that is being partially written. @@ -3851,15 +3863,6 @@ static void handle_stripe(struct stripe_ || s.expanding) handle_stripe_fill(sh, &s, disks); - /* Now to consider new write requests and what else, if anything - * should be read. We do not handle new writes when: - * 1/ A 'write' operation (copy+xor) is already in flight. - * 2/ A 'check' operation is in flight, as it may clobber the parity - * block. - */ - if (s.to_write && !sh->reconstruct_state && !sh->check_state) - handle_stripe_dirtying(conf, sh, &s, disks); - /* maybe we need to check and possibly fix the parity for this stripe * Any reads will already have been scheduled, so we just see if enough * data is available. The parity check is held off while parity @@ -4014,7 +4017,7 @@ finish: } if (s.ops_request) - raid_run_ops(sh, s.ops_request); + raid_run_ops(sh, &s.ops_request); ops_run_io(sh, &s); -- To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html