From: Dan Williams <dan.j.williams@xxxxxxxxx> Enable handle_stripe5 to pass off write operations to raid5_do_soft_blocks_ops (which can be run as a workqueue). The operations moved are reconstruct-writes and read-modify-writes formerly handled by compute_parity5. Changelog: * moved raid5_do_soft_block_ops changes into a separate patch * changed handle_write_operations5 to only initiate write operations, which prevents new writes from being requested while the current one is in flight * all blocks undergoing a write are now marked locked and !uptodate at the beginning of the write operation * blocks undergoing a read-modify-write need a request flag to distinguish them from blocks that are locked for reading. Reconstruct-writes still use the R5_LOCKED bit to select blocks for the operation * integrated the work queue Kconfig option Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx> --- drivers/md/Kconfig | 21 +++++ drivers/md/raid5.c | 192 ++++++++++++++++++++++++++++++++++++++------ include/linux/raid/raid5.h | 3 + 3 files changed, 190 insertions(+), 26 deletions(-) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index bf869ed..2a16b3b 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -162,6 +162,27 @@ config MD_RAID5_RESHAPE There should be enough spares already present to make the new array workable. +config MD_RAID456_WORKQUEUE + depends on MD_RAID456 + bool "Offload raid work to a workqueue from raid5d" + ---help--- + This option enables raid work (block copy and xor operations) + to run in a workqueue. If your platform has a high context + switch penalty say N. If you are using hardware offload or + are running on an SMP platform say Y. + + If unsure say, Y. + +config MD_RAID456_WORKQUEUE_MULTITHREAD + depends on MD_RAID456_WORKQUEUE && SMP + bool "Enable multi-threaded raid processing" + default y + ---help--- + This option controls whether the raid workqueue will be multi- + threaded or single threaded. + + If unsure say, Y. + config MD_MULTIPATH tristate "Multipath I/O support" depends on BLK_DEV_MD diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 8fde62b..e39d248 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -222,6 +222,8 @@ static void init_stripe(struct stripe_he BUG_ON(atomic_read(&sh->count) != 0); BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); + BUG_ON(sh->ops.state); + BUG_ON(sh->ops.pending); CHECK_DEVLOCK(); PRINTK("init_stripe called, stripe %llu\n", @@ -331,6 +333,9 @@ static int grow_one_stripe(raid5_conf_t memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev)); sh->raid_conf = conf; spin_lock_init(&sh->lock); + #ifdef CONFIG_MD_RAID456_WORKQUEUE + INIT_WORK(&sh->ops.work, conf->do_block_ops, sh); + #endif if (grow_buffers(sh, conf->raid_disks)) { shrink_buffers(sh, conf->raid_disks); @@ -1266,7 +1271,72 @@ static void compute_block_2(struct strip } } +static int handle_write_operations5(struct stripe_head *sh, int rcw) +{ + int i, pd_idx = sh->pd_idx, disks = sh->disks; + int locked=0; + + if (rcw == 0) { + /* skip the drain operation on an expand */ + if (test_bit(STRIPE_OP_RCW_Expand, &sh->ops.state)) { + set_bit(STRIPE_OP_RCW, &sh->state); + set_bit(STRIPE_OP_RCW_Parity, &sh->ops.state); + for (i=disks ; i-- ;) { + set_bit(R5_LOCKED, &sh->dev[i].flags); + locked++; + } + } else { /* enter stage 1 of reconstruct write operation */ + set_bit(STRIPE_OP_RCW, &sh->state); + set_bit(STRIPE_OP_RCW_Drain, &sh->ops.state); + for (i=disks ; i-- ;) { + struct r5dev *dev = &sh->dev[i]; + + if (dev->towrite) { + set_bit(R5_LOCKED, &dev->flags); + clear_bit(R5_UPTODATE, &dev->flags); + locked++; + } + } + } + } else { + /* enter stage 1 of read modify write operation */ + BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags)); + + set_bit(STRIPE_OP_RMW, &sh->state); + set_bit(STRIPE_OP_RMW_ParityPre, &sh->ops.state); + for (i=disks ; i-- ;) { + struct r5dev *dev = &sh->dev[i]; + if (i==pd_idx) + continue; + + /* For a read-modify write there may be blocks that are + * locked for reading while others are ready to be written + * so we distinguish these blocks by the RMWReq bit + */ + if (dev->towrite && + test_bit(R5_UPTODATE, &dev->flags)) { + set_bit(R5_RMWReq, &dev->flags); + set_bit(R5_LOCKED, &dev->flags); + clear_bit(R5_UPTODATE, &dev->flags); + locked++; + } + } + } + + /* keep the parity disk locked while asynchronous operations + * are in flight + */ + set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); + clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); + locked++; + sh->ops.pending++; + PRINTK("%s: stripe %llu locked: %d op_state: %lx\n", + __FUNCTION__, (unsigned long long)sh->sector, + locked, sh->ops.state); + + return locked; +} /* * Each stripe/dev can have one or more bion attached. @@ -1664,7 +1734,6 @@ static void raid5_do_soft_block_ops(void * schedule a write of some buffers * return confirmation of parity correctness * - * Parity calculations are done inside the stripe lock * buffers are taken off read_list or write_list, and bh_cache buffers * get BH_Lock set before the stripe lock is released. * @@ -1679,13 +1748,13 @@ static void handle_stripe5(struct stripe int i; int syncing, expanding, expanded; int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; - int non_overwrite = 0; + int non_overwrite=0, write_complete=0; int failed_num=0; struct r5dev *dev; - PRINTK("handling stripe %llu, cnt=%d, pd_idx=%d\n", - (unsigned long long)sh->sector, atomic_read(&sh->count), - sh->pd_idx); + PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d\n", + (unsigned long long)sh->sector, sh->state, atomic_read(&sh->count), + sh->pd_idx); spin_lock(&sh->lock); clear_bit(STRIPE_HANDLE, &sh->state); @@ -1926,8 +1995,56 @@ #endif set_bit(STRIPE_HANDLE, &sh->state); } - /* now to consider writing and what else, if anything should be read */ - if (to_write) { + /* Now we check to see if any write operations have recently + * completed + */ + if (test_bit(STRIPE_OP_RCW, &sh->state) && + test_bit(STRIPE_OP_RCW_Done, &sh->ops.state)) { + clear_bit(STRIPE_OP_RCW, &sh->state); + clear_bit(STRIPE_OP_RCW_Done, &sh->ops.state); + write_complete++; + } + + if (test_bit(STRIPE_OP_RMW, &sh->state) && + test_bit(STRIPE_OP_RMW_Done, &sh->ops.state)) { + clear_bit(STRIPE_OP_RMW, &sh->state); + clear_bit(STRIPE_OP_RMW_Done, &sh->ops.state); + BUG_ON(++write_complete > 1); + for (i=disks; i--;) + clear_bit(R5_RMWReq, &sh->dev[i].flags); + } + + /* All the 'written' buffers and the parity block are ready to be + * written back to disk + */ + if (write_complete) { + BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); + for (i=disks; i--;) { + dev = &sh->dev[i]; + if (test_bit(R5_LOCKED, &dev->flags) && + (i == sh->pd_idx || dev->written)) { + PRINTK("Writing block %d\n", i); + set_bit(R5_Wantwrite, &dev->flags); + if (!test_bit(R5_Insync, &dev->flags) + || (i==sh->pd_idx && failed == 0)) + set_bit(STRIPE_INSYNC, &sh->state); + } + } + if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { + atomic_dec(&conf->preread_active_stripes); + if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) + md_wakeup_thread(conf->mddev->thread); + } + } + + /* 1/ Now to consider new write requests and what else, if anything should be read + * 2/ Check operations clobber the parity block so do not start new writes while + * a check is in flight + * 3/ Write operations do not stack + */ + if (to_write && !test_bit(STRIPE_OP_RCW, &sh->state) && + !test_bit(STRIPE_OP_RMW, &sh->state) && + !test_bit(STRIPE_OP_CHECK, &sh->state)) { int rmw=0, rcw=0; for (i=disks ; i--;) { /* would I have to read this buffer for read_modify_write */ @@ -2000,25 +2117,8 @@ #endif } /* now if nothing is locked, and if we have enough data, we can start a write request */ if (locked == 0 && (rcw == 0 ||rmw == 0) && - !test_bit(STRIPE_BIT_DELAY, &sh->state)) { - PRINTK("Computing parity...\n"); - compute_parity5(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE); - /* now every locked buffer is ready to be written */ - for (i=disks; i--;) - if (test_bit(R5_LOCKED, &sh->dev[i].flags)) { - PRINTK("Writing block %d\n", i); - locked++; - set_bit(R5_Wantwrite, &sh->dev[i].flags); - if (!test_bit(R5_Insync, &sh->dev[i].flags) - || (i==sh->pd_idx && failed == 0)) - set_bit(STRIPE_INSYNC, &sh->state); - } - if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { - atomic_dec(&conf->preread_active_stripes); - if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) - md_wakeup_thread(conf->mddev->thread); - } - } + !test_bit(STRIPE_BIT_DELAY, &sh->state)) + locked += handle_write_operations5(sh, rcw); } /* maybe we need to check and possibly fix the parity for this stripe @@ -2150,8 +2250,17 @@ #endif } } + queue_raid_work(sh); + spin_unlock(&sh->lock); + #ifndef CONFIG_MD_RAID456_WORKQUEUE + while (test_bit(STRIPE_OP_QUEUED, &sh->state)) { + PRINTK("run do_block_ops\n"); + conf->do_block_ops(sh); + } + #endif + while ((bi=return_bi)) { int bytes = bi->bi_size; @@ -3439,6 +3548,30 @@ static int run(mddev_t *mddev) if (!conf->spare_page) goto abort; } + + #ifdef CONFIG_MD_RAID456_WORKQUEUE + sprintf(conf->workqueue_name, "%s_raid5_ops", + mddev->gendisk->disk_name); + + #ifdef CONFIG_MD_RAID456_MULTITHREAD + if ((conf->block_ops_queue = create_workqueue(conf->workqueue_name)) + == NULL) + goto abort; + #else + if ((conf->block_ops_queue = create_singlethread_workqueue( + conf->workqueue_name)) == NULL) + goto abort; + #endif + #endif + + /* To Do: + * 1/ Offload to asynchronous copy / xor engines + * 2/ Automated selection of optimal do_block_ops + * routine similar to the xor template selection + */ + conf->do_block_ops = raid5_do_soft_block_ops; + + spin_lock_init(&conf->device_lock); init_waitqueue_head(&conf->wait_for_stripe); init_waitqueue_head(&conf->wait_for_overlap); @@ -3598,6 +3731,10 @@ abort: safe_put_page(conf->spare_page); kfree(conf->disks); kfree(conf->stripe_hashtbl); + #ifdef CONFIG_MD_RAID456_WORKQUEUE + if (conf->do_block_ops) + destroy_workqueue(conf->block_ops_queue); + #endif kfree(conf); } mddev->private = NULL; @@ -3618,6 +3755,9 @@ static int stop(mddev_t *mddev) blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ sysfs_remove_group(&mddev->kobj, &raid5_attrs_group); kfree(conf->disks); + #ifdef CONFIG_MD_RAID456_WORKQUEUE + destroy_workqueue(conf->block_ops_queue); + #endif kfree(conf); mddev->private = NULL; return 0; diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index c8a315b..31ae55c 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -3,6 +3,7 @@ #define _RAID5_H #include <linux/raid/md.h> #include <linux/raid/xor.h> +#include <linux/workqueue.h> /* * @@ -333,6 +334,7 @@ struct raid5_private_data { atomic_t preread_active_stripes; /* stripes with scheduled io */ atomic_t reshape_stripes; /* stripes with pending writes for reshape */ + #ifdef CONFIG_MD_RAID456_WORKQUEUE struct workqueue_struct *block_ops_queue; #endif @@ -376,6 +378,7 @@ struct raid5_private_data { typedef struct raid5_private_data raid5_conf_t; #define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private) + /* must be called under the stripe lock */ static inline void queue_raid_work(struct stripe_head *sh) { - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html