The stripe_queue object collects I/O requests before they are handled by the stripe-cache (via the stripe_head object). add_stripe_bio currently looks at the state of the stripe-cache to implement bitmap support; reimplement this using stripe_queue attributes.

Introduce the STRIPE_QUEUE_FIRSTWRITE flag to track when a stripe is first written. When a stripe_head is available, record the bitmap batch sequence number and set STRIPE_BIT_DELAY. For now a stripe_head will always be available at 'add_queue_bio' time; going forward, the 'sh' field of the stripe_queue will indicate whether a stripe_head is attached.

Tested-by: Mr. James W. Laferriere <babydr@xxxxxxxxxxxxxxxx>
Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>
---

 drivers/md/raid5.c         |   53 ++++++++++++++++++++++++++++----------------
 include/linux/raid/raid5.h |    6 +++++
 2 files changed, 40 insertions(+), 19 deletions(-)
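As an aside before the diff: the change splits the old single-step bitmap accounting into two phases. Below is a minimal user-space sketch of that handoff, assuming simplified stand-ins for the kernel structures and helpers (mask-style flags instead of set_bit() bit numbers, a bool in place of the dev[].towrite check, a bare counter for conf->seq_flush); it is illustrative only, not the kernel code.

#include <stdio.h>
#include <stdbool.h>

/* Simplified stand-ins for the kernel structures; the field and flag
 * names follow the patch, everything else is illustrative only. */
struct stripe_queue {
        unsigned long state;    /* holds STRIPE_QUEUE_FIRSTWRITE */
        bool has_towrite;       /* stand-in for the dev[].towrite check */
};

struct stripe_head {
        struct stripe_queue *sq;
        unsigned long state;    /* holds STRIPE_BIT_DELAY */
        unsigned long bm_seq;   /* bitmap batch this write belongs to */
};

/* masks here; the kernel uses bit numbers with set_bit()/test_bit() */
#define STRIPE_QUEUE_FIRSTWRITE (1UL << 0)
#define STRIPE_BIT_DELAY        (1UL << 1)

static unsigned long seq_flush; /* stand-in for conf->seq_flush */

/* Phase 1: at add_queue_bio time only the queue is guaranteed to
 * exist, so just record that a bitmap write was started. */
static void queue_first_write(struct stripe_queue *sq)
{
        if (!sq->has_towrite) {
                sq->has_towrite = true;
                sq->state |= STRIPE_QUEUE_FIRSTWRITE;
                /* bitmap_startwrite() would be called here */
        }
}

/* Phase 2: once a stripe_head is attached (get_active_stripe),
 * consume the flag and record the target batch number. */
static void attach_stripe_head(struct stripe_head *sh)
{
        if (sh->sq->state & STRIPE_QUEUE_FIRSTWRITE) {
                sh->sq->state &= ~STRIPE_QUEUE_FIRSTWRITE;
                sh->bm_seq = seq_flush + 1;
                sh->state |= STRIPE_BIT_DELAY;
        }
}

int main(void)
{
        struct stripe_queue sq = { 0 };
        struct stripe_head sh = { .sq = &sq };

        queue_first_write(&sq);         /* bio arrives, no stripe_head yet */
        attach_stripe_head(&sh);        /* stripe_head shows up later */

        printf("bm_seq=%lu delay=%d\n", sh.bm_seq,
               !!(sh.state & STRIPE_BIT_DELAY));
        return 0;
}

Running it prints "bm_seq=1 delay=1": the queue carries the first-write mark until a stripe_head exists to own bm_seq and STRIPE_BIT_DELAY.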
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7bc206c..d566fc9 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -31,8 +31,10 @@
  * conf->bm_flush is the number of the last batch that was closed to
  * new additions.
  * When we discover that we will need to write to any block in a stripe
- * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
- * the number of the batch it will be in. This is bm_flush+1.
+ * (in add_queue_bio) we update the in-memory bitmap and record in the
+ * stripe_queue that a bitmap write was started. Then, in handle_stripe when
+ * we have a stripe_head available, we update sh->bm_seq to record the
+ * sequence number (target batch number) of this request. This is bm_flush+1.
  * When we are ready to do a write, if that batch hasn't been written yet,
  * we plug the array and queue the stripe for later.
  * When an unplug happens, we increment bm_flush, thus closing the current
@@ -360,8 +362,14 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
 		}
 	} while (sh == NULL);
 
-	if (sh)
+	if (sh) {
 		atomic_inc(&sh->count);
+		if (test_and_clear_bit(STRIPE_QUEUE_FIRSTWRITE,
+				       &sh->sq->state)) {
+			sh->bm_seq = conf->seq_flush+1;
+			set_bit(STRIPE_BIT_DELAY, &sh->state);
+		}
+	}
 
 	spin_unlock_irq(&conf->device_lock);
 	return sh;
@@ -1991,26 +1999,34 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
  * toread/towrite point to the first in a chain.
  * The bi_next chain must be in order.
  */
-static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
+static int add_queue_bio(struct stripe_queue *sq, struct bio *bi, int dd_idx,
+			 int forwrite)
 {
 	struct bio **bip;
-	struct stripe_queue *sq = sh->sq;
 	raid5_conf_t *conf = sq->raid_conf;
 	int firstwrite=0;
 
-	pr_debug("adding bh b#%llu to stripe s#%llu\n",
+	pr_debug("adding bio (%llu) to queue (%llu)\n",
 		(unsigned long long)bi->bi_sector,
-		(unsigned long long)sh->sector);
-
+		(unsigned long long)sq->sector);
 
 	spin_lock(&sq->lock);
 	spin_lock_irq(&conf->device_lock);
 	if (forwrite) {
 		bip = &sq->dev[dd_idx].towrite;
-		if (*bip == NULL && sq->dev[dd_idx].written == NULL)
+		set_bit(dd_idx, sq->to_write);
+		if (*bip == NULL && sq->dev[dd_idx].written == NULL) {
+			/* flag the queue to be assigned a bitmap
+			 * sequence number
+			 */
+			set_bit(STRIPE_QUEUE_FIRSTWRITE, &sq->state);
 			firstwrite = 1;
-	} else
+		}
+	} else {
 		bip = &sq->dev[dd_idx].toread;
+		set_bit(dd_idx, sq->to_read);
+	}
+
 	while (*bip && (*bip)->bi_sector < bi->bi_sector) {
 		if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
 			goto overlap;
@@ -2024,19 +2040,17 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 		bi->bi_next = *bip;
 	*bip = bi;
 	bi->bi_phys_segments ++;
+
 	spin_unlock_irq(&conf->device_lock);
 	spin_unlock(&sq->lock);
 
 	pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
 		(unsigned long long)bi->bi_sector,
-		(unsigned long long)sh->sector, dd_idx);
+		(unsigned long long)sq->sector, dd_idx);
 
-	if (conf->mddev->bitmap && firstwrite) {
-		bitmap_startwrite(conf->mddev->bitmap, sh->sector,
+	if (conf->mddev->bitmap && firstwrite)
+		bitmap_startwrite(conf->mddev->bitmap, sq->sector,
 				  STRIPE_SECTORS, 0);
-		sh->bm_seq = conf->seq_flush+1;
-		set_bit(STRIPE_BIT_DELAY, &sh->state);
-	}
 
 	if (forwrite) {
 		/* check if page is covered */
@@ -2049,7 +2063,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 			sector = bi->bi_sector + (bi->bi_size>>9);
 		}
 		if (sector >= sq->dev[dd_idx].sector + STRIPE_SECTORS)
-			set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
+			set_bit(dd_idx, sq->overwrite);
 	}
 	return 1;
 
@@ -3827,7 +3841,8 @@ static int make_request(struct request_queue *q, struct bio * bi)
 		}
 
 		if (test_bit(STRIPE_EXPANDING, &sh->state) ||
-		    !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
+		    !add_queue_bio(sh->sq, bi, dd_idx,
+				   bi->bi_rw & RW_MASK)) {
 			/* Stripe is busy expanding or
 			 * add failed due to overlap.  Flush everything
 			 * and wait a while
@@ -4128,7 +4143,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
 		}
 
 		set_bit(R5_ReadError, &sh->dev[dd_idx].flags);
-		if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
+		if (!add_queue_bio(sh->sq, raid_bio, dd_idx, 0)) {
 			release_stripe(sh);
 			raid_bio->bi_hw_segments = scnt;
 			conf->retry_read_aligned = raid_bio;
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index fbe622c..3d4938c 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -218,6 +218,7 @@ struct stripe_queue {
 	unsigned long *overlap; /* There is a pending overlapping request */
 	spinlock_t lock; /* protect bio lists and stripe_head state */
 	struct raid5_private_data *raid_conf;
+	unsigned long state;
 	struct list_head list_node;
 	int pd_idx; /* parity disk index */
 	int disks; /* disks in stripe */
@@ -288,6 +289,11 @@ struct stripe_queue {
 #define STRIPE_OP_MOD_DMA_CHECK 8
 
 /*
+ * Stripe-queue state
+ */
+#define STRIPE_QUEUE_FIRSTWRITE 0
+
+/*
  * Plugging:
  *
  * To improve write throughput, we need to delay the handling of some
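One more illustrative aside, on the R5_OVERWRITE hunk above: with the overwrite decision moved into the queue (set_bit(dd_idx, sq->overwrite)), all the walk needs is the sorted towrite chain. A rough user-space rendering of the "check if page is covered" loop follows; STRIPE_SECTORS, the bio fields, and chunk_covered() are simplified stand-ins of my own, not kernel definitions.

#include <stdio.h>
#include <stdbool.h>

#define STRIPE_SECTORS 8        /* illustrative only */

/* Minimal bio stand-in: the towrite list in add_queue_bio is kept
 * sorted by bi_sector, so a singly linked chain is enough here. */
struct bio {
        unsigned long long bi_sector;
        unsigned int sectors;           /* bi_size >> 9 in the kernel */
        struct bio *bi_next;
};

/* Walk the sorted chain and report whether it contiguously covers
 * the whole chunk starting at dev_sector, mirroring the
 * "check if page is covered" loop in add_queue_bio. */
static bool chunk_covered(const struct bio *towrite,
                          unsigned long long dev_sector)
{
        unsigned long long sector = dev_sector;
        const struct bio *bi;

        for (bi = towrite;
             bi && bi->bi_sector <= sector &&
             sector < dev_sector + STRIPE_SECTORS;
             bi = bi->bi_next) {
                if (bi->bi_sector + bi->sectors >= sector)
                        sector = bi->bi_sector + bi->sectors;
        }
        return sector >= dev_sector + STRIPE_SECTORS;
}

int main(void)
{
        struct bio b2 = { .bi_sector = 4, .sectors = 4, .bi_next = NULL };
        struct bio b1 = { .bi_sector = 0, .sectors = 4, .bi_next = &b2 };

        /* Two adjacent 4-sector writes fully cover an 8-sector chunk,
         * which is when the overwrite bit for the disk would be set. */
        printf("covered=%d\n", chunk_covered(&b1, 0));
        return 0;
}

The sketch prints "covered=1" for the two adjacent bios, corresponding to the case where set_bit(dd_idx, sq->overwrite) runs in the patch.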