On Wed, 23 Jul 2014 15:47:23 +0800 Shaohua Li <shli@xxxxxxxxxx> wrote:
>
> The stripe size is 4k by default. A bigger stripe size is considered
> harmful, because if the IO size is small, a big stripe size can cause a
> lot of unnecessary IO and parity calculation. But if the upper layer
> always sends full-stripe writes to the RAID5 array, this drawback goes
> away, and a bigger stripe size actually improves performance in this
> case because of bigger IOs and fewer stripes to handle. In my
> full-stripe-write test case, a 16k stripe size improves throughput by
> 40% - 120% depending on the RAID5 configuration.
>
> V2: use order-0 page allocation

Hi,

Using order-0 page allocations is a definite improvement, and the
throughput improvements sound impressive.

But I really don't like the idea of adding a configuration option. I'd
much rather get rid of those than add new ones.

I see your work as making it very clear that the current stripe cache is
quite inefficient for some cases, and it is good to have that
demonstrated. I don't think it is a useful fix though. We need to find a
way to remove the overheads without using a "sledge hammer".

Maybe adjacent stripe_heads can be linked together and processed as a
unit? (A rough sketch of that idea follows the quoted patch below.)

Thanks,
NeilBrown

>
> Signed-off-by: Shaohua Li<shli@xxxxxxxxxxxx> > --- > drivers/md/raid5.c | 738 +++++++++++++++++++++++++++++++++++------------------ > drivers/md/raid5.h | 8 > 2 files changed, 502 insertions(+), 244 deletions(-) > > Index: linux/drivers/md/raid5.c > =================================================================== > --- linux.orig/drivers/md/raid5.c 2014-07-23 14:09:45.844570945 +0800 > +++ linux/drivers/md/raid5.c 2014-07-23 14:09:45.836571048 +0800 > @@ -70,9 +70,10 @@ static struct workqueue_struct *raid5_wq > */ > > #define NR_STRIPES 256 > -#define STRIPE_SIZE PAGE_SIZE > -#define STRIPE_SHIFT (PAGE_SHIFT - 9) > -#define STRIPE_SECTORS (STRIPE_SIZE>>9) > +#define STRIPE_SIZE(conf) (PAGE_SIZE << conf->stripe_size_order) > +#define STRIPE_SHIFT(conf) (PAGE_SHIFT - 9 + conf->stripe_size_order) > +#define STRIPE_SECTORS(conf) (STRIPE_SIZE(conf) >> 9) > +#define STRIPE_PAGES(conf) (1 << conf->stripe_size_order) > #define IO_THRESHOLD 1 > #define BYPASS_THRESHOLD 1 > #define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) > @@ -81,13 +82,13 @@ static struct workqueue_struct *raid5_wq > > static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) > { > - int hash = (sect >> STRIPE_SHIFT) & HASH_MASK; > + int hash = (sect >> STRIPE_SHIFT(conf)) & HASH_MASK; > return &conf->stripe_hashtbl[hash]; > } > > -static inline int stripe_hash_locks_hash(sector_t sect) > +static inline int stripe_hash_locks_hash(struct r5conf *conf, sector_t sect) > { > - return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK; > + return (sect >> STRIPE_SHIFT(conf)) & STRIPE_HASH_LOCKS_MASK; > } > > static inline void lock_device_hash_lock(struct r5conf *conf, int hash) > @@ -130,10 +131,10 @@ static inline void unlock_all_device_has > * This function is used to determine the 'next' bio in the list, given the sector > * of the current stripe+device > */ > -static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) > +static inline struct bio *r5_next_bio(struct r5conf *conf, struct bio *bio, sector_t sector) > { > int sectors = bio_sectors(bio); > - if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS) > + if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS(conf)) > return bio->bi_next; > else > return NULL; > } > @@ -483,36 +484,51 @@ out: > static void
shrink_buffers(struct stripe_head *sh) > { > struct page *p; > - int i; > + int i, j; > int num = sh->raid_conf->pool_size; > > for (i = 0; i < num ; i++) { > - WARN_ON(sh->dev[i].page != sh->dev[i].orig_page); > - p = sh->dev[i].page; > - if (!p) > - continue; > - sh->dev[i].page = NULL; > - put_page(p); > + for (j = 0; j < STRIPE_PAGES(sh->raid_conf); j++) { > + p = sh->dev[i].orig_pages[j]; > + if (!p) > + continue; > + WARN_ON(sh->dev[i].pages[j] != > + sh->dev[i].orig_pages[j]); > + put_page(p); > + sh->dev[i].pages[j] = NULL; > + sh->dev[i].orig_pages[j] = NULL; > + } > } > } > > static int grow_buffers(struct stripe_head *sh) > { > - int i; > + int i, j; > int num = sh->raid_conf->pool_size; > > for (i = 0; i < num; i++) { > struct page *page; > > - if (!(page = alloc_page(GFP_KERNEL))) { > - return 1; > + for (j = 0; j < STRIPE_PAGES(sh->raid_conf); j++) { > + page = alloc_page(GFP_KERNEL); > + if (!page) > + return 1; > + sh->dev[i].pages[j] = page; > + sh->dev[i].orig_pages[j] = page; > } > - sh->dev[i].page = page; > - sh->dev[i].orig_page = page; > } > return 0; > } > > +static void reset_stripe_devpage(struct stripe_head *sh, int i) > +{ > + struct r5conf *conf = sh->raid_conf; > + int j; > + > + for (j = 0; j < STRIPE_PAGES(conf); j++) > + sh->dev[i].pages[j] = sh->dev[i].orig_pages[j]; > +} > + > static void raid5_build_block(struct stripe_head *sh, int i, int previous); > static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, > struct stripe_head *sh); > @@ -659,7 +675,7 @@ get_active_stripe(struct r5conf *conf, s > int previous, int noblock, int noquiesce) > { > struct stripe_head *sh; > - int hash = stripe_hash_locks_hash(sector); > + int hash = stripe_hash_locks_hash(conf, sector); > > pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); > > @@ -740,7 +756,7 @@ raid5_end_write_request(struct bio *bi, > static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) > { > struct r5conf *conf = sh->raid_conf; > - int i, disks = sh->disks; > + int i, disks = sh->disks, j; > > might_sleep(); > > @@ -808,7 +824,7 @@ static void ops_run_io(struct stripe_hea > test_bit(WriteErrorSeen, &rdev->flags)) { > sector_t first_bad; > int bad_sectors; > - int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, > + int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS(conf), > &first_bad, &bad_sectors); > if (!bad) > break; > @@ -840,7 +856,7 @@ static void ops_run_io(struct stripe_hea > if (rdev) { > if (s->syncing || s->expanding || s->expanded > || s->replacing) > - md_sync_acct(rdev->bdev, STRIPE_SECTORS); > + md_sync_acct(rdev->bdev, STRIPE_SECTORS(conf)); > > set_bit(STRIPE_IO_STARTED, &sh->state); > > @@ -867,11 +883,12 @@ static void ops_run_io(struct stripe_hea > > if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) > WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); > - sh->dev[i].vec.bv_page = sh->dev[i].page; > - bi->bi_vcnt = 1; > - bi->bi_io_vec[0].bv_len = STRIPE_SIZE; > - bi->bi_io_vec[0].bv_offset = 0; > - bi->bi_iter.bi_size = STRIPE_SIZE; > + > + bi->bi_max_vecs = 1 << conf->stripe_size_order; > + bi->bi_io_vec = sh->dev[i].vecs; > + > + for (j = 0; j < STRIPE_PAGES(conf); j++) > + bio_add_page(bi, sh->dev[i].pages[j], PAGE_SIZE, 0); > /* > * If this is discard request, set bi_vcnt 0. 
We don't > * want to confuse SCSI because SCSI will replace payload > @@ -890,7 +907,7 @@ static void ops_run_io(struct stripe_hea > if (rrdev) { > if (s->syncing || s->expanding || s->expanded > || s->replacing) > - md_sync_acct(rrdev->bdev, STRIPE_SECTORS); > + md_sync_acct(rrdev->bdev, STRIPE_SECTORS(conf)); > > set_bit(STRIPE_IO_STARTED, &sh->state); > > @@ -914,11 +931,12 @@ static void ops_run_io(struct stripe_hea > + rrdev->data_offset); > if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) > WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); > - sh->dev[i].rvec.bv_page = sh->dev[i].page; > - rbi->bi_vcnt = 1; > - rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; > - rbi->bi_io_vec[0].bv_offset = 0; > - rbi->bi_iter.bi_size = STRIPE_SIZE; > + > + rbi->bi_max_vecs = 1 << conf->stripe_size_order; > + rbi->bi_io_vec = sh->dev[i].rvecs; > + > + for (j = 0; j < STRIPE_PAGES(conf); j++) > + bio_add_page(rbi, sh->dev[i].pages[j], PAGE_SIZE, 0); > /* > * If this is discard request, set bi_vcnt 0. We don't > * want to confuse SCSI because SCSI will replace payload > @@ -943,7 +961,7 @@ static void ops_run_io(struct stripe_hea > } > > static struct dma_async_tx_descriptor * > -async_copy_data(int frombio, struct bio *bio, struct page **page, > +async_copy_one_page(int frombio, struct bio *bio, struct page **page, > sector_t sector, struct dma_async_tx_descriptor *tx, > struct stripe_head *sh) > { > @@ -974,8 +992,8 @@ async_copy_data(int frombio, struct bio > len -= b_offset; > } > > - if (len > 0 && page_offset + len > STRIPE_SIZE) > - clen = STRIPE_SIZE - page_offset; > + if (len > 0 && page_offset + len > PAGE_SIZE) > + clen = PAGE_SIZE - page_offset; > else > clen = len; > > @@ -985,7 +1003,7 @@ async_copy_data(int frombio, struct bio > if (frombio) { > if (sh->raid_conf->skip_copy && > b_offset == 0 && page_offset == 0 && > - clen == STRIPE_SIZE) > + clen == PAGE_SIZE) > *page = bio_page; > else > tx = async_memcpy(*page, bio_page, page_offset, > @@ -997,14 +1015,42 @@ async_copy_data(int frombio, struct bio > /* chain the operations */ > submit.depend_tx = tx; > > - if (clen < len) /* hit end of page */ > - break; > page_offset += len; > + /* hit end of page */ > + if (page_offset > 0 && (page_offset % PAGE_SIZE) == 0) > + break; > } > > return tx; > } > > +static struct dma_async_tx_descriptor * > +async_copy_data(int frombio, struct bio *bio, struct page **pages, > + sector_t sector, struct dma_async_tx_descriptor *tx, > + struct stripe_head *sh, int *skip_copy) > +{ > + sector_t offset; > + struct page **cur_page, *tmp; > + > + *skip_copy = 0; > + if (sector > bio->bi_iter.bi_sector) > + offset = sector; > + else { > + offset = bio->bi_iter.bi_sector >> 3; > + offset <<= 3; > + } > + while (offset < bio_end_sector(bio) && > + offset < sector + STRIPE_SECTORS(sh->raid_conf)) { > + cur_page = &pages[(offset - sector) >> 3]; > + tmp = *cur_page; > + tx = async_copy_one_page(frombio, bio, cur_page, offset, tx, sh); > + if (tmp != *cur_page) > + *skip_copy = 1; > + offset += PAGE_SIZE >> 9; > + } > + return tx; > +} > + > static void ops_complete_biofill(void *stripe_head_ref) > { > struct stripe_head *sh = stripe_head_ref; > @@ -1030,8 +1076,8 @@ static void ops_complete_biofill(void *s > rbi = dev->read; > dev->read = NULL; > while (rbi && rbi->bi_iter.bi_sector < > - dev->sector + STRIPE_SECTORS) { > - rbi2 = r5_next_bio(rbi, dev->sector); > + dev->sector + STRIPE_SECTORS(sh->raid_conf)) { > + rbi2 = r5_next_bio(sh->raid_conf, rbi, dev->sector); > if (!raid5_dec_bi_active_stripes(rbi)) { > 
rbi->bi_next = return_bi; > return_bi = rbi; > @@ -1052,7 +1098,7 @@ static void ops_run_biofill(struct strip > { > struct dma_async_tx_descriptor *tx = NULL; > struct async_submit_ctl submit; > - int i; > + int i, dummy; > > pr_debug("%s: stripe %llu\n", __func__, > (unsigned long long)sh->sector); > @@ -1066,10 +1112,10 @@ static void ops_run_biofill(struct strip > dev->toread = NULL; > spin_unlock_irq(&sh->stripe_lock); > while (rbi && rbi->bi_iter.bi_sector < > - dev->sector + STRIPE_SECTORS) { > - tx = async_copy_data(0, rbi, &dev->page, > - dev->sector, tx, sh); > - rbi = r5_next_bio(rbi, dev->sector); > + dev->sector + STRIPE_SECTORS(sh->raid_conf)) { > + tx = async_copy_data(0, rbi, dev->pages, > + dev->sector, tx, sh, &dummy); > + rbi = r5_next_bio(sh->raid_conf, rbi, dev->sector); > } > } > } > @@ -1112,40 +1158,64 @@ static void ops_complete_compute(void *s > > /* return a pointer to the address conversion region of the scribble buffer */ > static addr_conv_t *to_addr_conv(struct stripe_head *sh, > - struct raid5_percpu *percpu) > + struct raid5_percpu *percpu, int page_index) > +{ > + > + return percpu->scribble + sizeof(struct page *) * (sh->disks + 2) + > + page_index * (sh->raid_conf->scribble_len / > + STRIPE_PAGES(sh->raid_conf)); > +} > + > +static struct page **to_scribble_page(struct stripe_head *sh, > + struct raid5_percpu *percpu, int page_index) > { > - return percpu->scribble + sizeof(struct page *) * (sh->disks + 2); > + return percpu->scribble + page_index * (sh->raid_conf->scribble_len / > + STRIPE_PAGES(sh->raid_conf)); > } > > static struct dma_async_tx_descriptor * > ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) > { > int disks = sh->disks; > - struct page **xor_srcs = percpu->scribble; > + struct page **xor_srcs; > int target = sh->ops.target; > struct r5dev *tgt = &sh->dev[target]; > - struct page *xor_dest = tgt->page; > - int count = 0; > - struct dma_async_tx_descriptor *tx; > + struct page *xor_dest; > + int count; > + struct dma_async_tx_descriptor *tx = NULL; > struct async_submit_ctl submit; > - int i; > + int i, j = 0; > > pr_debug("%s: stripe %llu block: %d\n", > __func__, (unsigned long long)sh->sector, target); > BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); > > +again: > + count = 0; > + xor_srcs = to_scribble_page(sh, percpu, j); > + xor_dest = tgt->pages[j]; > + > for (i = disks; i--; ) > if (i != target) > - xor_srcs[count++] = sh->dev[i].page; > + xor_srcs[count++] = sh->dev[i].pages[j]; > > - atomic_inc(&sh->count); > + if (j == STRIPE_PAGES(sh->raid_conf) - 1) { > + atomic_inc(&sh->count); > + > + init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, tx, > + ops_complete_compute, sh, to_addr_conv(sh, percpu, j)); > + } else > + init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, tx, > + NULL, NULL, to_addr_conv(sh, percpu, j)); > > - init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, > - ops_complete_compute, sh, to_addr_conv(sh, percpu)); > if (unlikely(count == 1)) > - tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); > + tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, PAGE_SIZE, &submit); > else > - tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); > + tx = async_xor(xor_dest, xor_srcs, 0, count, PAGE_SIZE, &submit); > + > + j++; > + if (j < STRIPE_PAGES(sh->raid_conf)) > + goto again; > > return tx; > } > @@ -1159,7 +1229,8 @@ ops_run_compute5(struct stripe_head *sh, > * destination buffer is recorded in srcs[count] and the Q 
destination > * is recorded in srcs[count+1]]. > */ > -static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) > +static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh, > + int page_index) > { > int disks = sh->disks; > int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); > @@ -1175,7 +1246,7 @@ static int set_syndrome_sources(struct p > do { > int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); > > - srcs[slot] = sh->dev[i].page; > + srcs[slot] = sh->dev[i].pages[page_index]; > i = raid6_next_disk(i, disks); > } while (i != d0_idx); > > @@ -1186,14 +1257,14 @@ static struct dma_async_tx_descriptor * > ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) > { > int disks = sh->disks; > - struct page **blocks = percpu->scribble; > + struct page **blocks; > int target; > int qd_idx = sh->qd_idx; > - struct dma_async_tx_descriptor *tx; > + struct dma_async_tx_descriptor *tx = NULL; > struct async_submit_ctl submit; > struct r5dev *tgt; > struct page *dest; > - int i; > + int i, j = 0; > int count; > > if (sh->ops.target < 0) > @@ -1209,40 +1280,57 @@ ops_run_compute6_1(struct stripe_head *s > > tgt = &sh->dev[target]; > BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); > - dest = tgt->page; > > - atomic_inc(&sh->count); > +again: > + dest = tgt->pages[j]; > + blocks = to_scribble_page(sh, percpu, j); > + > + if (j == STRIPE_PAGES(sh->raid_conf) - 1) > + atomic_inc(&sh->count); > > if (target == qd_idx) { > - count = set_syndrome_sources(blocks, sh); > + count = set_syndrome_sources(blocks, sh, j); > blocks[count] = NULL; /* regenerating p is not necessary */ > BUG_ON(blocks[count+1] != dest); /* q should already be set */ > - init_async_submit(&submit, ASYNC_TX_FENCE, NULL, > - ops_complete_compute, sh, > - to_addr_conv(sh, percpu)); > - tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); > + > + if (j == STRIPE_PAGES(sh->raid_conf) - 1) > + init_async_submit(&submit, ASYNC_TX_FENCE, tx, > + ops_complete_compute, sh, > + to_addr_conv(sh, percpu, j)); > + else > + init_async_submit(&submit, ASYNC_TX_FENCE, tx, > + NULL, NULL, to_addr_conv(sh, percpu, j)); > + tx = async_gen_syndrome(blocks, 0, count+2, PAGE_SIZE, &submit); > } else { > /* Compute any data- or p-drive using XOR */ > count = 0; > for (i = disks; i-- ; ) { > if (i == target || i == qd_idx) > continue; > - blocks[count++] = sh->dev[i].page; > + blocks[count++] = sh->dev[i].pages[j]; > } > > - init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, > - NULL, ops_complete_compute, sh, > - to_addr_conv(sh, percpu)); > - tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); > + if (j == STRIPE_PAGES(sh->raid_conf) - 1) > + init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, > + tx, ops_complete_compute, sh, > + to_addr_conv(sh, percpu, j)); > + else > + init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, > + tx, NULL, NULL, > + to_addr_conv(sh, percpu, j)); > + tx = async_xor(dest, blocks, 0, count, PAGE_SIZE, &submit); > } > > + j++; > + if (j < STRIPE_PAGES(sh->raid_conf)) > + goto again; > return tx; > } > > static struct dma_async_tx_descriptor * > ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) > { > - int i, count, disks = sh->disks; > + int i, count, disks = sh->disks, j = 0; > int syndrome_disks = sh->ddf_layout ? 
disks : disks-2; > int d0_idx = raid6_d0(sh); > int faila = -1, failb = -1; > @@ -1250,8 +1338,8 @@ ops_run_compute6_2(struct stripe_head *s > int target2 = sh->ops.target2; > struct r5dev *tgt = &sh->dev[target]; > struct r5dev *tgt2 = &sh->dev[target2]; > - struct dma_async_tx_descriptor *tx; > - struct page **blocks = percpu->scribble; > + struct dma_async_tx_descriptor *tx = NULL; > + struct page **blocks; > struct async_submit_ctl submit; > > pr_debug("%s: stripe %llu block1: %d block2: %d\n", > @@ -1260,6 +1348,8 @@ ops_run_compute6_2(struct stripe_head *s > BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); > BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags)); > > +again: > + blocks = to_scribble_page(sh, percpu, j); > /* we need to open-code set_syndrome_sources to handle the > * slot number conversion for 'faila' and 'failb' > */ > @@ -1270,7 +1360,7 @@ ops_run_compute6_2(struct stripe_head *s > do { > int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); > > - blocks[slot] = sh->dev[i].page; > + blocks[slot] = sh->dev[i].pages[j]; > > if (i == target) > faila = slot; > @@ -1285,17 +1375,23 @@ ops_run_compute6_2(struct stripe_head *s > pr_debug("%s: stripe: %llu faila: %d failb: %d\n", > __func__, (unsigned long long)sh->sector, faila, failb); > > - atomic_inc(&sh->count); > + if (j == STRIPE_PAGES(sh->raid_conf) - 1) > + atomic_inc(&sh->count); > > if (failb == syndrome_disks+1) { > /* Q disk is one of the missing disks */ > if (faila == syndrome_disks) { > /* Missing P+Q, just recompute */ > - init_async_submit(&submit, ASYNC_TX_FENCE, NULL, > + if (j == STRIPE_PAGES(sh->raid_conf) - 1) > + init_async_submit(&submit, ASYNC_TX_FENCE, tx, > ops_complete_compute, sh, > - to_addr_conv(sh, percpu)); > - return async_gen_syndrome(blocks, 0, syndrome_disks+2, > - STRIPE_SIZE, &submit); > + to_addr_conv(sh, percpu, j)); > + else > + init_async_submit(&submit, ASYNC_TX_FENCE, tx, > + NULL, NULL, > + to_addr_conv(sh, percpu, j)); > + tx = async_gen_syndrome(blocks, 0, syndrome_disks+2, > + PAGE_SIZE, &submit); > } else { > struct page *dest; > int data_target; > @@ -1311,39 +1407,55 @@ ops_run_compute6_2(struct stripe_head *s > for (i = disks; i-- ; ) { > if (i == data_target || i == qd_idx) > continue; > - blocks[count++] = sh->dev[i].page; > + blocks[count++] = sh->dev[i].pages[j]; > } > - dest = sh->dev[data_target].page; > + dest = sh->dev[data_target].pages[j]; > init_async_submit(&submit, > ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, > - NULL, NULL, NULL, > - to_addr_conv(sh, percpu)); > - tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, > + tx, NULL, NULL, > + to_addr_conv(sh, percpu, j)); > + tx = async_xor(dest, blocks, 0, count, PAGE_SIZE, > &submit); > > - count = set_syndrome_sources(blocks, sh); > - init_async_submit(&submit, ASYNC_TX_FENCE, tx, > + count = set_syndrome_sources(blocks, sh, j); > + if (j == STRIPE_PAGES(sh->raid_conf) - 1) > + init_async_submit(&submit, ASYNC_TX_FENCE, tx, > ops_complete_compute, sh, > - to_addr_conv(sh, percpu)); > - return async_gen_syndrome(blocks, 0, count+2, > - STRIPE_SIZE, &submit); > + to_addr_conv(sh, percpu, j)); > + else > + init_async_submit(&submit, ASYNC_TX_FENCE, tx, > + NULL, NULL, > + to_addr_conv(sh, percpu, j)); > + tx = async_gen_syndrome(blocks, 0, count+2, > + PAGE_SIZE, &submit); > } > } else { > - init_async_submit(&submit, ASYNC_TX_FENCE, NULL, > + if (j == STRIPE_PAGES(sh->raid_conf) - 1) > + init_async_submit(&submit, ASYNC_TX_FENCE, tx, > ops_complete_compute, sh, > - to_addr_conv(sh, percpu)); > + to_addr_conv(sh, 
percpu, j)); > + else > + init_async_submit(&submit, ASYNC_TX_FENCE, tx, > + NULL, NULL, to_addr_conv(sh, percpu, j)); > + > if (failb == syndrome_disks) { > /* We're missing D+P. */ > - return async_raid6_datap_recov(syndrome_disks+2, > - STRIPE_SIZE, faila, > + tx = async_raid6_datap_recov(syndrome_disks+2, > + PAGE_SIZE, faila, > blocks, &submit); > } else { > /* We're missing D+D. */ > - return async_raid6_2data_recov(syndrome_disks+2, > - STRIPE_SIZE, faila, failb, > + tx = async_raid6_2data_recov(syndrome_disks+2, > + PAGE_SIZE, faila, failb, > blocks, &submit); > } > } > + > + j++; > + if (j < STRIPE_PAGES(sh->raid_conf)) > + goto again; > + > + return tx; > } > > > @@ -1360,26 +1472,40 @@ ops_run_prexor(struct stripe_head *sh, s > struct dma_async_tx_descriptor *tx) > { > int disks = sh->disks; > - struct page **xor_srcs = percpu->scribble; > - int count = 0, pd_idx = sh->pd_idx, i; > + struct page **xor_srcs; > + int count, pd_idx = sh->pd_idx, i, j = 0; > struct async_submit_ctl submit; > > /* existing parity data subtracted */ > - struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; > + struct page *xor_dest; > > pr_debug("%s: stripe %llu\n", __func__, > (unsigned long long)sh->sector); > > +again: > + count = 0; > + xor_srcs = to_scribble_page(sh, percpu, j); > + /* existing parity data subtracted */ > + xor_dest = xor_srcs[count++] = sh->dev[pd_idx].pages[j]; > + > for (i = disks; i--; ) { > struct r5dev *dev = &sh->dev[i]; > /* Only process blocks that are known to be uptodate */ > if (test_bit(R5_Wantdrain, &dev->flags)) > - xor_srcs[count++] = dev->page; > + xor_srcs[count++] = dev->pages[j]; > } > > - init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, > - ops_complete_prexor, sh, to_addr_conv(sh, percpu)); > - tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); > + if (j == STRIPE_PAGES(sh->raid_conf) - 1) > + init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, > + ops_complete_prexor, sh, to_addr_conv(sh, percpu, j)); > + else > + init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, > + NULL, NULL, to_addr_conv(sh, percpu, j)); > + tx = async_xor(xor_dest, xor_srcs, 0, count, PAGE_SIZE, &submit); > + > + j++; > + if (j < STRIPE_PAGES(sh->raid_conf)) > + goto again; > > return tx; > } > @@ -1406,10 +1532,10 @@ ops_run_biodrain(struct stripe_head *sh, > BUG_ON(dev->written); > wbi = dev->written = chosen; > spin_unlock_irq(&sh->stripe_lock); > - WARN_ON(dev->page != dev->orig_page); > + WARN_ON(dev->pages[0] != dev->orig_pages[0]); > > while (wbi && wbi->bi_iter.bi_sector < > - dev->sector + STRIPE_SECTORS) { > + dev->sector + STRIPE_SECTORS(sh->raid_conf)) { > if (wbi->bi_rw & REQ_FUA) > set_bit(R5_WantFUA, &dev->flags); > if (wbi->bi_rw & REQ_SYNC) > @@ -1417,15 +1543,16 @@ ops_run_biodrain(struct stripe_head *sh, > if (wbi->bi_rw & REQ_DISCARD) > set_bit(R5_Discard, &dev->flags); > else { > - tx = async_copy_data(1, wbi, &dev->page, > - dev->sector, tx, sh); > - if (dev->page != dev->orig_page) { > + int skip_copy; > + tx = async_copy_data(1, wbi, dev->pages, > + dev->sector, tx, sh, &skip_copy); > + if (skip_copy) { > set_bit(R5_SkipCopy, &dev->flags); > clear_bit(R5_UPTODATE, &dev->flags); > clear_bit(R5_OVERWRITE, &dev->flags); > } > } > - wbi = r5_next_bio(wbi, dev->sector); > + wbi = r5_next_bio(sh->raid_conf, wbi, dev->sector); > } > } > } > @@ -1482,9 +1609,9 @@ ops_run_reconstruct5(struct stripe_head > struct dma_async_tx_descriptor *tx) > { > int disks = sh->disks; > - struct page 
**xor_srcs = percpu->scribble; > + struct page **xor_srcs; > struct async_submit_ctl submit; > - int count = 0, pd_idx = sh->pd_idx, i; > + int count, pd_idx = sh->pd_idx, i, j = 0; > struct page *xor_dest; > int prexor = 0; > unsigned long flags; > @@ -1504,23 +1631,27 @@ ops_run_reconstruct5(struct stripe_head > ops_complete_reconstruct(sh); > return; > } > + > +again: > + count = 0; > + xor_srcs = to_scribble_page(sh, percpu, j); > /* check if prexor is active which means only process blocks > * that are part of a read-modify-write (written) > */ > if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { > prexor = 1; > - xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; > + xor_dest = xor_srcs[count++] = sh->dev[pd_idx].pages[j]; > for (i = disks; i--; ) { > struct r5dev *dev = &sh->dev[i]; > if (dev->written) > - xor_srcs[count++] = dev->page; > + xor_srcs[count++] = dev->pages[j]; > } > } else { > - xor_dest = sh->dev[pd_idx].page; > + xor_dest = sh->dev[pd_idx].pages[j]; > for (i = disks; i--; ) { > struct r5dev *dev = &sh->dev[i]; > if (i != pd_idx) > - xor_srcs[count++] = dev->page; > + xor_srcs[count++] = dev->pages[j]; > } > } > > @@ -1529,17 +1660,28 @@ ops_run_reconstruct5(struct stripe_head > * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST > * for the synchronous xor case > */ > - flags = ASYNC_TX_ACK | > - (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); > + if (j == STRIPE_PAGES(sh->raid_conf) - 1) { > + flags = ASYNC_TX_ACK | > + (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); > > - atomic_inc(&sh->count); > + atomic_inc(&sh->count); > + > + init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh, > + to_addr_conv(sh, percpu, j)); > + } else { > + flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST; > + init_async_submit(&submit, flags, tx, NULL, NULL, > + to_addr_conv(sh, percpu, j)); > + } > > - init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh, > - to_addr_conv(sh, percpu)); > if (unlikely(count == 1)) > - tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); > + tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, PAGE_SIZE, &submit); > else > - tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); > + tx = async_xor(xor_dest, xor_srcs, 0, count, PAGE_SIZE, &submit); > + > + j++; > + if (j < STRIPE_PAGES(sh->raid_conf)) > + goto again; > } > > static void > @@ -1547,8 +1689,8 @@ ops_run_reconstruct6(struct stripe_head > struct dma_async_tx_descriptor *tx) > { > struct async_submit_ctl submit; > - struct page **blocks = percpu->scribble; > - int count, i; > + struct page **blocks; > + int count, i, j = 0; > > pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); > > @@ -1566,22 +1708,38 @@ ops_run_reconstruct6(struct stripe_head > return; > } > > - count = set_syndrome_sources(blocks, sh); > +again: > + blocks = to_scribble_page(sh, percpu, j); > > - atomic_inc(&sh->count); > + count = set_syndrome_sources(blocks, sh, j); > + > + if (j == STRIPE_PAGES(sh->raid_conf) - 1) { > + atomic_inc(&sh->count); > > - init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct, > - sh, to_addr_conv(sh, percpu)); > - async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); > + init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct, > + sh, to_addr_conv(sh, percpu, j)); > + } else > + init_async_submit(&submit, 0, tx, NULL, > + NULL, to_addr_conv(sh, percpu, j)); > + tx = async_gen_syndrome(blocks, 0, count+2, PAGE_SIZE, &submit); > + > 
+ j++; > + if (j < STRIPE_PAGES(sh->raid_conf)) > + goto again; > } > > static void ops_complete_check(void *stripe_head_ref) > { > struct stripe_head *sh = stripe_head_ref; > + int i; > > pr_debug("%s: stripe %llu\n", __func__, > (unsigned long long)sh->sector); > > + sh->ops.zero_sum_result = 0; > + for (i = 0; i < STRIPE_PAGES(sh->raid_conf); i++) > + sh->ops.zero_sum_result |= sh->ops.sum_results[i]; > + > sh->check_state = check_state_check_result; > set_bit(STRIPE_HANDLE, &sh->state); > release_stripe(sh); > @@ -1593,28 +1751,34 @@ static void ops_run_check_p(struct strip > int pd_idx = sh->pd_idx; > int qd_idx = sh->qd_idx; > struct page *xor_dest; > - struct page **xor_srcs = percpu->scribble; > - struct dma_async_tx_descriptor *tx; > + struct page **xor_srcs; > + struct dma_async_tx_descriptor *tx = NULL; > struct async_submit_ctl submit; > int count; > - int i; > + int i, j = 0; > > pr_debug("%s: stripe %llu\n", __func__, > (unsigned long long)sh->sector); > > +again: > + xor_srcs = to_scribble_page(sh, percpu, j); > count = 0; > - xor_dest = sh->dev[pd_idx].page; > + xor_dest = sh->dev[pd_idx].pages[j]; > xor_srcs[count++] = xor_dest; > for (i = disks; i--; ) { > if (i == pd_idx || i == qd_idx) > continue; > - xor_srcs[count++] = sh->dev[i].page; > + xor_srcs[count++] = sh->dev[i].pages[j]; > } > > - init_async_submit(&submit, 0, NULL, NULL, NULL, > - to_addr_conv(sh, percpu)); > - tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, > - &sh->ops.zero_sum_result, &submit); > + init_async_submit(&submit, 0, tx, NULL, NULL, > + to_addr_conv(sh, percpu, j)); > + tx = async_xor_val(xor_dest, xor_srcs, 0, count, PAGE_SIZE, > + &sh->ops.sum_results[j], &submit); > + > + j++; > + if (j < STRIPE_PAGES(sh->raid_conf)) > + goto again; > > atomic_inc(&sh->count); > init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); > @@ -1623,22 +1787,32 @@ static void ops_run_check_p(struct strip > > static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) > { > - struct page **srcs = percpu->scribble; > + struct page **srcs; > struct async_submit_ctl submit; > - int count; > + int count, j = 0; > + struct dma_async_tx_descriptor *tx = NULL; > > pr_debug("%s: stripe %llu checkp: %d\n", __func__, > (unsigned long long)sh->sector, checkp); > > - count = set_syndrome_sources(srcs, sh); > +again: > + srcs = to_scribble_page(sh, percpu, j); > + count = set_syndrome_sources(srcs, sh, j); > if (!checkp) > srcs[count] = NULL; > > - atomic_inc(&sh->count); > - init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, > - sh, to_addr_conv(sh, percpu)); > - async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, > - &sh->ops.zero_sum_result, percpu->spare_page, &submit); > + init_async_submit(&submit, 0, tx, NULL, > + NULL, to_addr_conv(sh, percpu, j)); > + async_syndrome_val(srcs, 0, count+2, PAGE_SIZE, > + &sh->ops.sum_results[j], percpu->spare_pages[j], &submit); > + > + j++; > + if (j < STRIPE_PAGES(sh->raid_conf)) > + goto again; > + > + atomic_inc(&sh->count); > + init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); > + tx = async_trigger_callback(&submit); > } > > static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) > @@ -1706,6 +1880,37 @@ static void raid_run_ops(struct stripe_h > put_cpu(); > } > > +#define STRIPE_ALLOC_SIZE(conf, devs) \ > + (sizeof(struct stripe_head) + (devs - 1) * sizeof(struct r5dev) + \ > + sizeof(enum sum_check_flags) * STRIPE_PAGES(conf) + \ > + sizeof(struct 
bio_vec) * devs * STRIPE_PAGES(conf) * 2 + \ > + sizeof(struct page *) * devs * STRIPE_PAGES(conf) * 2) > + > +static void init_stripe_pointer(struct r5conf *conf, struct stripe_head *sh, int devs) > +{ > + void *p = sh; > + struct bio_vec *vecs, *rvecs; > + struct page **pages, **orig_pages; > + int i; > + > + p += sizeof(struct stripe_head) + (devs - 1) * sizeof(struct r5dev); > + sh->ops.sum_results = p; > + p += sizeof(enum sum_check_flags) * STRIPE_PAGES(conf); > + vecs = p; > + p += sizeof(struct bio_vec) * devs * STRIPE_PAGES(conf); > + rvecs = p; > + p += sizeof(struct bio_vec) * devs * STRIPE_PAGES(conf); > + pages = p; > + p += sizeof(struct page *) * devs * STRIPE_PAGES(conf); > + orig_pages = p; > + for (i = 0; i < devs; i++) { > + sh->dev[i].vecs = vecs + i * STRIPE_PAGES(conf); > + sh->dev[i].rvecs = rvecs + i * STRIPE_PAGES(conf); > + sh->dev[i].pages = pages + i * STRIPE_PAGES(conf); > + sh->dev[i].orig_pages = orig_pages + i * STRIPE_PAGES(conf); > + } > +} > + > static int grow_one_stripe(struct r5conf *conf, int hash) > { > struct stripe_head *sh; > @@ -1713,6 +1918,7 @@ static int grow_one_stripe(struct r5conf > if (!sh) > return 0; > > + init_stripe_pointer(conf, sh, conf->pool_size); > sh->raid_conf = conf; > > spin_lock_init(&sh->stripe_lock); > @@ -1747,7 +1953,7 @@ static int grow_stripes(struct r5conf *c > > conf->active_name = 0; > sc = kmem_cache_create(conf->cache_name[conf->active_name], > - sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), > + STRIPE_ALLOC_SIZE(conf, devs), > 0, 0, NULL); > if (!sc) > return 1; > @@ -1776,11 +1982,12 @@ static int grow_stripes(struct r5conf *c > * calculate over all devices (not just the data blocks), using zeros in place > * of the P and Q blocks. > */ > -static size_t scribble_len(int num) > +static size_t scribble_len(struct r5conf *conf, int num) > { > size_t len; > > len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2); > + len *= STRIPE_PAGES(conf); > > return len; > } > @@ -1816,7 +2023,7 @@ static int resize_stripes(struct r5conf > unsigned long cpu; > int err; > struct kmem_cache *sc; > - int i; > + int i, j; > int hash, cnt; > > if (newsize <= conf->pool_size) > @@ -1828,7 +2035,7 @@ static int resize_stripes(struct r5conf > > /* Step 1 */ > sc = kmem_cache_create(conf->cache_name[1-conf->active_name], > - sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev), > + STRIPE_ALLOC_SIZE(conf, newsize), > 0, 0, NULL); > if (!sc) > return -ENOMEM; > @@ -1838,6 +2045,8 @@ static int resize_stripes(struct r5conf > if (!nsh) > break; > > + init_stripe_pointer(conf, nsh, newsize); > + > nsh->raid_conf = conf; > spin_lock_init(&nsh->stripe_lock); > > @@ -1869,11 +2078,17 @@ static int resize_stripes(struct r5conf > unlock_device_hash_lock(conf, hash); > atomic_set(&nsh->count, 1); > for(i=0; i<conf->pool_size; i++) { > - nsh->dev[i].page = osh->dev[i].page; > - nsh->dev[i].orig_page = osh->dev[i].page; > + for (j = 0; j < STRIPE_PAGES(conf); j++) { > + nsh->dev[i].pages[j] = osh->dev[i].pages[j]; > + nsh->dev[i].orig_pages[j] = osh->dev[i].orig_pages[j]; > + } > + } > + for( ; i < newsize; i++) { > + for (j = 0; j < STRIPE_PAGES(conf); j++) { > + nsh->dev[i].pages[j] = NULL; > + nsh->dev[i].orig_pages[j] = NULL; > + } > } > - for( ; i<newsize; i++) > - nsh->dev[i].page = NULL; > nsh->hash_lock_index = hash; > kmem_cache_free(conf->slab_cache, osh); > cnt++; > @@ -1900,7 +2115,7 @@ static int resize_stripes(struct r5conf > err = -ENOMEM; > > get_online_cpus(); > - conf->scribble_len = 
scribble_len(newsize); > + conf->scribble_len = scribble_len(conf, newsize); > for_each_present_cpu(cpu) { > struct raid5_percpu *percpu; > void *scribble; > @@ -1923,14 +2138,21 @@ static int resize_stripes(struct r5conf > nsh = list_entry(newstripes.next, struct stripe_head, lru); > list_del_init(&nsh->lru); > > - for (i=conf->raid_disks; i < newsize; i++) > - if (nsh->dev[i].page == NULL) { > - struct page *p = alloc_page(GFP_NOIO); > - nsh->dev[i].page = p; > - nsh->dev[i].orig_page = p; > - if (!p) > + for (i=conf->raid_disks; i < newsize; i++) { > + for (j = 0; j < STRIPE_PAGES(conf); j++) { > + struct page *p; > + if (nsh->dev[i].orig_pages[j]) > + continue; > + > + p = alloc_page(GFP_NOIO); > + if (!p) { > err = -ENOMEM; > + continue; > + } > + nsh->dev[i].orig_pages[j] = p; > + nsh->dev[i].pages[j] = p; > } > + } > release_stripe(nsh); > } > /* critical section pass, GFP_NOIO no longer needed */ > @@ -2015,10 +2237,10 @@ static void raid5_end_read_request(struc > KERN_INFO > "md/raid:%s: read error corrected" > " (%lu sectors at %llu on %s)\n", > - mdname(conf->mddev), STRIPE_SECTORS, > + mdname(conf->mddev), STRIPE_SECTORS(conf), > (unsigned long long)s, > bdevname(rdev->bdev, b)); > - atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); > + atomic_add(STRIPE_SECTORS(conf), &rdev->corrected_errors); > clear_bit(R5_ReadError, &sh->dev[i].flags); > clear_bit(R5_ReWrite, &sh->dev[i].flags); > } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) > @@ -2082,7 +2304,7 @@ static void raid5_end_read_request(struc > if (!(set_bad > && test_bit(In_sync, &rdev->flags) > && rdev_set_badblocks( > - rdev, sh->sector, STRIPE_SECTORS, 0))) > + rdev, sh->sector, STRIPE_SECTORS(conf), 0))) > md_error(conf->mddev, rdev); > } > } > @@ -2133,7 +2355,7 @@ static void raid5_end_write_request(stru > if (!uptodate) > md_error(conf->mddev, rdev); > else if (is_badblock(rdev, sh->sector, > - STRIPE_SECTORS, > + STRIPE_SECTORS(conf), > &first_bad, &bad_sectors)) > set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); > } else { > @@ -2145,7 +2367,7 @@ static void raid5_end_write_request(stru > set_bit(MD_RECOVERY_NEEDED, > &rdev->mddev->recovery); > } else if (is_badblock(rdev, sh->sector, > - STRIPE_SECTORS, > + STRIPE_SECTORS(conf), > &first_bad, &bad_sectors)) { > set_bit(R5_MadeGood, &sh->dev[i].flags); > if (test_bit(R5_ReadError, &sh->dev[i].flags)) > @@ -2171,13 +2393,9 @@ static void raid5_build_block(struct str > struct r5dev *dev = &sh->dev[i]; > > bio_init(&dev->req); > - dev->req.bi_io_vec = &dev->vec; > - dev->req.bi_max_vecs = 1; > dev->req.bi_private = sh; > > bio_init(&dev->rreq); > - dev->rreq.bi_io_vec = &dev->rvec; > - dev->rreq.bi_max_vecs = 1; > dev->rreq.bi_private = sh; > > dev->flags = 0; > @@ -2674,13 +2892,13 @@ static int add_stripe_bio(struct stripe_ > /* check if page is covered */ > sector_t sector = sh->dev[dd_idx].sector; > for (bi=sh->dev[dd_idx].towrite; > - sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && > + sector < sh->dev[dd_idx].sector + STRIPE_SECTORS(conf) && > bi && bi->bi_iter.bi_sector <= sector; > - bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { > + bi = r5_next_bio(conf, bi, sh->dev[dd_idx].sector)) { > if (bio_end_sector(bi) >= sector) > sector = bio_end_sector(bi); > } > - if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) > + if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS(conf)) > set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); > } > > @@ -2691,7 +2909,7 @@ static int add_stripe_bio(struct stripe_ > > if (conf->mddev->bitmap && firstwrite) { > 
bitmap_startwrite(conf->mddev->bitmap, sh->sector, > - STRIPE_SECTORS, 0); > + STRIPE_SECTORS(conf), 0); > sh->bm_seq = conf->seq_flush+1; > set_bit(STRIPE_BIT_DELAY, &sh->state); > } > @@ -2744,7 +2962,7 @@ handle_failed_stripe(struct r5conf *conf > if (!rdev_set_badblocks( > rdev, > sh->sector, > - STRIPE_SECTORS, 0)) > + STRIPE_SECTORS(conf), 0)) > md_error(conf->mddev, rdev); > rdev_dec_pending(rdev, conf->mddev); > } > @@ -2761,8 +2979,8 @@ handle_failed_stripe(struct r5conf *conf > wake_up(&conf->wait_for_overlap); > > while (bi && bi->bi_iter.bi_sector < > - sh->dev[i].sector + STRIPE_SECTORS) { > - struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); > + sh->dev[i].sector + STRIPE_SECTORS(conf)) { > + struct bio *nextbi = r5_next_bio(conf, bi, sh->dev[i].sector); > clear_bit(BIO_UPTODATE, &bi->bi_flags); > if (!raid5_dec_bi_active_stripes(bi)) { > md_write_end(conf->mddev); > @@ -2773,20 +2991,20 @@ handle_failed_stripe(struct r5conf *conf > } > if (bitmap_end) > bitmap_endwrite(conf->mddev->bitmap, sh->sector, > - STRIPE_SECTORS, 0, 0); > + STRIPE_SECTORS(conf), 0, 0); > bitmap_end = 0; > /* and fail all 'written' */ > bi = sh->dev[i].written; > sh->dev[i].written = NULL; > if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) { > WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); > - sh->dev[i].page = sh->dev[i].orig_page; > + reset_stripe_devpage(sh, i); > } > > if (bi) bitmap_end = 1; > while (bi && bi->bi_iter.bi_sector < > - sh->dev[i].sector + STRIPE_SECTORS) { > - struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); > + sh->dev[i].sector + STRIPE_SECTORS(conf)) { > + struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector); > clear_bit(BIO_UPTODATE, &bi->bi_flags); > if (!raid5_dec_bi_active_stripes(bi)) { > md_write_end(conf->mddev); > @@ -2809,9 +3027,9 @@ handle_failed_stripe(struct r5conf *conf > if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) > wake_up(&conf->wait_for_overlap); > while (bi && bi->bi_iter.bi_sector < > - sh->dev[i].sector + STRIPE_SECTORS) { > + sh->dev[i].sector + STRIPE_SECTORS(conf)) { > struct bio *nextbi = > - r5_next_bio(bi, sh->dev[i].sector); > + r5_next_bio(conf, bi, sh->dev[i].sector); > clear_bit(BIO_UPTODATE, &bi->bi_flags); > if (!raid5_dec_bi_active_stripes(bi)) { > bi->bi_next = *return_bi; > @@ -2822,7 +3040,7 @@ handle_failed_stripe(struct r5conf *conf > } > if (bitmap_end) > bitmap_endwrite(conf->mddev->bitmap, sh->sector, > - STRIPE_SECTORS, 0, 0); > + STRIPE_SECTORS(conf), 0, 0); > /* If we were in the middle of a write the parity block might > * still be locked - so just clear all R5_LOCKED flags > */ > @@ -2863,21 +3081,21 @@ handle_failed_sync(struct r5conf *conf, > && !test_bit(Faulty, &rdev->flags) > && !test_bit(In_sync, &rdev->flags) > && !rdev_set_badblocks(rdev, sh->sector, > - STRIPE_SECTORS, 0)) > + STRIPE_SECTORS(conf), 0)) > abort = 1; > rdev = conf->disks[i].replacement; > if (rdev > && !test_bit(Faulty, &rdev->flags) > && !test_bit(In_sync, &rdev->flags) > && !rdev_set_badblocks(rdev, sh->sector, > - STRIPE_SECTORS, 0)) > + STRIPE_SECTORS(conf), 0)) > abort = 1; > } > if (abort) > conf->recovery_disabled = > conf->mddev->recovery_disabled; > } > - md_done_sync(conf->mddev, STRIPE_SECTORS, !abort); > + md_done_sync(conf->mddev, STRIPE_SECTORS(conf), !abort); > } > > static int want_replace(struct stripe_head *sh, int disk_idx) > @@ -3036,13 +3254,13 @@ static void handle_stripe_clean_event(st > clear_bit(R5_UPTODATE, &dev->flags); > if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) { > 
WARN_ON(test_bit(R5_UPTODATE, &dev->flags)); > - dev->page = dev->orig_page; > + reset_stripe_devpage(sh, i); > } > wbi = dev->written; > dev->written = NULL; > while (wbi && wbi->bi_iter.bi_sector < > - dev->sector + STRIPE_SECTORS) { > - wbi2 = r5_next_bio(wbi, dev->sector); > + dev->sector + STRIPE_SECTORS(conf)) { > + wbi2 = r5_next_bio(conf, wbi, dev->sector); > if (!raid5_dec_bi_active_stripes(wbi)) { > md_write_end(conf->mddev); > wbi->bi_next = *return_bi; > @@ -3051,13 +3269,13 @@ static void handle_stripe_clean_event(st > wbi = wbi2; > } > bitmap_endwrite(conf->mddev->bitmap, sh->sector, > - STRIPE_SECTORS, > + STRIPE_SECTORS(conf), > !test_bit(STRIPE_DEGRADED, &sh->state), > 0); > } else if (test_bit(R5_Discard, &dev->flags)) > discard_pending = 1; > WARN_ON(test_bit(R5_SkipCopy, &dev->flags)); > - WARN_ON(dev->page != dev->orig_page); > + WARN_ON(dev->pages[0] != dev->orig_pages[0]); > } > if (!discard_pending && > test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { > @@ -3274,7 +3492,7 @@ static void handle_parity_checks5(struct > */ > set_bit(STRIPE_INSYNC, &sh->state); > else { > - atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); > + atomic64_add(STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches); > if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) > /* don't try to repair!! */ > set_bit(STRIPE_INSYNC, &sh->state); > @@ -3426,7 +3644,7 @@ static void handle_parity_checks6(struct > */ > } > } else { > - atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); > + atomic64_add(STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches); > if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) > /* don't try to repair!! */ > set_bit(STRIPE_INSYNC, &sh->state); > @@ -3466,7 +3684,7 @@ static void handle_parity_checks6(struct > > static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) > { > - int i; > + int i, k; > > /* We have read all the blocks in this stripe and now we need to > * copy some of them into a target stripe for expand. 
> @@ -3496,11 +3714,13 @@ static void handle_stripe_expansion(stru > continue; > } > > - /* place all the copies on one channel */ > - init_async_submit(&submit, 0, tx, NULL, NULL, NULL); > - tx = async_memcpy(sh2->dev[dd_idx].page, > - sh->dev[i].page, 0, 0, STRIPE_SIZE, > - &submit); > + for (k = 0; k < STRIPE_PAGES(sh->raid_conf); k++) { > + /* place all the copies on one channel */ > + init_async_submit(&submit, 0, tx, NULL, NULL, NULL); > + tx = async_memcpy(sh2->dev[dd_idx].pages[k], > + sh->dev[i].pages[k], 0, 0, PAGE_SIZE, > + &submit); > + } > > set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); > set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); > @@ -3597,8 +3817,8 @@ static void analyse_stripe(struct stripe > */ > rdev = rcu_dereference(conf->disks[i].replacement); > if (rdev && !test_bit(Faulty, &rdev->flags) && > - rdev->recovery_offset >= sh->sector + STRIPE_SECTORS && > - !is_badblock(rdev, sh->sector, STRIPE_SECTORS, > + rdev->recovery_offset >= sh->sector + STRIPE_SECTORS(conf) && > + !is_badblock(rdev, sh->sector, STRIPE_SECTORS(conf), > &first_bad, &bad_sectors)) > set_bit(R5_ReadRepl, &dev->flags); > else { > @@ -3610,7 +3830,7 @@ static void analyse_stripe(struct stripe > if (rdev && test_bit(Faulty, &rdev->flags)) > rdev = NULL; > if (rdev) { > - is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, > + is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS(conf), > &first_bad, &bad_sectors); > if (s->blocked_rdev == NULL > && (test_bit(Blocked, &rdev->flags) > @@ -3637,7 +3857,7 @@ static void analyse_stripe(struct stripe > } > } else if (test_bit(In_sync, &rdev->flags)) > set_bit(R5_Insync, &dev->flags); > - else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) > + else if (sh->sector + STRIPE_SECTORS(conf) <= rdev->recovery_offset) > /* in sync if before recovery_offset */ > set_bit(R5_Insync, &dev->flags); > else if (test_bit(R5_UPTODATE, &dev->flags) && > @@ -3903,7 +4123,7 @@ static void handle_stripe(struct stripe_ > if ((s.syncing || s.replacing) && s.locked == 0 && > !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && > test_bit(STRIPE_INSYNC, &sh->state)) { > - md_done_sync(conf->mddev, STRIPE_SECTORS, 1); > + md_done_sync(conf->mddev, STRIPE_SECTORS(conf), 1); > clear_bit(STRIPE_SYNCING, &sh->state); > if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) > wake_up(&conf->wait_for_overlap); > @@ -3972,7 +4192,7 @@ static void handle_stripe(struct stripe_ > clear_bit(STRIPE_EXPAND_READY, &sh->state); > atomic_dec(&conf->reshape_stripes); > wake_up(&conf->wait_for_overlap); > - md_done_sync(conf->mddev, STRIPE_SECTORS, 1); > + md_done_sync(conf->mddev, STRIPE_SECTORS(conf), 1); > } > > if (s.expanding && s.locked == 0 && > @@ -4002,14 +4222,14 @@ finish: > /* We own a safe reference to the rdev */ > rdev = conf->disks[i].rdev; > if (!rdev_set_badblocks(rdev, sh->sector, > - STRIPE_SECTORS, 0)) > + STRIPE_SECTORS(conf), 0)) > md_error(conf->mddev, rdev); > rdev_dec_pending(rdev, conf->mddev); > } > if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { > rdev = conf->disks[i].rdev; > rdev_clear_badblocks(rdev, sh->sector, > - STRIPE_SECTORS, 0); > + STRIPE_SECTORS(conf), 0); > rdev_dec_pending(rdev, conf->mddev); > } > if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { > @@ -4018,7 +4238,7 @@ finish: > /* rdev have been moved down */ > rdev = conf->disks[i].rdev; > rdev_clear_badblocks(rdev, sh->sector, > - STRIPE_SECTORS, 0); > + STRIPE_SECTORS(conf), 0); > rdev_dec_pending(rdev, conf->mddev); > } > } > @@ -4502,7 +4722,7 @@ static void 
make_discard_request(struct > /* Skip discard while reshape is happening */ > return; > > - logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); > + logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS(conf)-1); > last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9); > > bi->bi_next = NULL; > @@ -4518,7 +4738,7 @@ static void make_discard_request(struct > last_sector *= conf->chunk_sectors; > > for (; logical_sector < last_sector; > - logical_sector += STRIPE_SECTORS) { > + logical_sector += STRIPE_SECTORS(conf)) { > DEFINE_WAIT(w); > int d; > again: > @@ -4560,7 +4780,7 @@ static void make_discard_request(struct > d++) > bitmap_startwrite(mddev->bitmap, > sh->sector, > - STRIPE_SECTORS, > + STRIPE_SECTORS(conf), > 0); > sh->bm_seq = conf->seq_flush + 1; > set_bit(STRIPE_BIT_DELAY, &sh->state); > @@ -4609,13 +4829,13 @@ static void make_request(struct mddev *m > return; > } > > - logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); > + logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS(conf)-1); > last_sector = bio_end_sector(bi); > bi->bi_next = NULL; > bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ > > prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); > - for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { > + for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS(conf)) { > int previous; > int seq; > > @@ -4895,7 +5115,7 @@ static sector_t reshape_request(struct m > } > > INIT_LIST_HEAD(&stripes); > - for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { > + for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS(conf)) { > int j; > int skipped_disk = 0; > sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1); > @@ -4906,6 +5126,7 @@ static sector_t reshape_request(struct m > */ > for (j=sh->disks; j--;) { > sector_t s; > + int k; > if (j == sh->pd_idx) > continue; > if (conf->level == 6 && > @@ -4916,7 +5137,8 @@ static sector_t reshape_request(struct m > skipped_disk = 1; > continue; > } > - memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); > + for (k = 0; k < STRIPE_PAGES(conf); k++) > + memset(page_address(sh->dev[j].pages[k]), 0, PAGE_SIZE); > set_bit(R5_Expanded, &sh->dev[j].flags); > set_bit(R5_UPTODATE, &sh->dev[j].flags); > } > @@ -4951,7 +5173,7 @@ static sector_t reshape_request(struct m > set_bit(STRIPE_EXPAND_SOURCE, &sh->state); > set_bit(STRIPE_HANDLE, &sh->state); > release_stripe(sh); > - first_sector += STRIPE_SECTORS; > + first_sector += STRIPE_SECTORS(conf); > } > /* Now that the sources are clearly marked, we can release > * the destination stripes > @@ -5046,11 +5268,11 @@ static inline sector_t sync_request(stru > if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && > !conf->fullsync && > !bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && > - sync_blocks >= STRIPE_SECTORS) { > + sync_blocks >= STRIPE_SECTORS(conf)) { > /* we can skip this block, and probably more */ > - sync_blocks /= STRIPE_SECTORS; > + sync_blocks /= STRIPE_SECTORS(conf); > *skipped = 1; > - return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ > + return sync_blocks * STRIPE_SECTORS(conf); /* keep things rounded to whole stripes */ > } > > bitmap_cond_end_sync(mddev->bitmap, sector_nr); > @@ -5078,7 +5300,7 @@ static inline sector_t sync_request(stru > > release_stripe(sh); > > - return STRIPE_SECTORS; > + return STRIPE_SECTORS(conf); > } > > static int retry_aligned_read(struct r5conf *conf, 
struct bio *raid_bio) > @@ -5101,14 +5323,14 @@ static int retry_aligned_read(struct r5 > int handled = 0; > > logical_sector = raid_bio->bi_iter.bi_sector & > - ~((sector_t)STRIPE_SECTORS-1); > + ~((sector_t)STRIPE_SECTORS(conf)-1); > sector = raid5_compute_sector(conf, logical_sector, > 0, &dd_idx, NULL); > last_sector = bio_end_sector(raid_bio); > > for (; logical_sector < last_sector; > - logical_sector += STRIPE_SECTORS, > - sector += STRIPE_SECTORS, > + logical_sector += STRIPE_SECTORS(conf), > + sector += STRIPE_SECTORS(conf), > scnt++) { > > if (scnt < raid5_bi_processed_stripes(raid_bio)) > @@ -5607,20 +5829,42 @@ raid5_size(struct mddev *mddev, sector_t > > static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) > { > - safe_put_page(percpu->spare_page); > + int i; > + if (percpu->spare_pages) { > + for (i = 0; i < STRIPE_PAGES(conf); i++) > + safe_put_page(percpu->spare_pages[i]); > + kfree(percpu->spare_pages); > + } > kfree(percpu->scribble); > - percpu->spare_page = NULL; > + percpu->spare_pages = NULL; > percpu->scribble = NULL; > } > > static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) > { > - if (conf->level == 6 && !percpu->spare_page) > - percpu->spare_page = alloc_page(GFP_KERNEL); > + bool sp_alloc_fail = false; > + if (conf->level == 6 && !percpu->spare_pages) { > + struct page **pages; > + int i; > + > + pages = kzalloc(sizeof(struct page *) * STRIPE_PAGES(conf), > + GFP_KERNEL); > + sp_alloc_fail = true; > + if (pages) { > + percpu->spare_pages = pages; > + for (i = 0; i < STRIPE_PAGES(conf); i++) { > + pages[i] = alloc_page(GFP_KERNEL); > + if (!pages[i]) > + break; > + } > + if (i == STRIPE_PAGES(conf)) > + sp_alloc_fail = false; > + } > + } > if (!percpu->scribble) > percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); > > - if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) { > + if (!percpu->scribble || sp_alloc_fail) { > free_scratch_buffer(conf, percpu); > return -ENOMEM; > } > @@ -5788,7 +6032,7 @@ static struct r5conf *setup_conf(struct > else > conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; > max_disks = max(conf->raid_disks, conf->previous_raid_disks); > - conf->scribble_len = scribble_len(max_disks); > + conf->scribble_len = scribble_len(conf, max_disks); > > conf->disks = kzalloc(max_disks * sizeof(struct disk_info), > GFP_KERNEL); > @@ -6512,14 +6756,25 @@ static int check_stripe_cache(struct mdd > * stripe_heads first. > */ > struct r5conf *conf = mddev->private; > - if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 > + > + /* > + * stripe size is bigger than chunk size is possible, but not very > + * useful. We don't allow it at this point. > + */ > + if ((mddev->new_chunk_sectors << 9) < STRIPE_SIZE(conf)) { > + printk(KERN_WARNING > + "md/raid:%s: reshape: chunk size is smaller than stripe cache size\n", > + mdname(mddev)); > + return 0; > + } > + if (((mddev->chunk_sectors << 9) / STRIPE_SIZE(conf)) * 4 > > conf->max_nr_stripes || > - ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 > + ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE(conf)) * 4 > > conf->max_nr_stripes) { > printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. 
Needed %lu\n", > mdname(mddev), > ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) > - / STRIPE_SIZE)*4); > + / STRIPE_SIZE(conf))*4); > return 0; > } > return 1; > @@ -6827,6 +7082,7 @@ static void *raid45_takeover_raid0(struc > static void *raid5_takeover_raid1(struct mddev *mddev) > { > int chunksect; > + struct r5conf *conf = mddev->private; > > if (mddev->raid_disks != 2 || > mddev->degraded > 1) > @@ -6840,7 +7096,7 @@ static void *raid5_takeover_raid1(struct > while (chunksect && (mddev->array_sectors & (chunksect-1))) > chunksect >>= 1; > > - if ((chunksect<<9) < STRIPE_SIZE) > + if ((chunksect<<9) < STRIPE_SIZE(conf)) > /* array size does not allow a suitable chunk size */ > return ERR_PTR(-EINVAL); > > Index: linux/drivers/md/raid5.h > =================================================================== > --- linux.orig/drivers/md/raid5.h 2014-07-23 14:09:45.844570945 +0800 > +++ linux/drivers/md/raid5.h 2014-07-23 14:09:45.836571048 +0800 > @@ -225,14 +225,15 @@ struct stripe_head { > struct stripe_operations { > int target, target2; > enum sum_check_flags zero_sum_result; > + enum sum_check_flags *sum_results; > } ops; > struct r5dev { > /* rreq and rvec are used for the replacement device when > * writing data to both devices. > */ > struct bio req, rreq; > - struct bio_vec vec, rvec; > - struct page *page, *orig_page; > + struct bio_vec *vecs, *rvecs; > + struct page **pages, **orig_pages; > struct bio *toread, *read, *towrite, *written; > sector_t sector; /* sector of this page */ > unsigned long flags; > @@ -458,7 +459,7 @@ struct r5conf { > int recovery_disabled; > /* per cpu variables */ > struct raid5_percpu { > - struct page *spare_page; /* Used when checking P/Q in raid6 */ > + struct page **spare_pages; /* Used when checking P/Q in raid6 */ > void *scribble; /* space for constructing buffer > * lists and performing address > * conversions > @@ -487,6 +488,7 @@ struct r5conf { > int pool_size; /* number of disks in stripeheads in pool */ > spinlock_t device_lock; > struct disk_info *disks; > + int stripe_size_order; > > /* When taking over an array from a different personality, we store > * the new thread here until we fully activate the array.
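
As a rough illustration of the "link adjacent stripe_heads" idea mentioned
above, here is a small, userspace-only sketch. It is not md/raid5 code and
it is not part of the quoted patch: the struct layout, the batch_next field
and the helper names are invented purely for the example. With the current
4k stripe size each stripe_head covers PAGE_SIZE >> 9 = 8 sectors per
device, which is where the constant below comes from.

/*
 * Rough, userspace-only sketch of the "link adjacent stripe_heads and
 * process them as a unit" idea.  This is NOT md/raid5 code and not part
 * of the patch above: the struct layout, the batch_next field and the
 * helper names are invented purely for illustration.
 */
#include <stdio.h>

typedef unsigned long long sector_t;

#define STRIPE_SECTORS 8	/* one 4k page per device = 8 x 512B sectors */

struct stripe_head {
	sector_t sector;		/* first sector covered by this stripe */
	struct stripe_head *batch_next;	/* next stripe in the same batch, or NULL */
};

/*
 * Given stripe_heads sorted by sector, chain together runs whose sectors
 * are contiguous.  A real implementation would also have to check that the
 * stripes share the same geometry/parity disk and the same pending
 * operations before batching them.
 */
static void link_adjacent_stripes(struct stripe_head *sh, int nr)
{
	int i;

	for (i = 0; i < nr - 1; i++)
		sh[i].batch_next =
			(sh[i].sector + STRIPE_SECTORS == sh[i + 1].sector) ?
			&sh[i + 1] : NULL;
	sh[nr - 1].batch_next = NULL;
}

/*
 * Handle one batch.  In the kernel the point would be to issue one larger
 * bio and one larger parity computation per batch instead of one per 4k
 * stripe_head.
 */
static void handle_batch(const struct stripe_head *head)
{
	int count = 0;
	const struct stripe_head *sh;

	for (sh = head; sh; sh = sh->batch_next)
		count++;
	printf("batch at sector %llu covers %d stripe(s)\n",
	       head->sector, count);
}

int main(void)
{
	/* sectors 0, 8 and 16 are adjacent; 64 starts a separate batch */
	struct stripe_head stripes[] = {
		{ .sector = 0 }, { .sector = 8 }, { .sector = 16 }, { .sector = 64 },
	};
	int nr = sizeof(stripes) / sizeof(stripes[0]);
	int i, j;

	link_adjacent_stripes(stripes, nr);

	/* a stripe starts a batch if no other stripe links to it */
	for (i = 0; i < nr; i++) {
		int is_head = 1;

		for (j = 0; j < nr; j++)
			if (stripes[j].batch_next == &stripes[i])
				is_head = 0;
		if (is_head)
			handle_batch(&stripes[i]);
	}
	return 0;
}

If something along these lines worked out, handle_stripe() and ops_run_io()
could operate on the whole batch, which should recover most of the
big-stripe throughput win without adding a per-array stripe size knob.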