On Monday May 18, dan.j.williams@xxxxxxxxx wrote: > Hang some memory off of each stripe_head which can be used for storing > the buffer lists used in parity calculations. Include space for dma > address conversions and pass that to async_tx via the > async_submit_ctl.scribble pointer. > > [ Impact: move memory pressure from stack to heap ] I've finally had a look at this and I cannot say that I like it. We don't really need one scribble-buffer per stripe_head. And in fact, that isn't even enough because you find you need a mutex to avoid multiple-use. We really want one scribble-buffer per thread, or per CPU, or something like that. You could possibly handle it a bit like ->spare_page, though we cope with that being NULL sometimes, and you might not be able to do that with scribble-buffer. How do the async-raid6 patches cope with possible multiple users of ->spare_page now that the computations are async and so possibly in parallel? Maybe a little mempool would be best?.... though given that in most cases, the stack solution is really quite adequate, it would be good to make sure the replacement isn't too heavy-weight.... I'm not sure what would be best, but I really don't like the current proposal. 
NeilBrown > > Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx> > --- > drivers/md/raid5.c | 61 ++++++++++++++++++++++++++++++++++++++++++---------- > drivers/md/raid5.h | 5 ++++ > 2 files changed, 54 insertions(+), 12 deletions(-) > > diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c > index e1920f2..0e456a6 100644 > --- a/drivers/md/raid5.c > +++ b/drivers/md/raid5.c > @@ -275,6 +275,9 @@ static void shrink_buffers(struct stripe_head *sh, int num) > struct page *p; > int i; > > + kfree(sh->scribble); > + sh->scribble = NULL; > + > for (i=0; i<num ; i++) { > p = sh->dev[i].page; > if (!p) > @@ -284,10 +287,26 @@ static void shrink_buffers(struct stripe_head *sh, int num) > } > } > > +static size_t scribble_len(int num) > +{ > + size_t len; > + > + /* return enough space for an array of page pointers and dma > + * addresses for the ddf raid6 layout > + */ > + len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2); > + > + return len; > +} > + > static int grow_buffers(struct stripe_head *sh, int num) > { > int i; > > + sh->scribble = kmalloc(scribble_len(num), GFP_KERNEL); > + if (!sh->scribble) > + return 1; > + > for (i=0; i<num; i++) { > struct page *page; > > @@ -641,11 +660,16 @@ static void ops_complete_compute5(void *stripe_head_ref) > release_stripe(sh); > } > > +/* return a pointer to the address conversion region of the scribble buffer */ > +static addr_conv_t *sh_to_addr_conv(struct stripe_head *sh) > +{ > + return sh->scribble + sizeof(struct page *) * (sh->disks + 2); > +} > + > static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh) > { > - /* kernel stack size limits the total number of disks */ > int disks = sh->disks; > - struct page *xor_srcs[disks]; > + struct page **xor_srcs = sh->scribble; > int target = sh->ops.target; > struct r5dev *tgt = &sh->dev[target]; > struct page *xor_dest = tgt->page; > @@ -665,7 +689,7 @@ static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh) > 
atomic_inc(&sh->count); > > init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL, > - ops_complete_compute5, sh, NULL); > + ops_complete_compute5, sh, sh_to_addr_conv(sh)); > if (unlikely(count == 1)) > tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); > else > @@ -685,9 +709,8 @@ static void ops_complete_prexor(void *stripe_head_ref) > static struct dma_async_tx_descriptor * > ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) > { > - /* kernel stack size limits the total number of disks */ > int disks = sh->disks; > - struct page *xor_srcs[disks]; > + struct page **xor_srcs = sh->scribble; > int count = 0, pd_idx = sh->pd_idx, i; > struct async_submit_ctl submit; > > @@ -705,7 +728,7 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) > } > > init_async_submit(&submit, ASYNC_TX_XOR_DROP_DST, tx, > - ops_complete_prexor, sh, NULL); > + ops_complete_prexor, sh, sh_to_addr_conv(sh)); > tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); > > return tx; > @@ -776,9 +799,8 @@ static void ops_complete_postxor(void *stripe_head_ref) > static void > ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) > { > - /* kernel stack size limits the total number of disks */ > int disks = sh->disks; > - struct page *xor_srcs[disks]; > + struct page **xor_srcs = sh->scribble; > struct async_submit_ctl submit; > int count = 0, pd_idx = sh->pd_idx, i; > struct page *xor_dest; > @@ -818,7 +840,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) > > atomic_inc(&sh->count); > > - init_async_submit(&submit, flags, tx, ops_complete_postxor, sh, NULL); > + init_async_submit(&submit, flags, tx, ops_complete_postxor, sh, > + sh_to_addr_conv(sh)); > if (unlikely(count == 1)) > tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); > else > @@ -839,9 +862,8 @@ static void ops_complete_check(void *stripe_head_ref) > > static void 
ops_run_check(struct stripe_head *sh) > { > - /* kernel stack size limits the total number of disks */ > int disks = sh->disks; > - struct page *xor_srcs[disks]; > + struct page **xor_srcs = sh->scribble; > struct dma_async_tx_descriptor *tx; > struct async_submit_ctl submit; > > @@ -857,7 +879,7 @@ static void ops_run_check(struct stripe_head *sh) > xor_srcs[count++] = dev->page; > } > > - init_async_submit(&submit, 0, NULL, NULL, NULL, NULL); > + init_async_submit(&submit, 0, NULL, NULL, NULL, sh_to_addr_conv(sh)); > tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, > &sh->ops.zero_sum_result, &submit); > > @@ -871,6 +893,7 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) > int overlap_clear = 0, i, disks = sh->disks; > struct dma_async_tx_descriptor *tx = NULL; > > + mutex_lock(&sh->scribble_lock); > if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { > ops_run_biofill(sh); > overlap_clear++; > @@ -903,6 +926,7 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) > if (test_and_clear_bit(R5_Overlap, &dev->flags)) > wake_up(&sh->raid_conf->wait_for_overlap); > } > + mutex_unlock(&sh->scribble_lock); > } > > static int grow_one_stripe(raid5_conf_t *conf) > @@ -914,6 +938,7 @@ static int grow_one_stripe(raid5_conf_t *conf) > memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev)); > sh->raid_conf = conf; > spin_lock_init(&sh->lock); > + mutex_init(&sh->scribble_lock); > > if (grow_buffers(sh, conf->raid_disks)) { > shrink_buffers(sh, conf->raid_disks); > @@ -1007,6 +1032,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) > > nsh->raid_conf = conf; > spin_lock_init(&nsh->lock); > + mutex_init(&nsh->scribble_lock); > > list_add(&nsh->lru, &newstripes); > } > @@ -1038,6 +1064,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) > nsh->dev[i].page = osh->dev[i].page; > for( ; i<newsize; i++) > nsh->dev[i].page = NULL; > + nsh->scribble = osh->scribble; > 
kmem_cache_free(conf->slab_cache, osh); > } > kmem_cache_destroy(conf->slab_cache); > @@ -1058,8 +1085,18 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) > > /* Step 4, return new stripes to service */ > while(!list_empty(&newstripes)) { > + void *scribble; > + > nsh = list_entry(newstripes.next, struct stripe_head, lru); > list_del_init(&nsh->lru); > + > + scribble = kmalloc(scribble_len(newsize), GFP_NOIO); > + if (scribble) { > + kfree(nsh->scribble); > + nsh->scribble = scribble; > + } else > + err = -ENOMEM; > + > for (i=conf->raid_disks; i < newsize; i++) > if (nsh->dev[i].page == NULL) { > struct page *p = alloc_page(GFP_NOIO); > diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h > index 52ba999..6ab0ccd 100644 > --- a/drivers/md/raid5.h > +++ b/drivers/md/raid5.h > @@ -211,6 +211,11 @@ struct stripe_head { > int disks; /* disks in stripe */ > enum check_states check_state; > enum reconstruct_states reconstruct_state; > + void *scribble; /* space for constructing buffer > + * lists and performing address > + * conversions > + */ > + struct mutex scribble_lock; /* no concurrent scribbling */ > /* stripe_operations > * @target - STRIPE_OP_COMPUTE_BLK target > */ -- To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html