> From: Yuri Tikhonov [mailto:yur@xxxxxxxxxxx] > Hello, > > I tested the h/w accelerated RAID-5 using the kernel with PAGE_SIZE set to > 64KB and found the bonnie++ application hangs up during the "Re-writing" > test. I made some investigations and discovered that the hang-up occurs > because one of the mpage_end_io_read() calls is missing (these are the > callbacks initiated from the ops_complete_biofill() function). > > The fact is that my low-level ADMA driver (the ppc440spe one) successfully > initiated the ops_complete_biofill() callback but the ops_complete_biofill() > function itself skipped calling the bi_end_io() handler of the completed bio > (current dev->read) because during processing of this (current dev->read) bio > some other request had come to the sh (current dev_q->toread). Thus > ops_complete_biofill() scheduled another biofill operation which, as a > result, overwrote the unacknowledged bio (dev->read in ops_run_biofill()), > and so we lost the previous dev->read bio completely. > > Here is a patch that solves this problem. Perhaps this might be implemented > in some more elegant and effective way. What are your thoughts regarding > this? > > Regards, Yuri > > diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c > index 08b4893..7abc96b 100644 > --- a/drivers/md/raid5.c > +++ b/drivers/md/raid5.c > @@ -838,11 +838,24 @@ static void ops_complete_biofill(void *stripe_head_ref) > /* acknowledge completion of a biofill operation */ > /* and check if we need to reply to a read request > */ > - if (test_bit(R5_Wantfill, &dev_q->flags) && !dev_q->toread) { > + if (test_bit(R5_Wantfill, &dev_q->flags)) { > struct bio *rbi, *rbi2; > struct r5dev *dev = &sh->dev[i]; > > - clear_bit(R5_Wantfill, &dev_q->flags); > + /* There is a chance that another fill operation > + * had been scheduled for this dev while we > + * processed sh. In this case do one of the following > + * alternatives: > + * - if there is no active completed biofill for the dev > + * then go to the next dev leaving Wantfill set; > + * - if there is active completed biofill for the dev > + * then ack it but leave Wantfill set. > + */ > + if (dev_q->toread && !dev->read) > + continue; > + > + if (!dev_q->toread) > + clear_bit(R5_Wantfill, &dev_q->flags); > > /* The access to dev->read is outside of the > * spin_lock_irq(&conf->device_lock), but is protected This still looks racy... I think the complete fix is to make the R5_Wantfill and dev_q->toread accesses atomic. Please test the following patch (also attached) and let me know if it fixes what you are seeing: Applies on top of git://lost.foo-projects.org/~dwillia2/git/iop md-for-linus --- raid5: fix ops_complete_biofill race in the asynchronous offload case Protect against dev_q->toread toggling while testing and clearing the R5_Wantfill bit. This change prevents all asynchronous completions (tasklets) from running during handle_stripe.
--- drivers/md/raid5.c | 16 +++++++++------- 1 files changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2f9022d..91c14c6 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -824,6 +824,7 @@ static void ops_complete_biofill(void *stripe_head_ref) (unsigned long long)sh->sector); /* clear completed biofills */ + spin_lock(&sq->lock); for (i = sh->disks; i--; ) { struct r5dev *dev = &sh->dev[i]; struct r5_queue_dev *dev_q = &sq->dev[i]; @@ -861,6 +862,7 @@ static void ops_complete_biofill(void *stripe_head_ref) } clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack); clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending); + spin_unlock(&sq->lock); return_io(return_bi); @@ -2279,7 +2281,7 @@ static int add_queue_bio(struct stripe_queue *sq, struct bio *bi, int dd_idx, (unsigned long long)bi->bi_sector, (unsigned long long)sq->sector); - spin_lock(&sq->lock); + spin_lock_bh(&sq->lock); spin_lock_irq(&conf->device_lock); sh = sq->sh; if (forwrite) { @@ -2306,7 +2308,7 @@ static int add_queue_bio(struct stripe_queue *sq, struct bio *bi, int dd_idx, *bip = bi; bi->bi_phys_segments ++; spin_unlock_irq(&conf->device_lock); - spin_unlock(&sq->lock); + spin_unlock_bh(&sq->lock); pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", (unsigned long long)bi->bi_sector, @@ -2339,7 +2341,7 @@ static int add_queue_bio(struct stripe_queue *sq, struct bio *bi, int dd_idx, overlap: set_bit(R5_Overlap, &sh->dev[dd_idx].flags); spin_unlock_irq(&conf->device_lock); - spin_unlock(&sq->lock); + spin_unlock_bh(&sq->lock); return 0; } @@ -3127,7 +3129,7 @@ static void handle_stripe5(struct stripe_head *sh) atomic_read(&sh->count), sq->pd_idx, sh->ops.pending, sh->ops.ack, sh->ops.complete); - spin_lock(&sq->lock); + spin_lock_bh(&sq->lock); clear_bit(STRIPE_HANDLE, &sh->state); s.syncing = test_bit(STRIPE_SYNCING, &sh->state); @@ -3370,7 +3372,7 @@ static void handle_stripe5(struct stripe_head *sh) if (sh->ops.count) pending = get_stripe_work(sh); - spin_unlock(&sq->lock); + spin_unlock_bh(&sq->lock); if (pending) raid5_run_ops(sh, pending); @@ -3397,7 +3399,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) atomic_read(&sh->count), pd_idx, r6s.qd_idx); memset(&s, 0, sizeof(s)); - spin_lock(&sq->lock); + spin_lock_bh(&sq->lock); clear_bit(STRIPE_HANDLE, &sh->state); s.syncing = test_bit(STRIPE_SYNCING, &sh->state); @@ -3576,7 +3578,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) if (s.expanding && s.locked == 0) handle_stripe_expansion(conf, sh, &r6s); - spin_unlock(&sq->lock); + spin_unlock_bh(&sq->lock); return_io(return_bi);
Attachment:
md-accel-fix-ops-complete-biofill-race.patch
Description: md-accel-fix-ops-complete-biofill-race.patch