RE: md raid acceleration and the async_tx api

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



> From: Yuri Tikhonov [mailto:yur@xxxxxxxxxxx]
>  Hello,
> 
>  I tested the h/w accelerated RAID-5 using the kernel with PAGE_SIZE set
> to 64KB and found that the bonnie++ application hangs up during the
> "Re-writing" test. I made some investigations and discovered that the
> hang-up occurs because one of the mpage_end_io_read() calls is missing
> (these are the callbacks initiated from the ops_complete_biofill()
> function).
> 
>  The fact is that my low-level ADMA driver (the ppc440spe one)
> successfully initiated the ops_complete_biofill() callback, but the
> ops_complete_biofill() function itself skipped calling the bi_end_io()
> handler of the completed bio (current dev->read) because, during
> processing of this (current dev->read) bio, some other request had come
> to the sh (current dev_q->toread). Thus ops_complete_biofill() scheduled
> another biofill operation which, as a result, overwrote the
> unacknowledged bio (dev->read in ops_run_biofill()), and so we lost the
> previous dev->read bio completely.
> 
>  Here is a patch that solves this problem. Perhaps this might be
> implemented in some more elegant and effective way. What are your
> thoughts regarding this?
> 
>  Regards, Yuri
> 
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 08b4893..7abc96b 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -838,11 +838,24 @@ static void ops_complete_biofill(void *stripe_head_ref)
>  		/* acknowledge completion of a biofill operation */
>  		/* and check if we need to reply to a read request
>  		 */
> -		if (test_bit(R5_Wantfill, &dev_q->flags) && !dev_q->toread) {
> +		if (test_bit(R5_Wantfill, &dev_q->flags)) {
>  			struct bio *rbi, *rbi2;
>  			struct r5dev *dev = &sh->dev[i];
> 
> -			clear_bit(R5_Wantfill, &dev_q->flags);
> +			/* There is a chance that another fill operation
> +			 * had been scheduled for this dev while we
> +			 * processed sh. In this case do one of the following
> +			 * alternatives:
> +			 * - if there is no active completed biofill for the dev
> +			 *   then go to the next dev leaving Wantfill set;
> +			 * - if there is active completed biofill for the dev
> +			 *   then ack it but leave Wantfill set.
> +			 */
> +			if (dev_q->toread && !dev->read)
> +				continue;
> +
> +			if (!dev_q->toread)
> +				clear_bit(R5_Wantfill, &dev_q->flags);
> 
>  			/* The access to dev->read is outside of the
>  			 * spin_lock_irq(&conf->device_lock), but is protected

This still looks racy...  I think the complete fix is to make the
R5_Wantfill and dev_q->toread accesses atomic.  Please test the
following patch (also attached) and let me know if it fixes what you are
seeing:

Applies on top of git://lost.foo-projects.org/~dwillia2/git/iop
md-for-linus

---
raid5: fix ops_complete_biofill race in the asynchronous offload case

Protect against dev_q->toread toggling while testing and clearing the
R5_Wantfill bit.  This change prevents all asynchronous completions
(tasklets) from running during handle_stripe.
---

 drivers/md/raid5.c |   16 +++++++++-------
 1 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2f9022d..91c14c6 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -824,6 +824,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
 		(unsigned long long)sh->sector);
 
 	/* clear completed biofills */
+	spin_lock(&sq->lock);
 	for (i = sh->disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
 		struct r5_queue_dev *dev_q = &sq->dev[i];
@@ -861,6 +862,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
 	}
 	clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack);
 	clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending);
+	spin_unlock(&sq->lock);
 
 	return_io(return_bi);
 
@@ -2279,7 +2281,7 @@ static int add_queue_bio(struct stripe_queue *sq, struct bio *bi, int dd_idx,
 		(unsigned long long)bi->bi_sector,
 		(unsigned long long)sq->sector);
-	spin_lock(&sq->lock);
+	spin_lock_bh(&sq->lock);
 	spin_lock_irq(&conf->device_lock);
 	sh = sq->sh;
 	if (forwrite) {
@@ -2306,7 +2308,7 @@ static int add_queue_bio(struct stripe_queue *sq, struct bio *bi, int dd_idx,
 	*bip = bi;
 	bi->bi_phys_segments ++;
 	spin_unlock_irq(&conf->device_lock);
-	spin_unlock(&sq->lock);
+	spin_unlock_bh(&sq->lock);
 
 	pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
 		(unsigned long long)bi->bi_sector,
@@ -2339,7 +2341,7 @@ static int add_queue_bio(struct stripe_queue *sq, struct bio *bi, int dd_idx,
  overlap:
 	set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
 	spin_unlock_irq(&conf->device_lock);
-	spin_unlock(&sq->lock);
+	spin_unlock_bh(&sq->lock);
 	return 0;
 }
 
@@ -3127,7 +3129,7 @@ static void handle_stripe5(struct stripe_head *sh)
 		atomic_read(&sh->count), sq->pd_idx,
 		sh->ops.pending, sh->ops.ack, sh->ops.complete);
 
-	spin_lock(&sq->lock);
+	spin_lock_bh(&sq->lock);
 	clear_bit(STRIPE_HANDLE, &sh->state);
 
 	s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
@@ -3370,7 +3372,7 @@ static void handle_stripe5(struct stripe_head *sh)
 	if (sh->ops.count)
 		pending = get_stripe_work(sh);
 
-	spin_unlock(&sq->lock);
+	spin_unlock_bh(&sq->lock);
 
 	if (pending)
 		raid5_run_ops(sh, pending);
@@ -3397,7 +3399,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 	       atomic_read(&sh->count), pd_idx, r6s.qd_idx);
 	memset(&s, 0, sizeof(s));
 
-	spin_lock(&sq->lock);
+	spin_lock_bh(&sq->lock);
 	clear_bit(STRIPE_HANDLE, &sh->state);
 
 	s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
@@ -3576,7 +3578,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 	if (s.expanding && s.locked == 0)
 		handle_stripe_expansion(conf, sh, &r6s);
 
-	spin_unlock(&sq->lock);
+	spin_unlock_bh(&sq->lock);
 
 	return_io(return_bi);


Attachment: md-accel-fix-ops-complete-biofill-race.patch
Description: md-accel-fix-ops-complete-biofill-race.patch


[Index of Archives]     [Linux RAID Wiki]     [ATA RAID]     [Linux SCSI Target Infrastructure]     [Linux Block]     [Linux IDE]     [Linux SCSI]     [Linux Hams]     [Device Mapper]     [Device Mapper Cryptographics]     [Kernel]     [Linux Admin]     [Linux Net]     [GFS]     [RPM]     [git]     [Yosemite Forum]


  Powered by Linux