Re: md raid10 Oops on recent kernels

NeilBrown <neilb@xxxxxxx> · Wed, 15 Aug 2012 14:44:01 +1000

On Tue, 14 Aug 2012 22:56:59 +0400 Ivan Vasilyev <ivan.vasilyev@xxxxxxxxx>
wrote:

> 2012/8/14 NeilBrown <neilb@xxxxxxx>:
> > On Mon, 13 Aug 2012 16:49:26 +0400 Ivan Vasilyev <ivan.vasilyev@xxxxxxxxx>
> > wrote:
> >
> >>  ---[ end trace b86c49ca25a6cdb2 ]---
> >> ----------
> >
> > It looks like the ->merge_bvec_fn is bad - the code is jumping to
> > 0xffffffff00000001, which strongly suggests some function pointer is bad, and
> > merge_bvec_fn is the only one in that area of code.
> > However I cannot see how it could possibly get a bad value like that.
> >
> > There were changes to merge_bvec_fn handling in RAID10 in 3.4 which is when
> > you say the problem appeared.  However I cannot see how direct IO would be
> > affected any differently to normal IO.
> >
> > If I were to try to debug this I'd build a kernel and put a printk in
> > __bio_add_page in fs/bio.c just before calling q->merge_bvec_fn to print a
> > message if that value has the low bit set. (i.e. if (q->merge_bvec_fn & 1) ...).
> 
> Such printk is triggered right befire oops:
> 
> DEBUG q-> merge_bvec_fn=0xffffffffa011a1c3 queue_flags=0x40
> queuedata=0xffff880058bf1520
> backing_dev_info.congested_fn=0xffffffffa011d39a
> BUG: unable to handle kernel paging request at ffffffff00000001
> 
> although address is different (so this means the bug does not occur
> exactly on merge_bvec_fn() call?)
> 
> Checked again - this problem affects only directIO:
> 
> dd if=/dev/md/rtest_a count=10000 of=/dev/null
>  => ok
> dd if=/dev/md/rtest_a iflag=direct count=10000 of=/dev/null
>  => oops (first since boot)
> 

Hmmm.. not what I expected.  
I found it was indeed easy to reproduce and after being sure it was
impossible for half the afternoon I've found the problem.
The following patch fixes it.  I'm not sure yet if that it was what I'll
submit upstream.
The problem is that "struct r10bio" isn't by itself big enough.  It is
usually allocated with extra memory at the end.  So when declared on the
stack, the same is needed.

Thanks,
NeilBrown

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 93fe561..12565c3 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -659,7 +659,10 @@ static int raid10_mergeable_bvec(struct request_queue *q,
 		max = biovec->bv_len;
 
 	if (mddev->merge_check_needed) {
-		struct r10bio r10_bio;
+		struct {
+			struct r10bio r10_bio;
+			struct r10dev devs[conf->copies];
+		} x;
 		int s;
 		if (conf->reshape_progress != MaxSector) {
 			/* Cannot give any guidance during reshape */
@@ -667,18 +670,18 @@ static int raid10_mergeable_bvec(struct request_queue *q,
 				return biovec->bv_len;
 			return 0;
 		}
-		r10_bio.sector = sector;
-		raid10_find_phys(conf, &r10_bio);
+		x.r10_bio.sector = sector;
+		raid10_find_phys(conf, &x.r10_bio);
 		rcu_read_lock();
 		for (s = 0; s < conf->copies; s++) {
-			int disk = r10_bio.devs[s].devnum;
+			int disk = x.r10_bio.devs[s].devnum;
 			struct md_rdev *rdev = rcu_dereference(
 				conf->mirrors[disk].rdev);
 			if (rdev && !test_bit(Faulty, &rdev->flags)) {
 				struct request_queue *q =
 					bdev_get_queue(rdev->bdev);
 				if (q->merge_bvec_fn) {
-					bvm->bi_sector = r10_bio.devs[s].addr
+					bvm->bi_sector = x.r10_bio.devs[s].addr
 						+ rdev->data_offset;
 					bvm->bi_bdev = rdev->bdev;
 					max = min(max, q->merge_bvec_fn(
@@ -690,7 +693,7 @@ static int raid10_mergeable_bvec(struct request_queue *q,
 				struct request_queue *q =
 					bdev_get_queue(rdev->bdev);
 				if (q->merge_bvec_fn) {
-					bvm->bi_sector = r10_bio.devs[s].addr
+					bvm->bi_sector = x.r10_bio.devs[s].addr
 						+ rdev->data_offset;
 					bvm->bi_bdev = rdev->bdev;
 					max = min(max, q->merge_bvec_fn(
@@ -4434,14 +4437,17 @@ static int handle_reshape_read_error(struct mddev *mddev,
 {
 	/* Use sync reads to get the blocks from somewhere else */
 	int sectors = r10_bio->sectors;
-	struct r10bio r10b;
 	struct r10conf *conf = mddev->private;
+	struct {
+		struct r10bio r10b;
+		struct r10dev devs[conf->copies];
+	} x;
 	int slot = 0;
 	int idx = 0;
 	struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec;
 
-	r10b.sector = r10_bio->sector;
-	__raid10_find_phys(&conf->prev, &r10b);
+	x.r10b.sector = r10_bio->sector;
+	__raid10_find_phys(&conf->prev, &x.r10b);
 
 	while (sectors) {
 		int s = sectors;
@@ -4452,7 +4458,7 @@ static int handle_reshape_read_error(struct mddev *mddev,
 			s = PAGE_SIZE >> 9;
 
 		while (!success) {
-			int d = r10b.devs[slot].devnum;
+			int d = x.r10b.devs[slot].devnum;
 			struct md_rdev *rdev = conf->mirrors[d].rdev;
 			sector_t addr;
 			if (rdev == NULL ||
@@ -4460,7 +4466,7 @@ static int handle_reshape_read_error(struct mddev *mddev,
 			    !test_bit(In_sync, &rdev->flags))
 				goto failed;
 
-			addr = r10b.devs[slot].addr + idx * PAGE_SIZE;
+			addr = x.r10b.devs[slot].addr + idx * PAGE_SIZE;
 			success = sync_page_io(rdev,
 					       addr,
 					       s << 9,
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 007c2c6..1054cf6 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -110,7 +110,7 @@ struct r10bio {
 	 * We choose the number when they are allocated.
 	 * We sometimes need an extra bio to write to the replacement.
 	 */
-	struct {
+	struct r10dev {
 		struct bio	*bio;
 		union {
 			struct bio	*repl_bio; /* used for resync and
Attachment:
signature.asc

Description: PGP signature