We have a bug report about a NULL dereference in the md raid10 code. The
operations to reproduce it are:
- create a raid10 device
- add a spare disk
- disconnect one of the online disks from the raid10 device
- wait for the recovery to happen on the spare disk
- remove the spare disk while it is recovering

Sometimes a kernel oops caused by a NULL dereference in the md raid10
module can then be observed, and the crash tool reports the following
information:

> (gdb) list *(raid10d+0xad6)
> 0x5de6 is in raid10d (../drivers/md/raid10.c:2300).
> 2295		 * the latter is free to free wbio2.
> 2296		 */
> 2297		if (wbio2 && !wbio2->bi_end_io)
> 2298			wbio2 = NULL;
> 2299		if (wbio->bi_end_io) {
> 2300			atomic_inc(&conf->mirrors[d].rdev->nr_pending);
> 2301			md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
> 2302			generic_make_request(wbio);
> 2303		}
> 2304		if (wbio2) {

At line 2300, conf->mirrors[d].rdev is NULL, so dereferencing
conf->mirrors[d].rdev->nr_pending triggers a NULL pointer dereference.

After reading previous raid10 patches, I find that conf->mirrors[d].rdev
can become NULL at any point unless one of the following holds:
- a counted reference is held,
- ->reconfig_mutex is held, or
- rcu_read_lock() is held.

In the context of the kernel thread raid10d, none of the above conditions
holds, therefore RCU protection is necessary for accesses to the
conf->mirrors[].rdev and conf->mirrors[].replacement pointers (a
simplified sketch of this access pattern is appended after the patch).

This patch is an effort to add RCU protection to the raid10d() code path,
which includes the following sub-routines:
- handle_write_completed()
- reshape_request_write()
- sync_request_write()
- recovery_request_write()

Because the reported issue cannot always be reproduced, I am still working
on verification. This RFC patch is sent out for early feedback in case
there is something I missed in the fix. Thanks in advance for the review
and comments.

Signed-off-by: Coly Li <colyli@xxxxxxx>
Reported-by: Tomasz Majchrzak <tomasz.majchrzak@xxxxxxxxx>
---
 drivers/md/raid10.c | 99 +++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 80 insertions(+), 19 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 374df5796649..fe9ce25ffc08 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2041,13 +2041,25 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 
 		tpages = get_resync_pages(tbio)->pages;
 		d = r10_bio->devs[i].devnum;
-		rdev = conf->mirrors[d].rdev;
+		rcu_read_lock();
+		rdev = rcu_dereference(conf->mirrors[d].rdev);
+		if (rdev == NULL) {
+			rcu_read_unlock();
+			continue;
+		}
+
 		if (!r10_bio->devs[i].bio->bi_status) {
 			/* We know that the bi_io_vec layout is the same for
 			 * both 'first' and 'i', so we just compare them.
 			 * All vec entries are PAGE_SIZE;
 			 */
 			int sectors = r10_bio->sectors;
+
+			/*
+			 * rdev is not referenced here, don't need rcu lock
+			 * any more.
+			 */
+			rcu_read_unlock();
 			for (j = 0; j < vcnt; j++) {
 				int len = PAGE_SIZE;
 				if (sectors < (len / 512))
@@ -2067,8 +2079,10 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 		} else if (test_bit(FailFast, &rdev->flags)) {
 			/* Just give up on this device */
 			md_error(rdev->mddev, rdev);
+			rcu_read_unlock();
 			continue;
 		}
+		rcu_read_unlock();
 		/* Ok, we need to write this bio, either to correct an
 		 * inconsistency or to correct an unreadable block.
 		 * First we need to fixup bv_offset, bv_len and
@@ -2087,14 +2101,21 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 
 		bio_copy_data(tbio, fbio);
 
-		atomic_inc(&conf->mirrors[d].rdev->nr_pending);
+		rcu_read_lock();
+		rdev = rcu_dereference(conf->mirrors[d].rdev);
+		if (rdev == NULL) {
+			rcu_read_unlock();
+			continue;
+		}
+		atomic_inc(&rdev->nr_pending);
 		atomic_inc(&r10_bio->remaining);
-		md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
+		md_sync_acct(rdev->bdev, bio_sectors(tbio));
 
-		if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
+		if (test_bit(FailFast, &rdev->flags))
 			tbio->bi_opf |= MD_FAILFAST;
-		tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset;
-		bio_set_dev(tbio, conf->mirrors[d].rdev->bdev);
+		tbio->bi_iter.bi_sector += rdev->data_offset;
+		bio_set_dev(tbio, rdev->bdev);
+		rcu_read_unlock();
 		generic_make_request(tbio);
 	}
 
@@ -2222,6 +2243,7 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 	struct r10conf *conf = mddev->private;
 	int d;
 	struct bio *wbio, *wbio2;
+	struct md_rdev *rdev, *replacement;
 
 	if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
 		fix_recovery_read_error(r10_bio);
@@ -2236,6 +2258,8 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 	d = r10_bio->devs[1].devnum;
 	wbio = r10_bio->devs[1].bio;
 	wbio2 = r10_bio->devs[1].repl_bio;
+
+
 	/* Need to test wbio2->bi_end_io before we call
 	 * generic_make_request as if the former is NULL,
 	 * the latter is free to free wbio2.
@@ -2243,15 +2267,25 @@
 	if (wbio2 && !wbio2->bi_end_io)
 		wbio2 = NULL;
 	if (wbio->bi_end_io) {
-		atomic_inc(&conf->mirrors[d].rdev->nr_pending);
-		md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
-		generic_make_request(wbio);
+		rcu_read_lock();
+		rdev = rcu_dereference(conf->mirrors[d].rdev);
+		if (rdev) {
+			atomic_inc(&rdev->nr_pending);
+			md_sync_acct(rdev->bdev, bio_sectors(wbio));
+			generic_make_request(wbio);
+		}
+		rcu_read_unlock();
 	}
 	if (wbio2) {
-		atomic_inc(&conf->mirrors[d].replacement->nr_pending);
-		md_sync_acct(conf->mirrors[d].replacement->bdev,
+		rcu_read_lock();
+		replacement = rcu_dereference(conf->mirrors[d].replacement);
+		if (replacement) {
+			atomic_inc(&replacement->nr_pending);
+			md_sync_acct(replacement->bdev,
 			     bio_sectors(wbio2));
-		generic_make_request(wbio2);
+			generic_make_request(wbio2);
+		}
+		rcu_read_unlock();
 	}
 }
 
@@ -2620,9 +2654,17 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 	    test_bit(R10BIO_IsRecover, &r10_bio->state)) {
 		for (m = 0; m < conf->copies; m++) {
 			int dev = r10_bio->devs[m].devnum;
-			rdev = conf->mirrors[dev].rdev;
-			if (r10_bio->devs[m].bio == NULL)
+
+			rcu_read_lock();
+			rdev = rcu_dereference(conf->mirrors[dev].rdev);
+			if (rdev == NULL) {
+				rcu_read_unlock();
+				continue;
+			}
+			if (r10_bio->devs[m].bio == NULL) {
+				rcu_read_unlock();
 				continue;
+			}
 			if (!r10_bio->devs[m].bio->bi_status) {
 				rdev_clear_badblocks(
 					rdev,
@@ -2635,9 +2677,16 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 					    r10_bio->sectors, 0))
 					md_error(conf->mddev, rdev);
 			}
-			rdev = conf->mirrors[dev].replacement;
-			if (r10_bio->devs[m].repl_bio == NULL)
+
+			rdev = rcu_dereference(conf->mirrors[dev].replacement);
+			if (rdev == NULL) {
+				rcu_read_unlock();
 				continue;
+			}
+			if (r10_bio->devs[m].repl_bio == NULL) {
+				rcu_read_unlock();
+				continue;
+			}
 
 			if (!r10_bio->devs[m].repl_bio->bi_status) {
 				rdev_clear_badblocks(
@@ -2651,6 +2700,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 					    r10_bio->sectors, 0))
 					md_error(conf->mddev, rdev);
 			}
+			rcu_read_unlock();
 		}
 		put_buf(r10_bio);
 	} else {
@@ -2658,7 +2708,13 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 		for (m = 0; m < conf->copies; m++) {
 			int dev = r10_bio->devs[m].devnum;
 			struct bio *bio = r10_bio->devs[m].bio;
-			rdev = conf->mirrors[dev].rdev;
+
+			rcu_read_lock();
+			rdev = rcu_dereference(conf->mirrors[dev].rdev);
+			if (rdev == NULL) {
+				rcu_read_unlock();
+				continue;
+			}
 			if (bio == IO_MADE_GOOD) {
 				rdev_clear_badblocks(
 					rdev,
@@ -2675,14 +2731,19 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 				rdev_dec_pending(rdev, conf->mddev);
 			}
 			bio = r10_bio->devs[m].repl_bio;
-			rdev = conf->mirrors[dev].replacement;
-			if (rdev && bio == IO_MADE_GOOD) {
+			rdev = rcu_dereference(conf->mirrors[dev].replacement);
+			if (rdev == NULL) {
+				rcu_read_unlock();
+				continue;
+			}
+			if (bio == IO_MADE_GOOD) {
 				rdev_clear_badblocks(
 					rdev,
 					r10_bio->devs[m].addr,
 					r10_bio->sectors, 0);
 				rdev_dec_pending(rdev, conf->mddev);
 			}
+			rcu_read_unlock();
 		}
 		if (fail) {
 			spin_lock_irq(&conf->device_lock);
-- 
2.13.6
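
For clarity, the locking rule that every hunk above follows can be
summarized by the short, self-contained sketch below. The structures are
simplified stand-ins for illustration only (not the real struct md_rdev /
struct raid10_info), so please read it as a sketch of the pattern rather
than as md code:

	#include <linux/rcupdate.h>
	#include <linux/atomic.h>

	struct demo_rdev {
		atomic_t nr_pending;
	};

	struct demo_mirror {
		struct demo_rdev __rcu *rdev;	/* may become NULL at any time */
	};

	/*
	 * Return the rdev with nr_pending elevated, or NULL if the slot
	 * is empty. The caller later drops the reference (the real md
	 * code uses rdev_dec_pending() for that).
	 */
	static struct demo_rdev *demo_get_rdev(struct demo_mirror *mirror)
	{
		struct demo_rdev *rdev;

		rcu_read_lock();
		rdev = rcu_dereference(mirror->rdev);
		if (rdev)
			/* Take the counted reference while still inside
			 * the RCU read-side critical section, so a
			 * concurrent removal cannot free the rdev under us.
			 */
			atomic_inc(&rdev->nr_pending);
		rcu_read_unlock();

		return rdev;
	}

In the patch itself this rule is open-coded in each hunk: paths that go on
to submit I/O take nr_pending inside the read-side section (so the rdev
stays pinned after rcu_read_unlock()), while paths that only look at the
rdev drop the RCU read lock without taking a reference.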