If a RAID10 rdev that is undergoing recovery is marked 'faulty', the rdev
can be taken out of the array in spite of outstanding IOs, leading to a
kernel panic.

There are two issues here -
1. The ref count (nr_pending) increment for sync/recovery IO is taken so
   late that it leaves wide windows for a concurrent rdev removal.
2. The raid10 sync thread continues to submit recovery IOs to faulty
   devices; these only get rejected at a later stage by the management
   thread (raid10d).

Note - rd denotes the rdev we are reading from, and wr the one we are
writing to.

Sync Thread                                Management Thread

sync_request
  ++rd.nr_pending
  bi_end_io = end_sync_read
  generic_make_request           ------->  recovery_request_write
      |                          |           wr.nr_pending++
      |                          |           bi_end_io = end_sync_write
      V                          |           generic_make_request
end_sync_read  ------------------                |
  --rd.nr_pending                                |
  reschedule_retry for write                     |
                                                 v
                                           end_sync_write
                                             --wr.nr_pending

So a set-faulty and remove on the recovery rdev between sync_request and
recovery_request_write is allowed and will lead to a panic.

The fix is -
1. Increment wr.nr_pending immediately after selecting a good target.
   Of course, the matching decrements are added to the error paths in
   sync_request and end_sync_read.
2. Don't submit recovery IOs to faulty targets.

Signed-off-by: Aniket Kulkarni <aniket@xxxxxxxxxxx>
---
 drivers/md/raid10.c |   46 +++++++++++++++++++++++++++++++++++++++++++---
 1 files changed, 43 insertions(+), 3 deletions(-)
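
To make the window concrete, below is a small userspace model of the ordering
problem. This is illustration only, not kernel code; the names fake_rdev,
try_remove and submit_recovery_write are invented for the example. It just
shows why taking the reference only in recovery_request_write leaves a window
in which a set-faulty plus remove can free the target, while taking it in
sync_request defers the removal until the write has completed.

/* Userspace illustration only -- none of these names exist in the kernel. */
#include <stdbool.h>
#include <stdio.h>

struct fake_rdev {
        int nr_pending;   /* models rdev->nr_pending                     */
        bool faulty;      /* models the Faulty flag                      */
        bool removed;     /* set once the "array" has dropped the device */
};

/* Management-thread side: a faulty device may only be dropped once no
 * outstanding IO holds a reference on it.
 */
static void try_remove(struct fake_rdev *rdev)
{
        if (rdev->faulty && rdev->nr_pending == 0)
                rdev->removed = true;
}

/* Sync-thread side: issuing the recovery write is only safe while we
 * hold a reference, i.e. while nr_pending is elevated.
 */
static void submit_recovery_write(const struct fake_rdev *rdev)
{
        if (rdev->removed)
                puts("BUG: recovery write issued to a removed device");
        else
                puts("recovery write submitted safely");
}

int main(void)
{
        /* Old ordering: the write target is chosen, but nr_pending is
         * only raised later, in recovery_request_write.  A set-faulty
         * plus remove fits into that window.
         */
        struct fake_rdev late = { 0 };
        late.faulty = true;
        try_remove(&late);              /* succeeds: nr_pending is still 0 */
        submit_recovery_write(&late);   /* too late, the device is gone    */

        /* New ordering (what the patch does): take the reference right
         * after selecting the target in sync_request, so the removal is
         * deferred until the IO has completed and dropped the reference.
         */
        struct fake_rdev early = { 0 };
        early.nr_pending++;             /* reference taken in sync_request */
        early.faulty = true;
        try_remove(&early);             /* deferred: nr_pending > 0        */
        submit_recovery_write(&early);  /* safe                            */
        early.nr_pending--;             /* end_sync_write drops the ref    */
        try_remove(&early);             /* now the removal can proceed     */

        return 0;
}

In the kernel the try_remove role is played by the hot-remove path
(raid10_remove_disk), which only refuses the removal while rdev->nr_pending
is raised; that is why the reference has to be taken before the window opens
rather than in recovery_request_write.
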
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index c67aa54..ec1ea43 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1408,6 +1408,16 @@ static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio)
 
 done:
         if (atomic_dec_and_test(&r10_bio->remaining)) {
+                for (i = 0; i < conf->copies ; i++) {
+                        /* for any unsuccessful IOs give up the pending count on the
+                         * write device
+                         */
+                        if (!test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags) &&
+                            r10_bio->devs[i].bio->bi_end_io == end_sync_write) {
+                                int d = r10_bio->devs[i].devnum;
+                                rdev_dec_pending(conf->mirrors[d].rdev, mddev);
+                        }
+                }
                 md_done_sync(mddev, r10_bio->sectors, 1);
                 put_buf(r10_bio);
         }
@@ -1443,7 +1453,6 @@ static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
         }
 
         d = r10_bio->devs[1].devnum;
-        atomic_inc(&conf->mirrors[d].rdev->nr_pending);
         md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
         if (test_bit(R10BIO_Uptodate, &r10_bio->state))
                 generic_make_request(wbio);
@@ -1906,14 +1915,24 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
                 int j, k;
                 r10_bio = NULL;
 
-                for (i=0 ; i<conf->raid_disks; i++)
+                for (i = 0; i < conf->raid_disks; i++)
                         if (conf->mirrors[i].rdev &&
                             !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) {
                                 int still_degraded = 0;
                                 /* want to reconstruct this device */
                                 r10bio_t *rb2 = r10_bio;
-                                sector_t sect = raid10_find_virt(conf, sector_nr, i);
+                                sector_t sect;
                                 int must_sync;
+
+                                /* Skip over the faulty drives */
+                                if (test_bit(Faulty, &conf->mirrors[i].rdev->flags))
+                                        continue;
+
+                                /* up the nr_pending count; this will be decremented on
+                                 * sync/recovery IO completion (success/error) to this dev
+                                 */
+                                atomic_inc(&conf->mirrors[i].rdev->nr_pending);
+                                sect = raid10_find_virt(conf, sector_nr, i);
                                 /* Unless we are doing a full sync, we only need
                                  * to recover the block if it is set in the bitmap
                                  */
@@ -1927,6 +1946,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
                                          * that there will never be anything to do here
                                          */
                                         chunks_skipped = -1;
+                                        rdev_dec_pending(conf->mirrors[i].rdev, mddev);
                                         continue;
                                 }
 
@@ -1997,6 +2017,10 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
                                 if (j == conf->copies) {
                                         /* Cannot recover, so abort the recovery */
                                         put_buf(r10_bio);
+                                        /* we wanted to write to 'i' but we didn't; so dec the
+                                         * pending count
+                                         */
+                                        rdev_dec_pending(conf->mirrors[i].rdev, mddev);
                                         if (rb2)
                                                 atomic_dec(&rb2->remaining);
                                         r10_bio = rb2;
@@ -2014,6 +2038,22 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
                                 r10_bio = (r10bio_t*) rb2->master_bio;
                                 rb2->master_bio = NULL;
                                 put_buf(rb2);
+                                if (r10_bio) {
+                                        int d;
+                                        /* Before throwing away the complete r10bio chain, decrement
+                                         * the nr_pending ref counts incremented along the way
+                                         * We do this only for our master_bio and then on because
+                                         * if biolist == NULL no ref count was incremented in this
+                                         * r10_bio
+                                         */
+                                        for (d = 0; d < conf->copies; d++) {
+                                                if (r10_bio->devs[d].bio &&
+                                                    r10_bio->devs[d].bio->bi_end_io) {
+                                                        int dn = r10_bio->devs[d].devnum;
+                                                        rdev_dec_pending(conf->mirrors[dn].rdev, mddev);
+                                                }
+                                        }
+                                }
                         }
                         goto giveup;
                 }
--
1.7.1