From: Li Nan <linan122@xxxxxxxxxx> After commit 4ca40c2ce099 ("md/raid10: Allow replacement device to be replace old drive.") mirrors->replacement can replace rdev during repl_io pending. Replacement which write io issued to may have already become rdev when io completes, in this case, there are the following issues if we get rdev/replacement from mirror. 1. When replacement becomes rdev, target device of io is mirror->rdev, but bio is still r10bio->repl_bio. Changing 'repl' to 0 in raid10_end_write_request() will cause repl_bio to be put but not set to IO_MADE_GOOD, UAF will occur. The stack is shown at the end of patch. 2. After replacement replaces rdev, a new replacement is added. The replacement when issued io and the replacement when io ended are not the same, which can lead to some bugs, such as bad blocks set, nr_pending accounting, md_error rdev, and so on. In the previous patch, we recorded rdev/replacement in r10bio, directly getting target device of io from r10bio to fix these bugs. ================================================================== BUG: KASAN: use-after-free in bio_flagged include/linux/bio.h:238 [inline] BUG: KASAN: use-after-free in bio_put+0x78/0x80 block/bio.c:650 Read of size 2 at addr ffff888116524dd4 by task md0_raid10/2618 CPU: 0 PID: 2618 Comm: md0_raid10 Not tainted 5.10.0+ #3 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 sd 0:0:0:0: rejecting I/O to offline device Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x107/0x167 lib/dump_stack.c:118 print_address_description.constprop.0+0x1c/0x270 mm/kasan/report.c:390 __kasan_report mm/kasan/report.c:550 [inline] kasan_report.cold+0x22/0x3a mm/kasan/report.c:567 bio_flagged include/linux/bio.h:238 [inline] bio_put+0x78/0x80 block/bio.c:650 put_all_bios drivers/md/raid10.c:248 [inline] free_r10bio drivers/md/raid10.c:257 [inline] raid_end_bio_io+0x3b5/0x590 drivers/md/raid10.c:309 handle_write_completed drivers/md/raid10.c:2699 [inline] raid10d+0x2f85/0x5af0 drivers/md/raid10.c:2759 md_thread+0x444/0x4b0 drivers/md/md.c:7932 kthread+0x38c/0x470 kernel/kthread.c:313 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:299 Allocated by task 1400: kasan_save_stack+0x1b/0x40 mm/kasan/common.c:48 kasan_set_track mm/kasan/common.c:56 [inline] set_alloc_info mm/kasan/common.c:498 [inline] __kasan_kmalloc.constprop.0+0xb5/0xe0 mm/kasan/common.c:530 slab_post_alloc_hook mm/slab.h:512 [inline] slab_alloc_node mm/slub.c:2923 [inline] slab_alloc mm/slub.c:2931 [inline] kmem_cache_alloc+0x144/0x360 mm/slub.c:2936 mempool_alloc+0x146/0x360 mm/mempool.c:391 bio_alloc_bioset+0x375/0x610 block/bio.c:486 bio_clone_fast+0x20/0x50 block/bio.c:711 raid10_write_one_disk+0x166/0xd30 drivers/md/raid10.c:1240 raid10_write_request+0x1600/0x2c90 drivers/md/raid10.c:1484 __make_request drivers/md/raid10.c:1508 [inline] raid10_make_request+0x376/0x620 drivers/md/raid10.c:1537 md_handle_request+0x699/0x970 drivers/md/md.c:451 md_submit_bio+0x204/0x400 drivers/md/md.c:489 __submit_bio block/blk-core.c:959 [inline] __submit_bio_noacct block/blk-core.c:1007 [inline] submit_bio_noacct+0x2e3/0xcf0 block/blk-core.c:1086 submit_bio+0x1a0/0x3a0 block/blk-core.c:1146 submit_bh_wbc+0x685/0x8e0 fs/buffer.c:3053 ext4_commit_super+0x37e/0x6c0 fs/ext4/super.c:5696 flush_stashed_error_work+0x28b/0x400 fs/ext4/super.c:791 process_one_work+0x9a6/0x1590 kernel/workqueue.c:2280 worker_thread+0x61d/0x1310 kernel/workqueue.c:2426 kthread+0x38c/0x470 kernel/kthread.c:313 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:299 Freed by task 2618: kasan_save_stack+0x1b/0x40 mm/kasan/common.c:48 kasan_set_track+0x1c/0x30 mm/kasan/common.c:56 kasan_set_free_info+0x20/0x40 mm/kasan/generic.c:361 __kasan_slab_free+0x151/0x180 mm/kasan/common.c:482 slab_free_hook mm/slub.c:1569 [inline] slab_free_freelist_hook+0xa9/0x180 mm/slub.c:1608 slab_free mm/slub.c:3179 [inline] kmem_cache_free+0xcd/0x3d0 mm/slub.c:3196 mempool_free+0xe3/0x3b0 mm/mempool.c:500 bio_free+0xe2/0x140 block/bio.c:266 bio_put+0x58/0x80 block/bio.c:651 raid10_end_write_request+0x885/0xb60 drivers/md/raid10.c:516 bio_endio+0x376/0x6a0 block/bio.c:1465 req_bio_endio block/blk-core.c:289 [inline] blk_update_request+0x5f5/0xf40 block/blk-core.c:1525 blk_mq_end_request+0x4c/0x510 block/blk-mq.c:654 blk_flush_complete_seq+0x835/0xd80 block/blk-flush.c:204 flush_end_io+0x7b7/0xb90 block/blk-flush.c:261 __blk_mq_end_request+0x282/0x4c0 block/blk-mq.c:645 scsi_end_request+0x3a8/0x850 drivers/scsi/scsi_lib.c:607 scsi_io_completion+0x3f5/0x1320 drivers/scsi/scsi_lib.c:970 scsi_softirq_done+0x11b/0x490 drivers/scsi/scsi_lib.c:1448 blk_mq_complete_request block/blk-mq.c:788 [inline] blk_mq_complete_request+0x84/0xb0 block/blk-mq.c:785 scsi_mq_done+0x155/0x360 drivers/scsi/scsi_lib.c:1603 virtscsi_vq_done drivers/scsi/virtio_scsi.c:184 [inline] virtscsi_req_done+0x14c/0x220 drivers/scsi/virtio_scsi.c:199 vring_interrupt drivers/virtio/virtio_ring.c:2061 [inline] vring_interrupt+0x27a/0x300 drivers/virtio/virtio_ring.c:2047 __handle_irq_event_percpu+0x2f8/0x830 kernel/irq/handle.c:156 handle_irq_event_percpu kernel/irq/handle.c:196 [inline] handle_irq_event+0x105/0x280 kernel/irq/handle.c:213 handle_edge_irq+0x258/0xd20 kernel/irq/chip.c:828 asm_call_irq_on_stack+0xf/0x20 __run_irq_on_irqstack arch/x86/include/asm/irq_stack.h:48 [inline] run_irq_on_irqstack_cond arch/x86/include/asm/irq_stack.h:101 [inline] handle_irq arch/x86/kernel/irq.c:230 [inline] __common_interrupt arch/x86/kernel/irq.c:249 [inline] common_interrupt+0xe2/0x190 arch/x86/kernel/irq.c:239 asm_common_interrupt+0x1e/0x40 arch/x86/include/asm/idtentry.h:626 Fixes: 4ca40c2ce099 ("md/raid10: Allow replacement device to be replace old drive.") Signed-off-by: Li Nan <linan122@xxxxxxxxxx> --- drivers/md/raid10.c | 40 ++++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 4fcfcb350d2b..56206cbd169d 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -468,17 +468,15 @@ static void raid10_end_write_request(struct bio *bio) dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl); if (repl) - rdev = conf->mirrors[dev].replacement; - if (!rdev) { - smp_rmb(); - repl = 0; - rdev = conf->mirrors[dev].rdev; - } + rdev = r10_bio->devs[slot].replacement; + else + rdev = r10_bio->devs[slot].rdev; /* * this branch is our 'one mirror IO has finished' event handler: */ if (bio->bi_status && !discard_error) { - if (repl) + /* replacement may have replaced rdev */ + if (repl && rdev == conf->mirrors[dev].replacement) /* Never record new bad blocks to replacement, * just fail it. */ @@ -504,7 +502,10 @@ static void raid10_end_write_request(struct bio *bio) else { /* Fail the request */ set_bit(R10BIO_Degraded, &r10_bio->state); - r10_bio->devs[slot].bio = NULL; + if (repl) + r10_bio->devs[slot].repl_bio = NULL; + else + r10_bio->devs[slot].bio = NULL; to_put = bio; dec_rdev = 1; } @@ -1289,15 +1290,10 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, int devnum = r10_bio->devs[n_copy].devnum; struct bio *mbio; - if (replacement) { - rdev = conf->mirrors[devnum].replacement; - if (rdev == NULL) { - /* Replacement just got moved to main 'rdev' */ - smp_mb(); - rdev = conf->mirrors[devnum].rdev; - } - } else - rdev = conf->mirrors[devnum].rdev; + if (replacement) + rdev = r10_bio->devs[n_copy].replacement; + else + rdev = r10_bio->devs[n_copy].rdev; mbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO, &mddev->bio_set); if (replacement) @@ -1309,8 +1305,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, choose_data_offset(r10_bio, rdev)); mbio->bi_end_io = raid10_end_write_request; mbio->bi_opf = op | do_sync | do_fua; - if (!replacement && test_bit(FailFast, - &conf->mirrors[devnum].rdev->flags) + if (!replacement && test_bit(FailFast, &rdev->flags) && enough(conf, devnum)) mbio->bi_opf |= MD_FAILFAST; mbio->bi_private = r10_bio; @@ -1530,10 +1525,12 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, } if (rdev) { r10_bio->devs[i].bio = bio; + r10_bio->devs[i].rdev = rdev; atomic_inc(&rdev->nr_pending); } if (rrdev) { r10_bio->devs[i].repl_bio = bio; + r10_bio->devs[i].replacement = rrdev; atomic_inc(&rrdev->nr_pending); } } @@ -3051,9 +3048,8 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) } else { bool fail = false; for (m = 0; m < conf->copies; m++) { - int dev = r10_bio->devs[m].devnum; struct bio *bio = r10_bio->devs[m].bio; - rdev = conf->mirrors[dev].rdev; + rdev = r10_bio->devs[m].rdev; if (bio == IO_MADE_GOOD) { rdev_clear_badblocks( rdev, @@ -3070,7 +3066,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) rdev_dec_pending(rdev, conf->mddev); } bio = r10_bio->devs[m].repl_bio; - rdev = conf->mirrors[dev].replacement; + rdev = r10_bio->devs[m].replacement; if (rdev && bio == IO_MADE_GOOD) { rdev_clear_badblocks( rdev, -- 2.39.2