On Mar 25, 2021 / 19:29, Bart Van Assche wrote: > When multiple request queues share a tag set and when switching the I/O > scheduler for one of the request queues associated with that tag set, the > following race can happen: > * blk_mq_tagset_busy_iter() calls bt_tags_iter() and bt_tags_iter() assigns > a pointer to a scheduler request to the local variable 'rq'. > * blk_mq_sched_free_requests() is called to free hctx->sched_tags. > * blk_mq_tagset_busy_iter() dereferences 'rq' and triggers a use-after-free. > > Fix this race as follows: > * Use rcu_assign_pointer() and rcu_dereference() to access hctx->tags->rqs[]. > The performance impact of the assignments added to the hot path is minimal > (about 1% according to one particular test). > * Protect hctx->tags->rqs[] reads with an RCU read-side lock or with a > semaphore. Which mechanism is used depends on whether or not it is allowed > to sleep and also on whether or not the callback function may sleep. > * Wait for all preexisting readers to finish before freeing scheduler > requests. > > Multiple users have reported use-after-free complaints similar to the > following (from https://lore.kernel.org/linux-block/1545261885.185366.488.camel@xxxxxxx/): > > BUG: KASAN: use-after-free in bt_iter+0x86/0xf0 > Read of size 8 at addr ffff88803b335240 by task fio/21412 > > CPU: 0 PID: 21412 Comm: fio Tainted: G W 4.20.0-rc6-dbg+ #3 > Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 > Call Trace: > dump_stack+0x86/0xca > print_address_description+0x71/0x239 > kasan_report.cold.5+0x242/0x301 > __asan_load8+0x54/0x90 > bt_iter+0x86/0xf0 > blk_mq_queue_tag_busy_iter+0x373/0x5e0 > blk_mq_in_flight+0x96/0xb0 > part_in_flight+0x40/0x140 > part_round_stats+0x18e/0x370 > blk_account_io_start+0x3d7/0x670 > blk_mq_bio_to_request+0x19c/0x3a0 > blk_mq_make_request+0x7a9/0xcb0 > generic_make_request+0x41d/0x960 > submit_bio+0x9b/0x250 > do_blockdev_direct_IO+0x435c/0x4c70 > __blockdev_direct_IO+0x79/0x88 > ext4_direct_IO+0x46c/0xc00 > generic_file_direct_write+0x119/0x210 > __generic_file_write_iter+0x11c/0x280 > ext4_file_write_iter+0x1b8/0x6f0 > aio_write+0x204/0x310 > io_submit_one+0x9d3/0xe80 > __x64_sys_io_submit+0x115/0x340 > do_syscall_64+0x71/0x210 > > Cc: Christoph Hellwig <hch@xxxxxx> > Cc: Ming Lei <ming.lei@xxxxxxxxxx> > Cc: Hannes Reinecke <hare@xxxxxxx> > Cc: Johannes Thumshirn <johannes.thumshirn@xxxxxxx> > Cc: John Garry <john.garry@xxxxxxxxxx> > Cc: Khazhy Kumykov <khazhy@xxxxxxxxxx> > Cc: Shin'ichiro Kawasaki <shinichiro.kawasaki@xxxxxxx> > Signed-off-by: Bart Van Assche <bvanassche@xxxxxxx> > --- > block/blk-core.c | 34 +++++++++++++++++++++++++++++++++- > block/blk-mq-tag.c | 43 +++++++++++++++++++++++++++++++++++++++---- > block/blk-mq-tag.h | 4 +++- > block/blk-mq.c | 21 +++++++++++++++++---- > block/blk-mq.h | 1 + > block/blk.h | 2 ++ > block/elevator.c | 1 + > 7 files changed, 96 insertions(+), 10 deletions(-) > > diff --git a/block/blk-core.c b/block/blk-core.c > index fc60ff208497..fabb45ecd216 100644 > --- a/block/blk-core.c > +++ b/block/blk-core.c > @@ -279,6 +279,36 @@ void blk_dump_rq_flags(struct request *rq, char *msg) > } > EXPORT_SYMBOL(blk_dump_rq_flags); > > +/** > + * blk_mq_wait_for_tag_readers - wait for preexisting tag readers to finish > + * @set: Tag set to wait on. > + * > + * Waits for preexisting __blk_mq_all_tag_iter() calls to finish accessing > + * hctx->tags->rqs[]. New readers may start while this function is in progress > + * or after this function has returned. Use this function to make sure that > + * hctx->tags->rqs[] changes have become globally visible. > + * > + * Accesses of hctx->tags->rqs[] by blk_mq_queue_tag_busy_iter() calls are out > + * of scope for this function. The caller can pause blk_mq_queue_tag_busy_iter() > + * calls for a request queue by freezing that request queue. > + */ > +void blk_mq_wait_for_tag_readers(struct blk_mq_tag_set *set) > +{ > + struct blk_mq_tags *tags; > + int i; > + > + if (set->tags) { > + for (i = 0; i < set->nr_hw_queues; i++) { > + tags = set->tags[i]; > + if (!tags) > + continue; > + down_write(&tags->iter_rwsem); > + up_write(&tags->iter_rwsem); > + } > + } > + synchronize_rcu(); > +} > + > /** > * blk_sync_queue - cancel any pending callbacks on a queue > * @q: the queue > @@ -412,8 +442,10 @@ void blk_cleanup_queue(struct request_queue *q) > * it is safe to free requests now. > */ > mutex_lock(&q->sysfs_lock); > - if (q->elevator) > + if (q->elevator) { > + blk_mq_wait_for_tag_readers(q->tag_set); > blk_mq_sched_free_requests(q); > + } > mutex_unlock(&q->sysfs_lock); > > percpu_ref_exit(&q->q_usage_counter); > diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c > index 975626f755c2..c8722ce7c35c 100644 > --- a/block/blk-mq-tag.c > +++ b/block/blk-mq-tag.c > @@ -209,7 +209,12 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) > > if (!reserved) > bitnr += tags->nr_reserved_tags; > - rq = tags->rqs[bitnr]; > + /* > + * See also the percpu_ref_tryget() and blk_queue_exit() calls in > + * blk_mq_queue_tag_busy_iter(). > + */ > + rq = rcu_dereference_check(tags->rqs[bitnr], > + !percpu_ref_is_zero(&hctx->queue->q_usage_counter)); > > /* > * We can hit rq == NULL here, because the tagging functions > @@ -254,11 +259,17 @@ struct bt_tags_iter_data { > unsigned int flags; > }; > > +/* Include reserved tags. */ > #define BT_TAG_ITER_RESERVED (1 << 0) > +/* Only include started requests. */ > #define BT_TAG_ITER_STARTED (1 << 1) > +/* Iterate over tags->static_rqs[] instead of tags->rqs[]. */ > #define BT_TAG_ITER_STATIC_RQS (1 << 2) > +/* The callback function may sleep. */ > +#define BT_TAG_ITER_MAY_SLEEP (1 << 3) > > -static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) > +static bool __bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, > + void *data) > { > struct bt_tags_iter_data *iter_data = data; > struct blk_mq_tags *tags = iter_data->tags; > @@ -275,7 +286,7 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) > if (iter_data->flags & BT_TAG_ITER_STATIC_RQS) > rq = tags->static_rqs[bitnr]; > else > - rq = tags->rqs[bitnr]; > + rq = rcu_dereference_check(tags->rqs[bitnr], true); > if (!rq) > return true; > if ((iter_data->flags & BT_TAG_ITER_STARTED) && > @@ -284,6 +295,25 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) > return iter_data->fn(rq, iter_data->data, reserved); > } > > +static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) > +{ > + struct bt_tags_iter_data *iter_data = data; > + struct blk_mq_tags *tags = iter_data->tags; > + bool res; > + > + if (iter_data->flags & BT_TAG_ITER_MAY_SLEEP) { > + down_read(&tags->iter_rwsem); > + res = __bt_tags_iter(bitmap, bitnr, data); > + up_read(&tags->iter_rwsem); > + } else { > + rcu_read_lock(); > + res = __bt_tags_iter(bitmap, bitnr, data); > + rcu_read_unlock(); > + } > + > + return res; > +} > + > /** > * bt_tags_for_each - iterate over the requests in a tag map > * @tags: Tag map to iterate over. > @@ -357,10 +387,12 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, > { > int i; > > + might_sleep(); > + > for (i = 0; i < tagset->nr_hw_queues; i++) { > if (tagset->tags && tagset->tags[i]) > __blk_mq_all_tag_iter(tagset->tags[i], fn, priv, > - BT_TAG_ITER_STARTED); > + BT_TAG_ITER_STARTED | BT_TAG_ITER_MAY_SLEEP); > } > } > EXPORT_SYMBOL(blk_mq_tagset_busy_iter); > @@ -554,6 +586,9 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, > kfree(tags); > return NULL; > } > + > + init_rwsem(&tags->iter_rwsem); > + > return tags; > } > Hi Bart, thank you for your effort on this problem. I applied this series on v5.12-rc4 and ran blktests block/005 with a HDD behind SAS-HBA, and I observed kernel INFO and WARNING [1]. It looks like that tags->iter_rwsem is not initialized. I found blk_mq_init_tags() has two paths to "return tags". I think when blk_mq_init_tags() returns at the first path to "return tags", tags->iter_rwsem misses the initialization. To confirm it, I moved the init_rwsem() before the first "return tags", then saw the kernel messages disappeared (use-after-free disappeared also). Could you relook the hunk? [1] [ 80.079549] run blktests block/005 at 2021-03-26 17:21:49 [ 80.183881] INFO: trying to register non-static key. [ 80.189545] the code is fine but needs lockdep annotation. [ 80.195725] turning off the locking correctness validator. [ 80.201906] CPU: 3 PID: 1229 Comm: check Not tainted 5.12.0-rc4+ #8 [ 80.208863] Hardware name: Supermicro Super Server/X10SRL-F, BIOS 2.0 12/17/2015 [ 80.216949] Call Trace: [ 80.220091] dump_stack+0x93/0xc2 [ 80.224111] register_lock_class+0x1b58/0x1b60 [ 80.229248] ? lock_chain_count+0x20/0x20 [ 80.233953] ? __kasan_slab_free+0xe5/0x110 [ 80.238833] ? slab_free_freelist_hook+0x4b/0x140 [ 80.244231] ? kmem_cache_free+0xf4/0x5b0 [ 80.248937] ? is_dynamic_key+0x1b0/0x1b0 [ 80.253643] ? mark_lock+0xe4/0x2fd0 [ 80.257918] ? queue_attr_store+0x8b/0xd0 [ 80.262625] ? kernfs_fop_write_iter+0x2cb/0x460 [ 80.267937] ? new_sync_write+0x355/0x5e0 [ 80.272642] ? vfs_write+0x5b2/0x860 [ 80.276917] ? ksys_write+0xe9/0x1b0 [ 80.281190] __lock_acquire+0xe3/0x58b0 [ 80.285725] ? __lock_acquire+0xbb0/0x58b0 [ 80.290514] ? lockdep_hardirqs_on_prepare+0x3e0/0x3e0 [ 80.296348] lock_acquire+0x181/0x6a0 [ 80.300707] ? blk_mq_wait_for_tag_readers+0xdd/0x150 [ 80.306453] ? lock_release+0x680/0x680 [ 80.310983] ? find_held_lock+0x2c/0x110 [ 80.315604] ? lock_is_held_type+0x98/0x110 [ 80.320484] down_write+0x84/0x130 [ 80.324586] ? blk_mq_wait_for_tag_readers+0xdd/0x150 [ 80.330330] ? down_write_killable_nested+0x150/0x150 [ 80.336076] blk_mq_wait_for_tag_readers+0xdd/0x150 [ 80.341650] elevator_exit+0x75/0xe0 [ 80.345924] elevator_switch_mq+0xda/0x4d0 [ 80.350717] elevator_switch+0x5d/0xa0 [ 80.355161] elv_iosched_store+0x31b/0x3c0 [ 80.359956] ? elevator_init_mq+0x350/0x350 [ 80.364834] ? lock_is_held_type+0x98/0x110 [ 80.369714] queue_attr_store+0x8b/0xd0 [ 80.374248] ? sysfs_file_ops+0x170/0x170 [ 80.378951] kernfs_fop_write_iter+0x2cb/0x460 [ 80.384094] new_sync_write+0x355/0x5e0 [ 80.388627] ? new_sync_read+0x5d0/0x5d0 [ 80.393243] ? ksys_write+0xe9/0x1b0 [ 80.397517] ? lock_release+0x680/0x680 [ 80.402047] ? __cond_resched+0x15/0x30 [ 80.406580] ? inode_security+0x56/0xf0 [ 80.411115] ? lock_is_held_type+0x98/0x110 [ 80.415999] vfs_write+0x5b2/0x860 [ 80.420096] ksys_write+0xe9/0x1b0 [ 80.424193] ? __ia32_sys_read+0xb0/0xb0 [ 80.428811] ? syscall_enter_from_user_mode+0x27/0x70 [ 80.434556] ? trace_hardirqs_on+0x1c/0x110 [ 80.439438] do_syscall_64+0x33/0x40 [ 80.443709] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 80.449458] RIP: 0033:0x7fad1b2794e7 [ 80.453733] Code: 0d 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 48 89 54 24 18 48 89 74 24 [ 80.473169] RSP: 002b:00007ffd987bdc38 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [ 80.481429] RAX: ffffffffffffffda RBX: 000000000000000c RCX: 00007fad1b2794e7 [ 80.489253] RDX: 000000000000000c RSI: 000055a049b83d30 RDI: 0000000000000001 [ 80.497071] RBP: 000055a049b83d30 R08: 000000000000000a R09: 00007fad1b3100c0 [ 80.504890] R10: 00007fad1b30ffc0 R11: 0000000000000246 R12: 000000000000000c [ 80.512715] R13: 00007fad1b34c520 R14: 000000000000000c R15: 00007fad1b34c720 [ 80.520576] ------------[ cut here ]------------ [ 80.525896] DEBUG_RWSEMS_WARN_ON(sem->magic != sem): count = 0x1, magic = 0x0, owner = 0xffff8881251cb240, curr 0xffff8881251cb240, list not empty [ 80.539706] WARNING: CPU: 3 PID: 1229 at kernel/locking/rwsem.c:1311 up_write+0x365/0x530 [ 80.548579] Modules linked in: xt_CHECKSUM xt_MASQUERADE xt_conntrack ipt_REJECT nf_nat_tftp nf_conntrack_tftp tun bridge stp llc nft_objref nf_conntrack_netbios_ns nf_conntrack_broadcast nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_tables ebtable_nat ebtable_broute ip6table_nat ip6table_mangle ip6table_raw ip6table_security iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 iptable_mangle iptable_raw iptable_security ip_set nfnetlink rfkill target_core_user ebtable_filter ebtables target_core_mod ip6table_filter ip6_tables iptable_filter sunrpc intel_rapl_msr intel_rapl_common x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm irqbypass rapl intel_cstate iTCO_wdt intel_uncore intel_pmc_bxt pcspkr iTCO_vendor_support joydev i2c_i801 i2c_smbus lpc_ich ses enclosure mei_me ipmi_ssif mei ioatdma wmi acpi_ipmi ipmi_si ipmi_devintf ipmi_msghandler acpi_power_meter acpi_pad [ 80.548690] zram ip_tables ast drm_vram_helper drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops cec drm_ttm_helper ttm crct10dif_pclmul crc32_pclmul crc32c_intel drm ghash_clmulni_intel igb mpt3sas dca i2c_algo_bit xhci_pci raid_class xhci_pci_renesas scsi_transport_sas fuse [ 80.661295] CPU: 3 PID: 1229 Comm: check Not tainted 5.12.0-rc4+ #8 [ 80.668258] Hardware name: Supermicro Super Server/X10SRL-F, BIOS 2.0 12/17/2015 [ 80.676345] RIP: 0010:up_write+0x365/0x530 [ 80.681155] Code: 02 00 0f 85 73 01 00 00 ff 34 24 48 8b 55 00 4d 89 f0 48 c7 c6 c0 45 69 a0 4c 8b 4c 24 10 48 c7 c7 00 46 69 a0 e8 65 e2 e2 01 <0f> 0b 59 e9 c3 fe ff ff 4c 8d 75 58 c6 05 cd ef 67 03 01 48 b8 00 [ 80.700595] RSP: 0018:ffff888125affb08 EFLAGS: 00010286 [ 80.706513] RAX: 0000000000000000 RBX: 1ffff11024b5ff65 RCX: 0000000000000000 [ 80.714339] RDX: 0000000000000004 RSI: 0000000000000008 RDI: ffffed1024b5ff57 [ 80.722169] RBP: ffff888144da20c0 R08: 0000000000000001 R09: ffff8888114ee4a7 [ 80.730000] R10: ffffed110229dc94 R11: 0000000000000001 R12: ffff888144da20c8 [ 80.737826] R13: ffff888144da2128 R14: ffff8881251cb240 R15: ffffffffa19a5668 [ 80.745653] FS: 00007fad1b184740(0000) GS:ffff8888114c0000(0000) knlGS:0000000000000000 [ 80.754432] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 80.760872] CR2: 00007fad1b34c5b0 CR3: 000000015e552003 CR4: 00000000001706e0 [ 80.768700] Call Trace: [ 80.771849] ? downgrade_write+0x370/0x370 [ 80.776645] ? down_write_killable_nested+0x150/0x150 [ 80.782393] blk_mq_wait_for_tag_readers+0xe5/0x150 [ 80.787977] elevator_exit+0x75/0xe0 [ 80.792255] elevator_switch_mq+0xda/0x4d0 [ 80.797051] elevator_switch+0x5d/0xa0 [ 80.801503] elv_iosched_store+0x31b/0x3c0 [ 80.806295] ? elevator_init_mq+0x350/0x350 [ 80.811177] ? lock_is_held_type+0x98/0x110 [ 80.816064] queue_attr_store+0x8b/0xd0 [ 80.820605] ? sysfs_file_ops+0x170/0x170 [ 80.825310] kernfs_fop_write_iter+0x2cb/0x460 [ 80.830452] new_sync_write+0x355/0x5e0 [ 80.834994] ? new_sync_read+0x5d0/0x5d0 [ 80.839621] ? ksys_write+0xe9/0x1b0 [ 80.843893] ? lock_release+0x680/0x680 [ 80.848427] ? __cond_resched+0x15/0x30 [ 80.852967] ? inode_security+0x56/0xf0 [ 80.857510] ? lock_is_held_type+0x98/0x110 [ 80.862398] vfs_write+0x5b2/0x860 [ 80.866497] ksys_write+0xe9/0x1b0 [ 80.870596] ? __ia32_sys_read+0xb0/0xb0 [ 80.875214] ? syscall_enter_from_user_mode+0x27/0x70 [ 80.880962] ? trace_hardirqs_on+0x1c/0x110 [ 80.885850] do_syscall_64+0x33/0x40 [ 80.890132] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 80.895886] RIP: 0033:0x7fad1b2794e7 [ 80.900160] Code: 0d 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 48 89 54 24 18 48 89 74 24 [ 80.919598] RSP: 002b:00007ffd987bdc38 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [ 80.927856] RAX: ffffffffffffffda RBX: 000000000000000c RCX: 00007fad1b2794e7 [ 80.935684] RDX: 000000000000000c RSI: 000055a049b83d30 RDI: 0000000000000001 [ 80.943512] RBP: 000055a049b83d30 R08: 000000000000000a R09: 00007fad1b3100c0 [ 80.951335] R10: 00007fad1b30ffc0 R11: 0000000000000246 R12: 000000000000000c [ 80.959163] R13: 00007fad1b34c520 R14: 000000000000000c R15: 00007fad1b34c720 [ 80.966993] irq event stamp: 34619 [ 80.971096] hardirqs last enabled at (34619): [<ffffffffa01e3044>] _raw_spin_unlock_irq+0x24/0x30 [ 80.980753] hardirqs last disabled at (34618): [<ffffffffa01e2e33>] _raw_spin_lock_irq+0x43/0x50 [ 80.990233] softirqs last enabled at (34494): [<ffffffff9e1af289>] __irq_exit_rcu+0x1b9/0x270 [ 80.999542] softirqs last disabled at (34487): [<ffffffff9e1af289>] __irq_exit_rcu+0x1b9/0x270 [ 81.008841] ---[ end trace c154275a45048f81 ]--- -- Best Regards, Shin'ichiro Kawasaki