Hi,
在 2024/05/22 16:23, Li Nan 写道:
Hi, Yi Zhang
在 2024/5/22 12:40, Yi Zhang 写道:
Hello
With Sagi's new blktests case scenario[1], I tried the concurrent
power on/off for null-blk[2] and found it will lead to kernel
panic[3], please help check it and let me know if you need any
info/testing for it, thanks.
BTW, I will submit one blktests case for it later.
[1]
https://lore.kernel.org/linux-nvme/20240521085623.87681-1-sagi@xxxxxxxxxxx/
[2]Reproducer:
nullb1="/sys/kernel/config/nullb/nullb1"
modprobe null-blk nr_devices=0
mkdir $nullb1
echo 1024 > $nullb1/size
echo 1 > $nullb1/memory_backed
null_blk_power_loop() {
local iterations=10
for ((i = 1; i <= ${iterations}; i++)); do
echo 0 > $nullb1/power
echo 1 > $nullb1/power
done
}
null_blk_power_loop &
null_blk_power_loop &
I tried to fix this:
https://lore.kernel.org/all/20230610074618.2673673-1-yukuai1@xxxxxxxxxxxxxxx/
Hi, Yi, can you give the attached patch a test? I just rebase it with
block/for-next branch.
Thanks,
Kuai
Concurrent creation and deletion of null_blk processes can lead to this
issue. Consider the following scenario:
T1 T2
echo 0 > power
null_del_dev
free nullb
echo 1 > power
null_add_dev
dev->nullb = nullb
...
dev->nullb = NULL
T2 creates a new device and sets dev->nullb, but T1 sets dev->nullb to NULL
later. after that, anyone trying to access dev->nullb will trigger NULL
pointer dereference.
I will fix it soon. Thanks for your report.
wait
[3]
[ 1200.017939] null_blk: disk nullb1 created
[ 1200.043860] BUG: kernel NULL pointer dereference, address:
0000000000000130
[ 1200.051627] #PF: supervisor write access in kernel mode
[ 1200.057458] #PF: error_code(0x0002) - not-present page
[ 1200.063192] PGD 0 P4D 0
[ 1200.066021] Oops: 0002 [#1] PREEMPT SMP PTI
[ 1200.070691] CPU: 0 PID: 1724 Comm: sh Not tainted
6.9.0-64.eln136.x86_64 #1
[ 1200.078462] Hardware name: Dell Inc. PowerEdge R730xd/ɲ�Pow, BIOS
2.19.0 12/12/2023
[ 1200.087105] RIP: 0010:_raw_spin_lock_irq+0x18/0x30
[ 1200.092459] Code: 66 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90
90 90 f3 0f 1e fa 0f 1f 44 00 00 fa 65 ff 05 47 28 26 7c 31 c0 ba 01
00 00 00 <f0> 0f b1 17 75 05 c3 cc cc cc cc 89 c6 e8 46 01 00 00 90 c3
cc cc
[ 1200.113416] RSP: 0018:ffffb973431e7420 EFLAGS: 00010046
[ 1200.119249] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
0000000000000008
[ 1200.127212] RDX: 0000000000000001 RSI: 0000000000000000 RDI:
0000000000000130
[ 1200.135175] RBP: 0000000000000008 R08: ffff9645c44e4400 R09:
0000000000000000
[ 1200.143140] R10: 0000000000000014 R11: 0000000000000000 R12:
ffff964743a2d800
[ 1200.151103] R13: 0000000000000130 R14: ffff9645d05580f8 R15:
0000000000000001
[ 1200.159067] FS: 00007f67cea3b740(0000) GS:ffff964737a00000(0000)
knlGS:0000000000000000
[ 1200.168099] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1200.174511] CR2: 0000000000000130 CR3: 00000002819c4001 CR4:
00000000001706f0
[ 1200.182473] Call Trace:
[ 1200.185200] <TASK>
[ 1200.187541] ? show_trace_log_lvl+0x1b0/0x2f0
[ 1200.192406] ? show_trace_log_lvl+0x1b0/0x2f0
[ 1200.197272] ? null_handle_rq+0x3f/0x500 [null_blk]
[ 1200.202735] ? __die_body.cold+0x8/0x12
[ 1200.207008] ? page_fault_oops+0x146/0x160
[ 1200.211583] ? exc_page_fault+0x73/0x160
[ 1200.215963] ? asm_exc_page_fault+0x26/0x30
[ 1200.220634] ? _raw_spin_lock_irq+0x18/0x30
[ 1200.225303] null_handle_rq+0x3f/0x500 [null_blk]
[ 1200.230567] ? pcpu_alloc+0x369/0x900
[ 1200.234654] ? lock_timer_base+0x76/0xa0
[ 1200.239035] null_process_cmd+0xb4/0x100 [null_blk]
[ 1200.244490] null_handle_cmd+0x36/0x130 [null_blk]
[ 1200.249849] null_queue_rq+0x130/0x200 [null_blk]
[ 1200.255110] ? __pfx_autoremove_wake_function+0x10/0x10
[ 1200.260948] __blk_mq_issue_directly+0x4b/0xc0
[ 1200.265911] blk_mq_try_issue_directly+0x88/0x110
[ 1200.271155] blk_mq_submit_bio+0x596/0x690
[ 1200.275720] submit_bio_noacct_nocheck+0x162/0x240
[ 1200.281071] ? submit_bio_noacct+0x24/0x540
[ 1200.285740] block_read_full_folio+0x1f8/0x300
[ 1200.290702] ? __pfx_blkdev_get_block+0x10/0x10
[ 1200.295760] ? __pfx_blkdev_read_folio+0x10/0x10
[ 1200.300913] ? __pfx_blkdev_read_folio+0x10/0x10
[ 1200.306064] filemap_read_folio+0x43/0xe0
[ 1200.310542] ? __pfx_blkdev_read_folio+0x10/0x10
[ 1200.315693] do_read_cache_folio+0x7c/0x190
[ 1200.320365] read_part_sector+0x33/0xb0
[ 1200.324650] read_lba+0xc1/0x180
[ 1200.328256] find_valid_gpt.constprop.0+0xe1/0x520
[ 1200.333609] efi_partition+0x7c/0x3a0
[ 1200.337699] ? snprintf+0x53/0x70
[ 1200.341401] ? __pfx_efi_partition+0x10/0x10
[ 1200.346170] check_partition+0x101/0x1c0
[ 1200.350550] bdev_disk_changed+0x193/0x330
[ 1200.355121] ? security_file_alloc+0x61/0xf0
[ 1200.359889] blkdev_get_whole+0x5f/0xa0
[ 1200.364170] bdev_open+0x205/0x3c0
[ 1200.367966] bdev_file_open_by_dev+0xbd/0x110
[ 1200.372829] disk_scan_partitions+0x6e/0x100
[ 1200.377594] device_add_disk+0x3bb/0x3c0
[ 1200.381972] null_add_dev+0x479/0x650 [null_blk]
[ 1200.387139] nullb_device_power_store+0x7c/0x120 [null_blk]
[ 1200.393370] configfs_write_iter+0xbc/0x120
[ 1200.398042] vfs_write+0x296/0x460
[ 1200.401841] ksys_write+0x6d/0xf0
[ 1200.405543] do_syscall_64+0x7e/0x160
[ 1200.409634] ? syscall_exit_work+0xf3/0x120
[ 1200.414302] ? syscall_exit_to_user_mode+0x71/0x1f0
[ 1200.419740] ? do_syscall_64+0x8a/0x160
[ 1200.424019] ? syscall_exit_work+0xf3/0x120
[ 1200.428686] ? syscall_exit_to_user_mode+0x71/0x1f0
[ 1200.434131] ? do_syscall_64+0x8a/0x160
[ 1200.438413] ? syscall_exit_work+0xf3/0x120
[ 1200.443081] ? syscall_exit_to_user_mode+0x71/0x1f0
[ 1200.448525] ? do_syscall_64+0x8a/0x160
[ 1200.452807] ? syscall_exit_work+0xf3/0x120
[ 1200.457475] ? syscall_exit_to_user_mode+0x71/0x1f0
[ 1200.462921] ? do_syscall_64+0x8a/0x160
[ 1200.467202] ? syscall_exit_to_user_mode+0x71/0x1f0
[ 1200.472638] ? do_syscall_64+0x8a/0x160
[ 1200.476918] ? exc_page_fault+0x73/0x160
[ 1200.481296] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[ 1200.486933] RIP: 0033:0x7f67ce8fda57
[ 1200.490931] Code: 0f 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7
0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 00 00
00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 48 89 54 24 18 48 89
74 24
[ 1200.511886] RSP: 002b:00007ffd30172638 EFLAGS: 00000246 ORIG_RAX:
0000000000000001
[ 1200.520335] RAX: ffffffffffffffda RBX: 0000000000000002 RCX:
00007f67ce8fda57
[ 1200.528297] RDX: 0000000000000002 RSI: 0000557fdb881c90 RDI:
0000000000000001
[ 1200.536258] RBP: 0000557fdb881c90 R08: 0000000000000000 R09:
00007f67ce9b14e0
[ 1200.544219] R10: 00007f67ce9b13e0 R11: 0000000000000246 R12:
0000000000000002
[ 1200.552181] R13: 00007f67ce9fb780 R14: 0000000000000002 R15:
00007f67ce9f69e0
[ 1200.560145] </TASK>
[ 1200.562581] Modules linked in: null_blk sunrpc vfat fat
intel_rapl_msr intel_rapl_common sb_edac x86_pkg_temp_thermal
intel_powerclamp coretemp dell_wmi_descriptor kvm_intel sparse_keymap
cdc_ether rfkill ipmi_ssif usbnet video kvm mei_me iTCO_wdt dcdbas mii
iTCO_vendor_support mei ipmi_si rapl mgag200 mxm_wmi pcspkr
intel_cstate ipmi_devintf acpi_power_meter ipmi_msghandler
intel_uncore i2c_algo_bit lpc_ich fuse xfs sd_mod sg nvme ahci libahci
crct10dif_pclmul crc32_pclmul crc32c_intel libata ghash_clmulni_intel
nvme_core tg3 megaraid_sas nvme_auth t10_pi wmi dm_mirror
dm_region_hash dm_log dm_mod [last unloaded: null_blk]
[ 1200.624314] CR2: 0000000000000130
[ 1200.628012] ---[ end trace 0000000000000000 ]---
[ 1200.688370] RIP: 0010:_raw_spin_lock_irq+0x18/0x30
[ 1200.693747] Code: 66 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90
90 90 f3 0f 1e fa 0f 1f 44 00 00 fa 65 ff 05 47 28 26 7c 31 c0 ba 01
00 00 00 <f0> 0f b1 17 75 05 c3 cc cc cc cc 89 c6 e8 46 01 00 00 90 c3
cc cc
[ 1200.714704] RSP: 0018:ffffb973431e7420 EFLAGS: 00010046
[ 1200.720535] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
0000000000000008
[ 1200.728498] RDX: 0000000000000001 RSI: 0000000000000000 RDI:
0000000000000130
[ 1200.736461] RBP: 0000000000000008 R08: ffff9645c44e4400 R09:
0000000000000000
[ 1200.744423] R10: 0000000000000014 R11: 0000000000000000 R12:
ffff964743a2d800
[ 1200.752384] R13: 0000000000000130 R14: ffff9645d05580f8 R15:
0000000000000001
[ 1200.760348] FS: 00007f67cea3b740(0000) GS:ffff964737a00000(0000)
knlGS:0000000000000000
[ 1200.769378] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1200.775789] CR2: 0000000000000130 CR3: 00000002819c4001 CR4:
00000000001706f0
[ 1200.783754] Kernel panic - not syncing: Fatal exception
[ 1200.789636] Kernel Offset: 0x2000000 from 0xffffffff81000000
(relocation range: 0xffffffff80000000-0xffffffffbfffffff)
[ 1200.856453] ---[ end Kernel panic - not syncing: Fatal exception ]---
Best Regards,
Yi Zhang
.
From ac365444523bb8dc15400fd1d395d483637ac249 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@xxxxxxxxxx>
Date: Sat, 10 Jun 2023 15:46:18 +0800
Subject: [PATCH] null_blk: fix null-ptr-dereference while configuring 'power'
and 'submit_queues'
Writing 'power' and 'submit_queues' concurrently will trigger kernel
panic:
Test script:
modprobe null_blk nr_devices=0
mkdir -p /sys/kernel/config/nullb/nullb0
while true; do echo 1 > submit_queues; echo 4 > submit_queues; done &
while true; do echo 1 > power; echo 0 > power; done
Test result:
BUG: kernel NULL pointer dereference, address: 0000000000000148
Oops: 0000 [#1] PREEMPT SMP
RIP: 0010:__lock_acquire+0x41d/0x28f0
Call Trace:
<TASK>
lock_acquire+0x121/0x450
down_write+0x5f/0x1d0
simple_recursive_removal+0x12f/0x5c0
blk_mq_debugfs_unregister_hctxs+0x7c/0x100
blk_mq_update_nr_hw_queues+0x4a3/0x720
nullb_update_nr_hw_queues+0x71/0xf0 [null_blk]
nullb_device_submit_queues_store+0x79/0xf0 [null_blk]
configfs_write_iter+0x119/0x1e0
vfs_write+0x326/0x730
ksys_write+0x74/0x150
This is because del_gendisk() can concurrent with
blk_mq_update_nr_hw_queues():
nullb_device_power_store nullb_apply_submit_queues
null_del_dev
del_gendisk
nullb_update_nr_hw_queues
if (!dev->nullb)
// still set while gendisk is deleted
return 0
blk_mq_update_nr_hw_queues
dev->nullb = NULL
Fix this problem by synchronize nullb_device_power_store() and
nullb_update_nr_hw_queues() with a mutex.
Fixes: 45919fbfe1c4 ("null_blk: Enable modifying 'submit_queues' after an instance has been configured")
Signed-off-by: Yu Kuai <yukuai3@xxxxxxxxxx>
---
drivers/block/loop.c | 2 +-
drivers/block/null_blk/main.c | 40 +++++++++++++++++++++++------------
2 files changed, 27 insertions(+), 15 deletions(-)
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 28a95fd366fe..3e1c4f5ef714 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -2140,7 +2140,7 @@ static int loop_control_remove(int idx)
pr_warn_once("deleting an unspecified loop device is not supported.\n");
return -EINVAL;
}
-
+
/* Hide this loop device for serialization. */
ret = mutex_lock_killable(&loop_ctl_mutex);
if (ret)
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index 5d56ad4ce01a..eb023d267369 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -413,13 +413,25 @@ static int nullb_update_nr_hw_queues(struct nullb_device *dev,
static int nullb_apply_submit_queues(struct nullb_device *dev,
unsigned int submit_queues)
{
- return nullb_update_nr_hw_queues(dev, submit_queues, dev->poll_queues);
+ int ret;
+
+ mutex_lock(&lock);
+ ret = nullb_update_nr_hw_queues(dev, submit_queues, dev->poll_queues);
+ mutex_unlock(&lock);
+
+ return ret;
}
static int nullb_apply_poll_queues(struct nullb_device *dev,
unsigned int poll_queues)
{
- return nullb_update_nr_hw_queues(dev, dev->submit_queues, poll_queues);
+ int ret;
+
+ mutex_lock(&lock);
+ ret = nullb_update_nr_hw_queues(dev, dev->submit_queues, poll_queues);
+ mutex_unlock(&lock);
+
+ return ret;
}
NULLB_DEVICE_ATTR(size, ulong, NULL);
@@ -468,28 +480,31 @@ static ssize_t nullb_device_power_store(struct config_item *item,
if (ret < 0)
return ret;
+ ret = count;
+ mutex_lock(&lock);
if (!dev->power && newp) {
if (test_and_set_bit(NULLB_DEV_FL_UP, &dev->flags))
- return count;
+ goto out;
+
ret = null_add_dev(dev);
if (ret) {
clear_bit(NULLB_DEV_FL_UP, &dev->flags);
- return ret;
+ goto out;
}
set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags);
dev->power = newp;
} else if (dev->power && !newp) {
if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) {
- mutex_lock(&lock);
dev->power = newp;
null_del_dev(dev->nullb);
- mutex_unlock(&lock);
}
clear_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags);
}
- return count;
+out:
+ mutex_unlock(&lock);
+ return ret;
}
CONFIGFS_ATTR(nullb_device_, power);
@@ -1932,15 +1947,12 @@ static int null_add_dev(struct nullb_device *dev)
nullb->q->queuedata = nullb;
blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q);
- mutex_lock(&lock);
rv = ida_alloc(&nullb_indexes, GFP_KERNEL);
- if (rv < 0) {
- mutex_unlock(&lock);
+ if (rv < 0)
goto out_cleanup_disk;
- }
+
nullb->index = rv;
dev->index = rv;
- mutex_unlock(&lock);
if (config_item_name(&dev->group.cg_item)) {
/* Use configfs dir name as the device name */
@@ -1969,9 +1981,7 @@ static int null_add_dev(struct nullb_device *dev)
if (rv)
goto out_ida_free;
- mutex_lock(&lock);
list_add_tail(&nullb->list, &nullb_list);
- mutex_unlock(&lock);
pr_info("disk %s created\n", nullb->disk_name);
@@ -2020,7 +2030,9 @@ static int null_create_dev(void)
if (!dev)
return -ENOMEM;
+ mutex_lock(&lock);
ret = null_add_dev(dev);
+ mutex_unlock(&lock);
if (ret) {
null_free_dev(dev);
return ret;
--
2.39.2