Hi,
在 2024/05/20 18:38, Changhui Zhong 写道:
On Mon, May 20, 2024 at 10:55 AM Yu Kuai <yukuai1@xxxxxxxxxxxxxxx> wrote:
Hi, Changhui
在 2024/05/20 8:39, Changhui Zhong 写道:
[czhong@vm linux-block]$ git bisect bad
060406c61c7cb4bbd82a02d179decca9c9bb3443 is the first bad commit
commit 060406c61c7cb4bbd82a02d179decca9c9bb3443
Author: Yu Kuai<yukuai3@xxxxxxxxxx>
Date: Thu May 9 20:38:25 2024 +0800
block: add plug while submitting IO
So that if caller didn't use plug, for example, __blkdev_direct_IO_simple()
and __blkdev_direct_IO_async(), block layer can still benefit from caching
nsec time in the plug.
Signed-off-by: Yu Kuai<yukuai3@xxxxxxxxxx>
Link:https://lore.kernel.org/r/20240509123825.3225207-1-yukuai1@xxxxxxxxxxxxxxx
Signed-off-by: Jens Axboe<axboe@xxxxxxxxx>
block/blk-core.c | 6 ++++++
1 file changed, 6 insertions(+)
Thanks for the test!
I was surprised to see this blamed commit, and after taking a look at
raid1 barrier code, I found that there are some known problems, fixed in
raid10, while raid1 still unfixed. So I wonder this patch maybe just
making the exist problem easier to reporduce.
I'll start cooking patches to sync raid10 fixes to raid1, meanwhile,
can you change your script to test raid10 as well, if raid10 is fine,
I'll give you these patches later to test raid1.
Thanks,
Kuai
Hi, Kuai
I tested raid10 and trigger this issue too,
Thanks for the test! Since raid10 has the same problem as well, then the
problem seems to be more common in raid. And related code to raid10 is
more simpler, attached is a patch to add debuginfo to raid10.
BTW, Xiao can reporduce the problem as well, and will lend a hand as
well.
Thanks,
Kuai
[ 332.435340] Create raid10
[ 332.573160] device-mapper: raid: Superblocks created for new raid set
[ 332.595273] md/raid10:mdX: not clean -- starting background reconstruction
[ 332.595277] md/raid10:mdX: active with 4 out of 4 devices
[ 332.597017] mdX: bitmap file is out of date, doing full recovery
[ 332.603712] md: resync of RAID array mdX
[ 492.173892] INFO: task mdX_resync:3092 blocked for more than 122 seconds.
[ 492.180694] Not tainted 6.9.0+ #1
[ 492.184536] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs"
disables this message.
[ 492.192365] task:mdX_resync state:D stack:0 pid:3092
tgid:3092 ppid:2 flags:0x00004000
[ 492.192368] Call Trace:
[ 492.192370] <TASK>
[ 492.192371] __schedule+0x222/0x670
[ 492.192377] schedule+0x2c/0xb0
[ 492.192381] raise_barrier+0xc3/0x190 [raid10]
[ 492.192387] ? __pfx_autoremove_wake_function+0x10/0x10
[ 492.192392] raid10_sync_request+0x2c3/0x1ae0 [raid10]
[ 492.192397] ? __schedule+0x22a/0x670
[ 492.192398] ? prepare_to_wait_event+0x5f/0x190
[ 492.192401] md_do_sync+0x660/0x1040
[ 492.192405] ? __pfx_autoremove_wake_function+0x10/0x10
[ 492.192408] md_thread+0xad/0x160
[ 492.192410] ? __pfx_md_thread+0x10/0x10
[ 492.192411] kthread+0xdc/0x110
[ 492.192414] ? __pfx_kthread+0x10/0x10
[ 492.192416] ret_from_fork+0x2d/0x50
[ 492.192420] ? __pfx_kthread+0x10/0x10
[ 492.192421] ret_from_fork_asm+0x1a/0x30
[ 492.192424] </TASK>
Thanks,
Changhui
.
From d443a5bc43ecd41386e8e9103cd5e16914a1a2c3 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@xxxxxxxxxx>
Date: Mon, 20 May 2024 19:43:29 +0800
Subject: [PATCH] tmp
Signed-off-by: Yu Kuai <yukuai3@xxxxxxxxxx>
---
drivers/md/raid10.c | 49 ++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 44 insertions(+), 5 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index a4556d2e46bf..bc05d4ff5185 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -331,6 +331,7 @@ static void raid_end_bio_io(struct r10bio *r10_bio)
*/
allow_barrier(conf);
+ printk("%s: r10_bio %px done\n", __func__, r10_bio);
free_r10bio(r10_bio);
}
@@ -899,6 +900,34 @@ static void flush_pending_writes(struct r10conf *conf)
spin_unlock_irq(&conf->device_lock);
}
+static bool barrier_waiting(struct r10conf *conf)
+{
+ if (conf->nr_waiting) {
+ printk("%s: nr_waiting %d\n", __func__, conf->nr_waiting);
+ return false;
+ }
+
+ return true;
+}
+
+static bool try_raise_barrier(struct r10conf *conf)
+{
+ int nr;
+
+ if (conf->barrier >= RESYNC_DEPTH) {
+ printk_ratelimited("%s: resync depth exceed limit\n", __func__);
+ return false;
+ }
+
+ nr = atomic_read(&conf->nr_pending);
+ if (nr) {
+ printk("%s: nr_pending %d\n", __func__, nr);
+ return false;
+ }
+
+ return true;
+}
+
/* Barriers....
* Sometimes we need to suspend IO while we do something else,
* either some resync/recovery, or reconfigure the array.
@@ -929,14 +958,13 @@ static void raise_barrier(struct r10conf *conf, int force)
force = false;
/* Wait until no block IO is waiting (unless 'force') */
- wait_event_barrier(conf, force || !conf->nr_waiting);
+ wait_event_barrier(conf, force || barrier_waiting(conf));
/* block any new IO from starting */
WRITE_ONCE(conf->barrier, conf->barrier + 1);
/* Now wait for all pending IO to complete */
- wait_event_barrier(conf, !atomic_read(&conf->nr_pending) &&
- conf->barrier < RESYNC_DEPTH);
+ wait_event_barrier(conf, try_raise_barrier(conf));
write_sequnlock_irq(&conf->resync_lock);
}
@@ -1006,8 +1034,10 @@ static bool wait_barrier(struct r10conf *conf, bool nowait)
{
bool ret = true;
- if (wait_barrier_nolock(conf))
+ if (wait_barrier_nolock(conf)) {
+ printk("%s: nr_pending: %d\n", __func__, atomic_read(&conf->nr_pending));
return true;
+ }
write_seqlock_irq(&conf->resync_lock);
if (conf->barrier) {
@@ -1024,9 +1054,12 @@ static bool wait_barrier(struct r10conf *conf, bool nowait)
wake_up(&conf->wait_barrier);
}
/* Only increment nr_pending when we wait */
- if (ret)
+ if (ret) {
atomic_inc(&conf->nr_pending);
+ printk("%s: nr_pending: %d\n", __func__, atomic_read(&conf->nr_pending));
+ }
write_sequnlock_irq(&conf->resync_lock);
+
return ret;
}
@@ -1035,6 +1068,8 @@ static void allow_barrier(struct r10conf *conf)
if ((atomic_dec_and_test(&conf->nr_pending)) ||
(conf->array_freeze_pending))
wake_up_barrier(conf);
+
+ printk("%s: nr_pending: %d\n", __func__, atomic_read(&conf->nr_pending));
}
static void freeze_array(struct r10conf *conf, int extra)
@@ -1187,6 +1222,8 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
if (!regular_request_wait(mddev, conf, bio, r10_bio->sectors))
return;
+
+ printk("%s: r10_bio %px start\n", __func__, r10_bio);
rdev = read_balance(conf, r10_bio, &max_sectors);
if (!rdev) {
if (err_rdev) {
@@ -1374,6 +1411,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
sectors = r10_bio->sectors;
if (!regular_request_wait(mddev, conf, bio, sectors))
return;
+
+ printk("%s: r10_bio %px start\n", __func__, r10_bio);
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
(mddev->reshape_backwards
? (bio->bi_iter.bi_sector < conf->reshape_safe &&
--
2.39.2