Hi, Thank you, I tried your patch, but I got a deadlock. I reproduce it with script and some bad sectors on hard disks: #!/bin/sh mdadm -CR /dev/md0 -b none -c 64 -l5 -n4 /dev/sd[cbde] usleep 1800000 mdadm -S /dev/md0 <4>[ 74.835547] ====================================================== <4>[ 74.835548] [ INFO: possible circular locking dependency detected ] <4>[ 74.835550] 3.11.0-next-20130906+ #6 Not tainted <4>[ 74.835551] ------------------------------------------------------- <4>[ 74.835552] mdadm/3085 is trying to acquire lock: <4>[ 74.835560] (&mddev->reconfig_mutex){+.+.+.}, at: [<ffffffff81539751>] __md_stop_writes+0x41/0xa0 <4>[ 74.835561] <4>[ 74.835561] but task is already holding lock: <4>[ 74.835565] (&mddev->open_mutex){+.+.+.}, at: [<ffffffff8153c6e7>] do_md_stop+0x37/0x4d0 <4>[ 74.835565] <4>[ 74.835565] which lock already depends on the new lock. <4>[ 74.835565] <4>[ 74.835566] <4>[ 74.835566] the existing dependency chain (in reverse order) is: <4>[ 74.835569] <4>[ 74.835569] -> #1 (&mddev->open_mutex){+.+.+.}: <4>[ 74.835574] [<ffffffff810a8cc7>] lock_acquire+0x97/0x110 <4>[ 74.835578] [<ffffffff81743377>] mutex_lock_nested+0x47/0x3a0 <4>[ 74.835580] [<ffffffff8153c6e7>] do_md_stop+0x37/0x4d0 <4>[ 74.835583] [<ffffffff8153f6f9>] md_ioctl+0xef9/0x15c0 <4>[ 74.835586] [<ffffffff812eeda3>] __blkdev_driver_ioctl+0x23/0x30 <4>[ 74.835589] [<ffffffff812ef420>] blkdev_ioctl+0x200/0x7b0 <4>[ 74.835592] [<ffffffff81195b37>] block_ioctl+0x37/0x40 <4>[ 74.835597] [<ffffffff8116f8d1>] do_vfs_ioctl+0x91/0x550 <4>[ 74.835599] [<ffffffff8116fe21>] SyS_ioctl+0x91/0xa0 <4>[ 74.835602] [<ffffffff81750012>] system_call_fastpath+0x16/0x1b <4>[ 74.835604] <4>[ 74.835604] -> #0 (&mddev->reconfig_mutex){+.+.+.}: <4>[ 74.835607] [<ffffffff810a8b30>] __lock_acquire+0x1740/0x1840 <4>[ 74.835609] [<ffffffff810a8cc7>] lock_acquire+0x97/0x110 <4>[ 74.835612] [<ffffffff81743e13>] mutex_lock_interruptible_nested+0x53/0x410 <4>[ 74.835614] [<ffffffff81539751>] __md_stop_writes+0x41/0xa0 <4>[ 74.835626] [<ffffffff8153c71d>] do_md_stop+0x6d/0x4d0 <4>[ 74.835627] [<ffffffff8153f6f9>] md_ioctl+0xef9/0x15c0 <4>[ 74.835629] [<ffffffff812eeda3>] __blkdev_driver_ioctl+0x23/0x30 <4>[ 74.835631] [<ffffffff812ef420>] blkdev_ioctl+0x200/0x7b0 <4>[ 74.835633] [<ffffffff81195b37>] block_ioctl+0x37/0x40 <4>[ 74.835634] [<ffffffff8116f8d1>] do_vfs_ioctl+0x91/0x550 <4>[ 74.835636] [<ffffffff8116fe21>] SyS_ioctl+0x91/0xa0 <4>[ 74.835638] [<ffffffff81750012>] system_call_fastpath+0x16/0x1b <4>[ 74.835638] <4>[ 74.835638] other info that might help us debug this: <4>[ 74.835638] <4>[ 74.835639] Possible unsafe locking scenario: <4>[ 74.835639] <4>[ 74.835639] CPU0 CPU1 <4>[ 74.835640] ---- ---- <4>[ 74.835641] lock(&mddev->open_mutex); <4>[ 74.835642] lock(&mddev->reconfig_mutex); <4>[ 74.835644] lock(&mddev->open_mutex); <4>[ 74.835645] lock(&mddev->reconfig_mutex); <4>[ 74.835645] <4>[ 74.835645] *** DEADLOCK *** <4>[ 74.835645] <4>[ 74.835646] 1 lock held by mdadm/3085: <4>[ 74.835649] #0: (&mddev->open_mutex){+.+.+.}, at: [<ffffffff8153c6e7>] do_md_stop+0x37/0x4d0 <4>[ 74.835650] <4>[ 74.835650] stack backtrace: <4>[ 74.835652] CPU: 1 PID: 3085 Comm: mdadm Not tainted 3.11.0-next-20130906+ #6 <4>[ 74.835653] Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./To be filled by O.E.M., BIOS 080015 02/27/2012 <4>[ 74.835655] ffffffff820ed440 ffff88013917ba58 ffffffff817425e6 0000000000000003 <4>[ 74.835657] ffffffff820ed5f0 ffff88013917baa8 ffffffff810a61d6 ffff88013917ba98 <4>[ 74.835659] ffff88013917bb08 ffff8800b7e4a0c0 ffff8800b7e4a7b8 ffff8800b7e4a0c0 <4>[ 74.835659] Call Trace: <4>[ 74.835661] [<ffffffff817425e6>] dump_stack+0x49/0x5b <4>[ 74.835663] [<ffffffff810a61d6>] print_circular_bug+0x216/0x310 <4>[ 74.835665] [<ffffffff810a8b30>] __lock_acquire+0x1740/0x1840 <4>[ 74.835668] [<ffffffff8174806b>] ? _raw_spin_unlock_irqrestore+0x3b/0x70 <4>[ 74.835669] [<ffffffff81539751>] ? __md_stop_writes+0x41/0xa0 <4>[ 74.835671] [<ffffffff810a8cc7>] lock_acquire+0x97/0x110 <4>[ 74.835673] [<ffffffff81539751>] ? __md_stop_writes+0x41/0xa0 <4>[ 74.835674] [<ffffffff8174806b>] ? _raw_spin_unlock_irqrestore+0x3b/0x70 <4>[ 74.835676] [<ffffffff81743e13>] mutex_lock_interruptible_nested+0x53/0x410 <4>[ 74.835678] [<ffffffff81539751>] ? __md_stop_writes+0x41/0xa0 <4>[ 74.835679] [<ffffffff81539751>] ? __md_stop_writes+0x41/0xa0 <4>[ 74.835681] [<ffffffff81539751>] __md_stop_writes+0x41/0xa0 <4>[ 74.835682] [<ffffffff8153c71d>] do_md_stop+0x6d/0x4d0 <4>[ 74.835684] [<ffffffff8153e8f7>] ? md_ioctl+0xf7/0x15c0 <4>[ 74.835686] [<ffffffff8110a693>] ? filemap_fdatawait+0x23/0x30 <4>[ 74.835688] [<ffffffff8110aa6c>] ? filemap_write_and_wait+0x6c/0x80 <4>[ 74.835689] [<ffffffff8153f6f9>] md_ioctl+0xef9/0x15c0 <4>[ 74.835691] [<ffffffff8174806b>] ? _raw_spin_unlock_irqrestore+0x3b/0x70 <4>[ 74.835693] [<ffffffff810a709d>] ? trace_hardirqs_on_caller+0xfd/0x1c0 <4>[ 74.835695] [<ffffffff810a716d>] ? trace_hardirqs_on+0xd/0x10 <4>[ 74.835696] [<ffffffff812eeda3>] __blkdev_driver_ioctl+0x23/0x30 <4>[ 74.835698] [<ffffffff812ef420>] blkdev_ioctl+0x200/0x7b0 <4>[ 74.835700] [<ffffffff81195b37>] block_ioctl+0x37/0x40 <4>[ 74.835702] [<ffffffff8116f8d1>] do_vfs_ioctl+0x91/0x550 <4>[ 74.835704] [<ffffffff81070bb0>] ? update_rmtp+0x80/0x80 <4>[ 74.835706] [<ffffffff81750037>] ? sysret_check+0x1b/0x56 <4>[ 74.835708] [<ffffffff8116fe21>] SyS_ioctl+0x91/0xa0 <4>[ 74.835710] [<ffffffff8130db2e>] ? trace_hardirqs_on_thunk+0x3a/0x3f <4>[ 74.835712] [<ffffffff81750012>] system_call_fastpath+0x16/0x1b <7>[ 74.835716] interrupting MD-thread pid 3076 <6>[ 74.937009] md: export_rdev(sdb) >On Mon, 2 Sep 2013 01:09:55 -0400 ycbzzjlby@xxxxxxxxx wrote: > >> From: Bian Yu <bianyu@xxxxxxxxxxx> >> >> When raid5 hit a fresh badblock, this badblock will flagged as unack >> badblock until md_update_sb is called. >> But md_stop/reboot/md_set_readonly will avoid raid5d call md_update_sb >> in md_check_recovery, the badblock will always be unack, so raid5d >> thread enter a infinite loop and never can unregister sync_thread >> that cause deadlock. >> >> To solve this, before md_stop_writes call md_unregister_thread, set >> MD_STOPPING_WRITES on mddev->flags. In raid5.c analyse_stripe judge >> MD_STOPPING_WRITES bit on mddev->flags, if setted don't block rdev >> to wait md_update_sb. so raid5d thread can be stopped. >> >> I can reproduce it by using follow way: >> When raid5 array is recovering and hit a fresh badblock, then shutdown array at once. >> >> [ 480.850203] Not tainted 3.11.0-next-20130906+ #4 >> [ 480.852344] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. >> [ 480.854380] [<ffffffff8153b324>] md_do_sync+0x7e4/0xe60 >> [ 480.854386] [<ffffffff81747fcb>] ? _raw_spin_unlock_irq+0x2b/0x40 >> [ 480.854395] [<ffffffff81536fa0>] ? md_unregister_thread+0x90/0x90 >> [ 480.854400] [<ffffffff810a716d>] ? trace_hardirqs_on+0xd/0x10 >> [ 480.854405] [<ffffffff815370bf>] md_thread+0x11f/0x170 >> [ 480.854410] [<ffffffff81536fa0>] ? md_unregister_thread+0x90/0x90 >> [ 480.854415] [<ffffffff8106d956>] kthread+0xd6/0xe0 >> [ 480.854423] [<ffffffff8106d880>] ? __init_kthread_worker+0x70/0x70 >> [ 480.854428] [<ffffffff8174ff2c>] ret_from_fork+0x7c/0xb0 >> [ 480.854432] [<ffffffff8106d880>] ? __init_kthread_worker+0x70/0x70 >> [ 480.854435] no locks held by md0_resync/3246. >> [ 480.854437] INFO: task mdadm:3257 blocked for more than 120 seconds. >> [ 480.854438] Not tainted 3.11.0-next-20130906+ #4 >> [ 480.854439] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. >> [ 480.854442] mdadm D 0000000000000000 5024 3257 3209 0x00000080 >> [ 480.854445] ffff880138c37b18 0000000000000046 00000000ffffffff ffff880037d3b120 >> [ 480.854447] ffff88013a038720 ffff88013a038000 0000000000013500 ffff880138c37fd8 >> [ 480.854449] ffff880138c36010 0000000000013500 0000000000013500 ffff880138c37fd8 >> [ 480.854449] Call Trace: >> [ 480.854452] [<ffffffff81745fa4>] schedule+0x24/0x70 >> [ 480.854453] [<ffffffff81742725>] schedule_timeout+0x175/0x200 >> [ 480.854455] [<ffffffff810a6d30>] ? mark_held_locks+0x80/0x130 >> [ 480.854457] [<ffffffff81747fcb>] ? _raw_spin_unlock_irq+0x2b/0x40 >> [ 480.854459] [<ffffffff810a709d>] ? trace_hardirqs_on_caller+0xfd/0x1c0 >> [ 480.854461] [<ffffffff810a716d>] ? trace_hardirqs_on+0xd/0x10 >> [ 480.854463] [<ffffffff81746600>] wait_for_completion+0xa0/0x110 >> [ 480.854465] [<ffffffff8107d160>] ? try_to_wake_up+0x300/0x300 >> [ 480.854467] [<ffffffff8106ddbc>] kthread_stop+0x4c/0xe0 >> [ 480.854468] [<ffffffff81536f5e>] md_unregister_thread+0x4e/0x90 >> [ 480.854470] [<ffffffff8153965d>] md_reap_sync_thread+0x1d/0x140 >> [ 480.854472] [<ffffffff815397ab>] __md_stop_writes+0x2b/0x80 >> [ 480.854473] [<ffffffff8153c911>] do_md_stop+0x91/0x4d0 >> [ 480.854475] [<ffffffff8153e8a7>] ? md_ioctl+0xf7/0x15c0 >> [ 480.854477] [<ffffffff810a716d>] ? trace_hardirqs_on+0xd/0x10 >> [ 480.854479] [<ffffffff8153f6a9>] md_ioctl+0xef9/0x15c0 >> [ 480.854481] [<ffffffff8113145d>] ? handle_mm_fault+0x17d/0x920 >> >> Signed-off-by: Bian Yu <bianyu@xxxxxxxxxxx> >> --- >> drivers/md/md.c | 2 ++ >> drivers/md/md.h | 4 ++++ >> drivers/md/raid5.c | 3 ++- >> 3 files changed, 8 insertions(+), 1 deletions(-) >> >> diff --git a/drivers/md/md.c b/drivers/md/md.c >> index adf4d7e..54ef71f 100644 >> --- a/drivers/md/md.c >> +++ b/drivers/md/md.c >> @@ -5278,6 +5278,7 @@ static void md_clean(struct mddev *mddev) >> static void __md_stop_writes(struct mddev *mddev) >> { >> set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); >> + set_bit(MD_STOPPING_WRITES, &mddev->flags); >> if (mddev->sync_thread) { >> set_bit(MD_RECOVERY_INTR, &mddev->recovery); >> md_reap_sync_thread(mddev); >> @@ -5294,6 +5295,7 @@ static void __md_stop_writes(struct mddev *mddev) >> mddev->in_sync = 1; >> md_update_sb(mddev, 1); >> } >> + clear_bit(MD_STOPPING_WRITES, &mddev->flags); >> } >> >> void md_stop_writes(struct mddev *mddev) >> diff --git a/drivers/md/md.h b/drivers/md/md.h >> index 608050c..a24ae1d 100644 >> --- a/drivers/md/md.h >> +++ b/drivers/md/md.h >> @@ -214,6 +214,10 @@ struct mddev { >> #define MD_STILL_CLOSED 4 /* If set, then array has not been opened since >> * md_ioctl checked on it. >> */ >> +#define MD_STOPPING_WRITES 5 /* If set, raid5 shouldn't set unacknowledged >> + * badblock blocked in analyse_stripe to avoid >> + * infinite loop. >> + */ >> >> int suspended; >> atomic_t active_io; >> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c >> index f9972e2..ff1aecf 100644 >> --- a/drivers/md/raid5.c >> +++ b/drivers/md/raid5.c >> @@ -3446,7 +3446,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) >> if (rdev) { >> is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, >> &first_bad, &bad_sectors); >> - if (s->blocked_rdev == NULL >> + if (!test_bit(MD_STOPPING_WRITES, &conf->mddev->flags) >> + && s->blocked_rdev == NULL >> && (test_bit(Blocked, &rdev->flags) >> || is_bad < 0)) { >> if (is_bad < 0) > > >Thanks for including the extra details in the patch, but it wasn't only that >I didn't think it should be needed (I was wrong there). I also don't like >the patch. It isn't at all clear to me that it will do the right thing. >There is a reason that we block until the bad block lists has been updated, >and to just not block because it is inconvenient just doesn't seem right. > >I would rather change the sequencing so that the badblock list can be updated >at this point. >Something like that patch below, but I'm not 100% sure of it yet and I'm >about to go on leave so I'm not sure I'll be able to commit to it for a while. > >If you could review and test I would appreciate it. > >Thanks, >NeilBrown > >diff --git a/drivers/md/md.c b/drivers/md/md.c >index adf4d7e..e4d78c0 100644 >--- a/drivers/md/md.c >+++ b/drivers/md/md.c >@@ -3170,6 +3170,9 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, > return -EACCES; > rv = mddev ? mddev_lock(mddev): -EBUSY; > if (!rv) { >+ mutex_lock(&mddev->open_mutex); >+ clear_bit(MD_STILL_CLOSED, &mddev->flags); >+ mutex_unlock(&mddev->open_mutex); > if (rdev->mddev == NULL) > rv = -EBUSY; > else >@@ -4764,6 +4767,9 @@ md_attr_store(struct kobject *kobj, struct attribute *attr, > flush_workqueue(md_misc_wq); > rv = mddev_lock(mddev); > if (!rv) { >+ mutex_lock(&mddev->open_mutex); >+ clear_bit(MD_STILL_CLOSED, &mddev->flags); >+ mutex_unlock(&mddev->open_mutex); > rv = entry->store(mddev, page, length); > mddev_unlock(mddev); > } >@@ -5279,8 +5285,10 @@ static void __md_stop_writes(struct mddev *mddev) > { > set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); > if (mddev->sync_thread) { >+ mddev_unlock(&mddev); > set_bit(MD_RECOVERY_INTR, &mddev->recovery); > md_reap_sync_thread(mddev); >+ mddev_lock(&mddev); > } > > del_timer_sync(&mddev->safemode_timer); >@@ -5337,7 +5345,9 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) > err = -EBUSY; > goto out; > } >- if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) { >+ if (mddev->pers) >+ __md_stop_writes(mddev); >+ if (!test_bit(MD_STILL_CLOSED, &mddev->flags)) { > /* Someone opened the device since we flushed it > * so page cache could be dirty and it is too late > * to flush. So abort >@@ -5346,8 +5356,6 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) > return -EBUSY; > } > if (mddev->pers) { >- __md_stop_writes(mddev); >- > err = -ENXIO; > if (mddev->ro==1) > goto out; >@@ -5379,7 +5387,9 @@ static int do_md_stop(struct mddev * mddev, int mode, > mutex_unlock(&mddev->open_mutex); > return -EBUSY; > } >- if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) { >+ if (mddev->pers) >+ __md_stop_writes(mddev); >+ if (!test_bit(MD_STILL_CLOSED, &mddev->flags)) { > /* Someone opened the device since we flushed it > * so page cache could be dirty and it is too late > * to flush. So abort >@@ -5391,7 +5401,6 @@ static int do_md_stop(struct mddev * mddev, int mode, > if (mddev->ro) > set_disk_ro(disk, 0); > >- __md_stop_writes(mddev); > __md_stop(mddev); > mddev->queue->merge_bvec_fn = NULL; > mddev->queue->backing_dev_info.congested_fn = NULL; >?韬{.n?????%??檩??w?{.n???{炳盯w???塄}?财??j:+v??????2??璀??摺?囤??z夸z罐?+?????w棹f