The 07reshape5intr test is broke because of below path.
md_reap_sync_thread
-> mddev_unlock
-> md_unregister_thread(&mddev->sync_thread)
And md_check_recovery is triggered by,
mddev_unlock -> md_wakeup_thread(mddev->thread)
then mddev->reshape_position is set to MaxSector in raid5_finish_reshape
since MD_RECOVERY_INTR is cleared in md_check_recovery, which means
feature_map is not set with MD_FEATURE_RESHAPE_ACTIVE and superblock's
reshape_position can't be updated accordingly.
Since the bug which commit 8b48ec23cc51a ("md: don't unregister sync_thread
with reconfig_mutex held") fixed is related with action_store path, other
callers which reap sync_thread didn't need to be changed, let's
1. only unlock mddev in md_reap_sync_thread if caller is action_store,
so the parameter is renamed to reflect the change.
2. save some contexts (MD_RECOVERY_INTR and reshape_position) since they
could be changed by other processes, then restore them after get lock
again.
Fixes: 8b48ec23cc51a ("md: don't unregister sync_thread with reconfig_mutex held")
Reported-by: Logan Gunthorpe <logang@xxxxxxxxxxxx>
Signed-off-by: Guoqing Jiang <guoqing.jiang@xxxxxxxxx>
---
I suppose the previous bug still can be fixed with the change, but it is
better to verify it. Donald, could you help to test the new code?
Thanks,
Guoqing
drivers/md/md.c | 24 ++++++++++++++++++------
drivers/md/md.h | 2 +-
2 files changed, 19 insertions(+), 7 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 5c8efef13881..3387260dd55b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6197,7 +6197,7 @@ static void __md_stop_writes(struct mddev *mddev)
flush_workqueue(md_misc_wq);
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- md_reap_sync_thread(mddev, true);
+ md_reap_sync_thread(mddev, false);
}
del_timer_sync(&mddev->safemode_timer);
@@ -9303,7 +9303,7 @@ void md_check_recovery(struct mddev *mddev)
* ->spare_active and clear saved_raid_disk
*/
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- md_reap_sync_thread(mddev, true);
+ md_reap_sync_thread(mddev, false);
clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
@@ -9338,7 +9338,7 @@ void md_check_recovery(struct mddev *mddev)
goto unlock;
}
if (mddev->sync_thread) {
- md_reap_sync_thread(mddev, true);
+ md_reap_sync_thread(mddev, false);
goto unlock;
}
/* Set RUNNING before clearing NEEDED to avoid
@@ -9411,18 +9411,30 @@ void md_check_recovery(struct mddev *mddev)
}
EXPORT_SYMBOL(md_check_recovery);
-void md_reap_sync_thread(struct mddev *mddev, bool reconfig_mutex_held)
+void md_reap_sync_thread(struct mddev *mddev, bool unlock_mddev)
{
struct md_rdev *rdev;
sector_t old_dev_sectors = mddev->dev_sectors;
+ sector_t old_reshape_position;
bool is_reshaped = false;
+ bool is_interrupted = false;
- if (reconfig_mutex_held)
+ if (unlock_mddev) {
+ old_reshape_position = mddev->reshape_position;
+ if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+ is_interrupted = true;
mddev_unlock(mddev);
+ }
/* resync has finished, collect result */
md_unregister_thread(&mddev->sync_thread);
- if (reconfig_mutex_held)
+ if (unlock_mddev) {
mddev_lock_nointr(mddev);
+ /* restore the previous flag and position */
+ mddev->reshape_position = old_reshape_position;
+ if (is_interrupted)
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ }