From: Darrick J. Wong <djwong@xxxxxxxxxx> Add a new knob, TEST_XFS_SCRUB_REBUILD, that makes it so that we use xfs_scrub to rebuild the ondisk metadata after every test. Signed-off-by: Darrick J. Wong <djwong@xxxxxxxxxx> --- README | 3 ++ common/fuzzy | 1 + common/rc | 2 +- common/xfs | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 82 insertions(+), 1 deletion(-) diff --git a/README b/README index 4c4f22f853..744317625f 100644 --- a/README +++ b/README @@ -191,6 +191,9 @@ Extra XFS specification: to check the filesystem. As of August 2021, xfs_repair finds all filesystem corruptions found by xfs_check, and more, which means that xfs_check is no longer run by default. + - Set TEST_XFS_SCRUB_REBUILD=1 to have _check_xfs_filesystem run xfs_scrub in + "force_repair" mode to rebuild the filesystem, and xfs_repair -n to check + the results of the rebuilding. - xfs_scrub, if present, will always check the test and scratch filesystems if they are still online at the end of the test. It is no longer necessary to set TEST_XFS_SCRUB. diff --git a/common/fuzzy b/common/fuzzy index 14f7fdf03c..d8de55250d 100644 --- a/common/fuzzy +++ b/common/fuzzy @@ -975,6 +975,7 @@ __scratch_xfs_stress_setup_force_rebuild() { # and wait for 30*TIME_FACTOR seconds to see if the filesystem goes down. # Same requirements and arguments as _scratch_xfs_stress_scrub. 
_scratch_xfs_stress_online_repair() { + touch "$RESULT_DIR/.skip_orebuild" # no need to test online rebuild __scratch_xfs_stress_setup_force_rebuild XFS_SCRUB_FORCE_REPAIR=1 _scratch_xfs_stress_scrub "$@" } diff --git a/common/rc b/common/rc index 23530413ec..a1b65f0a7f 100644 --- a/common/rc +++ b/common/rc @@ -1685,7 +1685,7 @@ _require_scratch_nocheck() exit 1 fi fi - rm -f ${RESULT_DIR}/require_scratch + rm -f ${RESULT_DIR}/require_scratch "$RESULT_DIR/.skip_orebuild" } # we need the scratch device and it needs to not be an lvm device diff --git a/common/xfs b/common/xfs index 436569ba28..804047557b 100644 --- a/common/xfs +++ b/common/xfs @@ -692,6 +692,8 @@ _scratch_xfs_mdrestore() # run xfs_check and friends on a FS. _check_xfs_filesystem() { + local can_scrub= + if [ $# -ne 3 ]; then echo "Usage: _check_xfs_filesystem device <logdev>|none <rtdev>|none" 1>&2 exit 1 @@ -726,6 +728,8 @@ _check_xfs_filesystem() # Run online scrub if we can. mntpt="$(_is_dev_mounted $device)" if [ -n "$mntpt" ] && _supports_xfs_scrub "$mntpt" "$device"; then + can_scrub=1 + # Tests can create a scenario in which a call to syncfs() issued # at the end of the execution of the test script would return an # error code. xfs_scrub internally calls syncfs() before @@ -842,6 +846,79 @@ _check_xfs_filesystem() _mount_or_remount_rw "$extra_mount_options" $device $mountpoint fi + # If desired, test the online metadata rebuilding behavior if the + # filesystem was mounted when this function was called. + if [ -n "$TEST_XFS_SCRUB_REBUILD" ] && [ -n "$can_scrub" ] && [ ! -e "$RESULT_DIR/.skip_orebuild" ]; then + orebuild_ok=1 + + # Walk the entire directory tree to load directory blocks into + # memory and populate the dentry cache, which can speed up the + # repairs considerably when the directory tree is very large. + find $mntpt &>/dev/null & + + XFS_SCRUB_FORCE_REPAIR=1 "$XFS_SCRUB_PROG" -v -d $mntpt > $tmp.scrub 2>&1 + if [ $? 
-ne 0 ]; then + if grep -q 'No space left on device' $tmp.scrub; then + # It's not an error if the fs does not have + # enough space to complete a repair. We will + # check everything, though. + echo "*** XFS_SCRUB_FORCE_REPAIR=1 xfs_scrub -v -d ran out of space ***" >> $seqres.full + cat $tmp.scrub >> $seqres.full + echo "*** end xfs_scrub output" >> $seqres.full + else + _log_err "_check_xfs_filesystem: filesystem on $device failed scrub orebuild" + echo "*** XFS_SCRUB_FORCE_REPAIR=1 xfs_scrub -v -d output ***" >> $seqres.full + cat $tmp.scrub >> $seqres.full + echo "*** end xfs_scrub output" >> $seqres.full + ok=0 + orebuild_ok=0 + fi + fi + rm -f $tmp.scrub + + # Clear force_repair because xfs_scrub could have set it + $XFS_IO_PROG -x -c 'inject noerror' "$mntpt" >> $seqres.full + + "$XFS_SCRUB_PROG" -v -d -n $mntpt > $tmp.scrub 2>&1 + if [ $? -ne 0 ]; then + _log_err "_check_xfs_filesystem: filesystem on $device failed scrub orebuild recheck" + echo "*** xfs_scrub -v -d -n output ***" >> $seqres.full + cat $tmp.scrub >> $seqres.full + echo "*** end xfs_scrub output" >> $seqres.full + ok=0 + orebuild_ok=0 + fi + rm -f $tmp.scrub + + mountpoint=`_umount_or_remount_ro $device` + + $XFS_REPAIR_PROG -n $extra_options $extra_log_options $extra_rt_options $device >$tmp.repair 2>&1 + if [ $? 
-ne 0 ]; then + _log_err "_check_xfs_filesystem: filesystem on $device is inconsistent (orebuild-reverify)" + echo "*** xfs_repair -n output ***" >>$seqres.full + cat $tmp.repair >>$seqres.full + echo "*** end xfs_repair output" >>$seqres.full + + ok=0 + orebuild_ok=0 + fi + rm -f $tmp.repair + + if [ $ok -eq 0 ]; then + echo "*** mount output ***" >>$seqres.full + _mount >>$seqres.full + echo "*** end mount output" >>$seqres.full + elif [ "$type" = "xfs" ]; then + _mount_or_remount_rw "$extra_mount_options" $device $mountpoint + fi + + if [ "$orebuild_ok" -ne 1 ] && [ "$DUMP_CORRUPT_FS" = "1" ]; then + local flatdev="$(basename "$device")" + _xfs_metadump "$seqres.$flatdev.orebuild.md" "$device" \ + "$logdev" compress >> $seqres.full + fi + fi + if [ $ok -eq 0 ]; then status=1 if [ "$iam" != "check" ]; then