From: Darrick J. Wong <djwong@xxxxxxxxxx> Make sure the daemon in charge of self healing xfs actually does what it says it does. Signed-off-by: "Darrick J. Wong" <djwong@xxxxxxxxxx> --- common/config | 6 ++++ common/systemd | 9 +++++ common/xfs | 16 ++++++++++ tests/xfs/1882 | 64 ++++++++++++++++++++++++++++++++++++++ tests/xfs/1882.out | 2 + tests/xfs/1883 | 75 +++++++++++++++++++++++++++++++++++++++++++++ tests/xfs/1883.out | 2 + tests/xfs/1884 | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++ tests/xfs/1884.out | 2 + 9 files changed, 263 insertions(+) create mode 100755 tests/xfs/1882 create mode 100644 tests/xfs/1882.out create mode 100755 tests/xfs/1883 create mode 100644 tests/xfs/1883.out create mode 100755 tests/xfs/1884 create mode 100644 tests/xfs/1884.out diff --git a/common/config b/common/config index fcff0660b05a97..2b3f946f3d308d 100644 --- a/common/config +++ b/common/config @@ -166,6 +166,12 @@ export XFS_ADMIN_PROG="$(type -P xfs_admin)" export XFS_GROWFS_PROG=$(type -P xfs_growfs) export XFS_SPACEMAN_PROG="$(type -P xfs_spaceman)" export XFS_SCRUB_PROG="$(type -P xfs_scrub)" +XFS_SCRUBBED_PROG="$(type -P xfs_scrubbed)" +# Normally the scrubbed daemon is installed in libexec, which is not in PATH, so fall back to that copy when the PATH lookup finds nothing +if [ -z "$XFS_SCRUBBED_PROG" ] && [ -e /usr/libexec/xfs_scrubbed ]; then + XFS_SCRUBBED_PROG=/usr/libexec/xfs_scrubbed +fi +export XFS_SCRUBBED_PROG export XFS_PARALLEL_REPAIR_PROG="$(type -P xfs_prepair)" export XFS_PARALLEL_REPAIR64_PROG="$(type -P xfs_prepair64)" export __XFSDUMP_PROG="$(type -P xfsdump)" diff --git a/common/systemd b/common/systemd index b2e24f267b2d93..8366d4cba39d85 100644 --- a/common/systemd +++ b/common/systemd @@ -71,3 +71,12 @@ _systemd_unit_status() { _systemd_installed || return 1 systemctl status "$1" } + +# Start a systemd unit +_systemd_unit_start() { + systemctl start "$1" +} +# Stop a running systemd unit +_systemd_unit_stop() { + systemctl stop "$1" +} diff --git a/common/xfs b/common/xfs index b9e897e0e8839a..b4f69403e7396e 
100644 --- a/common/xfs +++ b/common/xfs @@ -2224,3 +2224,19 @@ _scratch_find_rt_metadir_entry() { return 1 } + +# Run the xfs_scrubbed self healing daemon +_scratch_xfs_scrubbed() { + local scrubbed_args=() + local daemon_dir + daemon_dir=$(dirname "$XFS_SCRUBBED_PROG") + + # If we're being run from a development branch, we might need to find + # the schema file on our own. + local maybe_schema="$daemon_dir/../libxfs/xfs_healthmon.schema.json" + if [ -f "$maybe_schema" ]; then + scrubbed_args+=(--event-schema "$maybe_schema") + fi + + $XFS_SCRUBBED_PROG "${scrubbed_args[@]}" "$@" $SCRATCH_MNT +} diff --git a/tests/xfs/1882 b/tests/xfs/1882 new file mode 100755 index 00000000000000..b6a8bd545dbcf5 --- /dev/null +++ b/tests/xfs/1882 @@ -0,0 +1,64 @@ +#! /bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2024-2025 Oracle. All Rights Reserved. +# +# FS QA Test 1882 +# +# Make sure that xfs_scrubbed correctly handles all the reports that it gets +# from the kernel. We simulate this by using the --everything mode so we get +# all the events, not just the sickness reports. +# +. ./common/preamble +_begin_fstest auto selfhealing + +. ./common/filter +. ./common/fuzzy +. ./common/systemd +. ./common/populate + +_require_scrub +_require_xfs_io_command "scrub" # online check support +_require_command "$XFS_SCRUBBED_PROG" "xfs_scrubbed" +_require_scratch + +# Does this fs support health monitoring? +_scratch_mkfs >> $seqres.full +_scratch_mount + +_scratch_xfs_scrubbed --check || \ + _notrun "health monitoring not supported on this kernel" +_scratch_xfs_scrubbed --require-validation --check && \ + _notrun "skipping this test in favor of the one that does json validation" +_scratch_unmount + +# Create a sample fs with all the goodies +_scratch_populate_cached nofill &>> $seqres.full +_scratch_mount + +# If the system xfsprogs has self healing enabled, we need to shut down the +# daemon before we try to capture things. 
+if _systemd_is_running; then + scratch_path=$(systemd-escape --path "$SCRATCH_MNT") + _systemd_unit_stop "xfs_scrubbed@${scratch_path}" &>> $seqres.full +fi + +# Start the health monitor, have it log everything +_scratch_xfs_scrubbed --everything --log > $tmp.scrubbed & +scrubbed_pid=$! +sleep 1 + +# Run scrub to make some noise +_scratch_scrub -b -n >> $seqres.full + +# Unmount fs to kill scrubbed, then wait for it to finish +while ! _scratch_unmount &>/dev/null; do + sleep 0.5 +done +kill $scrubbed_pid +wait + +cat $tmp.scrubbed >> $seqres.full + +echo Silence is golden +status=0 +exit diff --git a/tests/xfs/1882.out b/tests/xfs/1882.out new file mode 100644 index 00000000000000..9b31ccb735cabd --- /dev/null +++ b/tests/xfs/1882.out @@ -0,0 +1,2 @@ +QA output created by 1882 +Silence is golden diff --git a/tests/xfs/1883 b/tests/xfs/1883 new file mode 100755 index 00000000000000..9bba989386b37e --- /dev/null +++ b/tests/xfs/1883 @@ -0,0 +1,75 @@ +#! /bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2024-2025 Oracle. All Rights Reserved. +# +# FS QA Test 1883 +# +# Make sure that xfs_scrubbed correctly validates the json events that it gets +# from the kernel. We simulate this by using the --everything mode so we get +# all the events, not just the sickness reports. +# +. ./common/preamble +_begin_fstest auto selfhealing + +. ./common/filter +. ./common/fuzzy +. ./common/systemd +. ./common/populate + +_require_scrub +_require_xfs_io_command "scrub" # online check support +_require_command "$XFS_SCRUBBED_PROG" "xfs_scrubbed" +_require_scratch + +# Does this fs support health monitoring? 
+_scratch_mkfs >> $seqres.full +_scratch_mount + +_scratch_xfs_scrubbed --require-validation --check || \ + _notrun "health monitoring with validation not supported on this kernel" +_scratch_unmount + +# Create a sample fs with all the goodies +_scratch_populate_cached nofill &>> $seqres.full +_scratch_mount + +# If the system xfsprogs has self healing enabled, we need to shut down the +# daemon before we try to capture things. +if _systemd_is_running; then + scratch_path=$(systemd-escape --path "$SCRATCH_MNT") + _systemd_unit_stop "xfs_scrubbed@${scratch_path}" &>> $seqres.full +fi + +# Start the health monitor, have it validate everything +_scratch_xfs_scrubbed --require-validation --everything --debug-fast --log &> $tmp.scrubbed & +scrubbed_pid=$! +sleep 1 + +# Run scrub to make some noise +_scratch_scrub -b -n >> $seqres.full + +# Wait for up to 60 seconds for the log file to stop growing +old_logsz= +new_logsz=$(stat -c '%s' $tmp.scrubbed) +for ((i = 0; i < 60; i++)); do + test "$old_logsz" = "$new_logsz" && break + old_logsz="$new_logsz" + sleep 1 + new_logsz=$(stat -c '%s' $tmp.scrubbed) +done + +# Unmount fs to kill scrubbed, then wait for it to finish +while ! _scratch_unmount &>/dev/null; do + sleep 0.5 +done +kill $scrubbed_pid +wait + +# Look for schema validation errors +grep -q 'not valid under any of the given schemas' $tmp.scrubbed && \ + echo "Should not have found schema validation errors" +cat $tmp.scrubbed >> $seqres.full + +echo Silence is golden +status=0 +exit diff --git a/tests/xfs/1883.out b/tests/xfs/1883.out new file mode 100644 index 00000000000000..bc9c390c778b6e --- /dev/null +++ b/tests/xfs/1883.out @@ -0,0 +1,2 @@ +QA output created by 1883 +Silence is golden diff --git a/tests/xfs/1884 b/tests/xfs/1884 new file mode 100755 index 00000000000000..fc6e0a48372fda --- /dev/null +++ b/tests/xfs/1884 @@ -0,0 +1,87 @@ +#! /bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2024-2025 Oracle. All Rights Reserved. 
+# +# FS QA Test 1884 +# +# Ensure that autonomous self healing fixes the filesystem correctly. +# +. ./common/preamble +_begin_fstest auto selfhealing + +. ./common/filter +. ./common/fuzzy +. ./common/systemd + +_require_scrub +_require_xfs_io_command "repair" # online repair support +_require_xfs_db_command "blocktrash" +_require_command "$XFS_SCRUBBED_PROG" "xfs_scrubbed" +_require_scratch + +_scratch_mkfs >> $seqres.full +_scratch_mount + +_xfs_has_feature $SCRATCH_MNT parent || \ + _notrun "parent pointers required to test directory auto-repair" +_scratch_xfs_scrubbed --repair --check || \ + _notrun "health monitoring with repair not supported on this kernel" + +# Create a largeish directory +dblksz=$(_xfs_get_dir_blocksize "$SCRATCH_MNT") +echo testdata > $SCRATCH_MNT/a +mkdir -p "$SCRATCH_MNT/some/victimdir" +for ((i = 0; i < (dblksz / 255); i++)); do + fname="$(printf "%0255d" "$i")" + ln $SCRATCH_MNT/a $SCRATCH_MNT/some/victimdir/$fname +done + +# Did we get at least two dir blocks? +dirsize=$(stat -c '%s' $SCRATCH_MNT/some/victimdir) +test "$dirsize" -gt "$dblksz" || echo "failed to create two-block directory" + +# Break the directory, remount filesystem +_scratch_unmount +_scratch_xfs_db -x \ + -c 'path /some/victimdir' \ + -c 'bmap' \ + -c 'dblock 1' \ + -c 'blocktrash -z -0 -o 0 -x 2048 -y 2048 -n 2048' >> $seqres.full +_scratch_mount + +# If the system xfsprogs has self healing enabled, we need to shut down the +# daemon before we try to capture things. +if _systemd_is_running; then + svcname="xfs_scrubbed@$(systemd-escape --path "$SCRATCH_MNT")" + echo "$svcname: $(systemctl is-active "$svcname")" >> $seqres.full + _systemd_unit_stop "$svcname" &>> $seqres.full +fi + +# Start the health monitor, have it repair everything reported corrupt +_scratch_xfs_scrubbed --repair --log > $tmp.scrubbed & +scrubbed_pid=$! 
+sleep 1 + +# Access the broken directory to trigger a repair, then poll the directory +# for 5 seconds to see if it gets fixed without us needing to intervene. +ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err +_filter_scratch < $tmp.err +try=0 +while [ $try -lt 50 ] && grep -q 'Structure needs cleaning' $tmp.err; do + echo "try $try saw corruption" >> $seqres.full + sleep 0.1 + ls $SCRATCH_MNT/some/victimdir > /dev/null 2> $tmp.err + try=$((try + 1)) +done +_filter_scratch < $tmp.err + +# Unmount fs to kill scrubbed, then wait for it to finish. +while ! _scratch_unmount &>/dev/null; do + sleep 0.5 +done +kill $scrubbed_pid +wait +cat $tmp.scrubbed >> $seqres.full + +status=0 +exit diff --git a/tests/xfs/1884.out b/tests/xfs/1884.out new file mode 100644 index 00000000000000..929e33da01f92c --- /dev/null +++ b/tests/xfs/1884.out @@ -0,0 +1,2 @@ +QA output created by 1884 +ls: reading directory 'SCRATCH_MNT/some/victimdir': Structure needs cleaning