[PATCH 10/16] fuzzy: abort scrub stress testing if the scratch fs went down

"Darrick J. Wong" <djwong@xxxxxxxxxx> · Fri, 30 Dec 2022 14:12:54 -0800

From: Darrick J. Wong <djwong@xxxxxxxxxx>

There's no point in continuing a stress test of online fsck if the
scratch filesystem goes down.  We can't query that kind of state
directly, so as a proxy we stat the scratch mountpoint and interpret any
error return as a sign that the fs is down.  When that happens, we stop
running the stress tests and print a warning once the loop has finished.

Signed-off-by: Darrick J. Wong <djwong@xxxxxxxxxx>
---
 common/fuzzy |   13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/common/fuzzy b/common/fuzzy
index 6519d5c1e2..f1bc2dc756 100644
--- a/common/fuzzy
+++ b/common/fuzzy
@@ -338,10 +338,17 @@ __stress_scrub_filter_output() {
 		    -e '/No space left on device/d'
 }
 
+# Decide if the scratch filesystem is still alive.
+__stress_scrub_scratch_alive() {
+	# If we can't stat the scratch filesystem, there's a reasonably good
+	# chance that the fs shut down, which is not good.
+	stat "$SCRATCH_MNT" &>/dev/null
+}
+
 # Decide if we want to keep running stress tests.  The first argument is the
 # stop time, and second argument is the path to the sentinel file.
 __stress_scrub_running() {
-	test -e "$2" && test "$(date +%s)" -lt "$1"
+	test -e "$2" && test "$(date +%s)" -lt "$1" && __stress_scrub_scratch_alive
 }
 
 # Run fs freeze and thaw in a tight loop.
@@ -486,6 +493,10 @@ _scratch_xfs_stress_scrub() {
 	done
 	_scratch_xfs_stress_scrub_cleanup
 
+	# Warn the user if we think the scratch filesystem went down.
+	__stress_scrub_scratch_alive || \
+		echo "Did the scratch filesystem die?"
+
 	echo "Loop finished at $(date)" >> $seqres.full
 }