Re: [PATCH 08/23] common: fix pkill by running test program in a separate session

"Darrick J. Wong" <djwong@xxxxxxxxxx> · Tue, 28 Jan 2025 19:13:13 -0800

On Wed, Jan 29, 2025 at 07:39:22AM +1100, Dave Chinner wrote:
> On Mon, Jan 27, 2025 at 11:23:52PM -0800, Darrick J. Wong wrote:
> > On Tue, Jan 28, 2025 at 03:34:50PM +1100, Dave Chinner wrote:
> > > On Thu, Jan 23, 2025 at 12:16:50PM +1100, Dave Chinner wrote:
> > > 4. /tmp is still shared across all runner instances so all the
> > > 
> > >    concurrent runners dump all their tmp files in /tmp. However, the
> > >    runners no longer have unique PIDs (i.e. check runs as PID 3 in
> > >    all runner instaces). This means using /tmp/tmp.$$ as the
> > >    check/test temp file definition results is instant tmp file name
> > >    collisions and random things in check and tests fail.  check and
> > >    common/preamble have to be converted to use `mktemp` to provide
> > >    unique tempfile name prefixes again.
> > > 
> > > 5. Don't forget to revert the parent /proc mount back to shared
> > >    after check has finished running (or was aborted).
> > > 
> > > I think with this (current prototype patch below), we can use PID
> > > namespaces rather than process session IDs for check-parallel safe
> > > process management.
> > > 
> > > Thoughts?
> > 
> > Was about to go to bed, but can we also start a new mount namespace,
> > create a private (or at least non-global) /tmp to put files into, and
> > then each test instance is isolated from clobbering the /tmpfiles of
> > other ./check instances *and* the rest of the system?
> 
> We probably can. I didn't want to go down that rat hole straight
> away, because then I'd have to make a decision about what to mount
> there. One thing at a time....
> 
> I suspect that I can just use a tmpfs filesystem for it - there's
> heaps of memory available on my test machines and we don't use /tmp
> to hold large files, so that should work fine for me.  However, I'm
> a little concerned about what will happen when testing under memory
> pressure situations if /tmp needs memory to operate correctly.
> 
> I'll have a look at what is needed for private tmpfs /tmp instances
> to work - it should work just fine.
> 
> However, if check-parallel has taught me anything, it is that trying
> to use "should work" features on a modern system tends to mean "this
> is a poorly documented rat-hole that with many dead-ends that will
> be explored before a working solution is found"...

<nod> I'm running an experiment overnight with the following patch to
get rid of the session id grossness.  AFAICT it can also be used by
check-parallel to start its subprocesses in separate pid namespaces,
though I didn't investigate thoroughly.

I'm also not sure it's required for check-helper to unmount the /proc
that it creates; merely exiting seems to clean everything up? <shrug>

I also tried using systemd-nspawn to run fstests inside a "container"
but very quickly discovered that you can't pass block devices to the
container which makes fstests pretty useless for testing real scsi
devices. :/

--D

check: run tests in a private pid/mount namespace

Experiment with running tests in a private pid/mount namespace for
better isolation of the scheduler (check) vs the test (./$seq).  This
also makes it cleaner to adjust the oom score and is a convenient place
to set up the environment before invoking the test.

Signed-off-by: "Darrick J. Wong" <djwong@xxxxxxxxxx>
---
 check        |   30 +++++-------------------------
 check-helper |   31 +++++++++++++++++++++++++++++++
 common/rc    |    8 +++-----
 3 files changed, 39 insertions(+), 30 deletions(-)
 create mode 100755 check-helper

diff --git a/check b/check
index 00ee5af2a31e34..9c70f6f1e10110 100755
--- a/check
+++ b/check
@@ -698,41 +698,21 @@ _adjust_oom_score -500
 # systemd doesn't automatically remove transient scopes that fail to terminate
 # when systemd tells them to terminate (e.g. programs stuck in D state when
 # systemd sends SIGKILL), so we use reset-failed to tear down the scope.
-#
-# Use setsid to run the test program with a separate session id so that we
-# can pkill only the processes started by this test.
 _run_seq() {
-	local cmd=(bash -c "test -w ${OOM_SCORE_ADJ} && echo 250 > ${OOM_SCORE_ADJ}; exec setsid bash ./$seq")
+	local cmd=("./check-helper" "./$seq")
 
 	if [ -n "${HAVE_SYSTEMD_SCOPES}" ]; then
 		local unit="$(systemd-escape "fs$seq").scope"
 		systemctl reset-failed "${unit}" &> /dev/null
-		systemd-run --quiet --unit "${unit}" --scope "${cmd[@]}" &
-		CHILDPID=$!
-		wait
+		systemd-run --quiet --unit "${unit}" --scope "${cmd[@]}"
 		res=$?
-		unset CHILDPID
 		systemctl stop "${unit}" &> /dev/null
 		return "${res}"
 	else
 		# bash won't run the SIGINT trap handler while there are
 		# foreground children in a separate session, so we must run
 		# the test in the background and wait for it.
-		"${cmd[@]}" &
-		CHILDPID=$!
-		wait
-		unset CHILDPID
-	fi
-}
-
-_kill_seq() {
-	if [ -n "$CHILDPID" ]; then
-		# SIGPIPE will kill all the children (including fsstress)
-		# without bash logging fatal signal termination messages to the
-		# console
-		pkill -PIPE --session "$CHILDPID"
-		wait
-		unset CHILDPID
+		"${cmd[@]}"
 	fi
 }
 
@@ -741,9 +721,9 @@ _prepare_test_list
 fstests_start_time="$(date +"%F %T")"
 
 if $OPTIONS_HAVE_SECTIONS; then
-	trap "_kill_seq; _summary; exit \$status" 0 1 2 3 15
+	trap "_summary; exit \$status" 0 1 2 3 15
 else
-	trap "_kill_seq; _wrapup; exit \$status" 0 1 2 3 15
+	trap "_wrapup; exit \$status" 0 1 2 3 15
 fi
 
 function run_section()
diff --git a/check-helper b/check-helper
new file mode 100755
index 00000000000000..2cc8dbe5cfc791
--- /dev/null
+++ b/check-helper
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2025 Oracle.  All Rights Reserved.
+#
+# Try starting things in a private pid/mount namespace with a private /tmp
+# and /proc so that child process trees cannot interfere with each other.
+
+if [ -n "${IN_NSEXEC}" ]; then
+	for path in /proc /tmp; do
+		mount --make-private "$path"
+	done
+	unset IN_NSEXEC
+	mount -t proc proc /proc
+	mount -t tmpfs tmpfs /tmp
+
+	oom_knob="/proc/self/oom_score_adj"
+	test -w "${oom_knob}" && echo 250 > "${oom_knob}"
+
+	# Run the test, but don't let it be pid 1 because that will confuse
+	# the filter functions in common/dump.
+	"$@"
+	exit
+fi
+
+if [ -z "$1" ] || [ "$1" = "--help" ]; then
+	echo "Usage: $0 command [args...]"
+	exit 1
+fi
+
+IN_NSEXEC=1 exec "$(dirname "$0")/src/nsexec" -m -p $0 "$@"
diff --git a/common/rc b/common/rc
index 1c28a2d190f5a0..cc080ecaa9f801 100644
--- a/common/rc
+++ b/common/rc
@@ -33,13 +33,13 @@ _test_sync()
 # Kill only the test processes started by this test
 _pkill()
 {
-	pkill --session 0 "$@"
+	pkill "$@"
 }
 
 # Find only the test processes started by this test
 _pgrep()
 {
-	pgrep --session 0 "$@"
+	pgrep "$@"
 }
 
 # Common execution handling for fsstress invocation.
@@ -2817,11 +2817,9 @@ _require_user_exists()
 	[ "$?" == "0" ] || _notrun "$user user not defined."
 }
 
-# Run all non-root processes in the same session as the root.  Believe it or
-# not, passing $SHELL in this manner works both for "su" and "su -c cmd".
 _su()
 {
-	su --session-command $SHELL "$@"
+	su "$@"
 }
 
 # check if a user exists and is able to execute commands.