Re: [PATCH bpf-next v3 1/1] selftests/bpf: Fix dangling stdout seen by traffic monitor thread

Martin KaFai Lau <martin.lau@xxxxxxxxx> · Fri, 28 Feb 2025 15:30:53 -0800

On 2/27/25 2:23 PM, Amery Hung wrote:
Traffic monitor thread may see dangling stdout as the main thread closes
and reassigns stdout without protection. This happens when the main thread
finishes one subtest and moves to another one in the same netns_new()
scope. Fix it by first consolidating stdout assignment into
stdio_restore_cleanup() and then protecting the use/close/reassignment of
stdout with a lock. The locking in the main thread is always performed
regradless of whether traffic monitor is running or not for simplicity.
It won't have any side-effect.

The issue can be reproduced by running test_progs repeatedly with traffic
monitor enabled:

for ((i=1;i<=100;i++)); do
    ./test_progs -a flow_dissector_skb* -m '*'
done

Signed-off-by: Amery Hung <ameryhung@xxxxxxxxx>
---
  tools/testing/selftests/bpf/network_helpers.c |  8 ++++-
  tools/testing/selftests/bpf/network_helpers.h |  6 ++--
  tools/testing/selftests/bpf/test_progs.c      | 29 +++++++++++++------
  3 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c
index 737a952dcf80..5014fd063d67 100644
--- a/tools/testing/selftests/bpf/network_helpers.c
+++ b/tools/testing/selftests/bpf/network_helpers.c
@@ -743,6 +743,7 @@ struct tmonitor_ctx {
  	pcap_t *pcap;
  	pcap_dumper_t *dumper;
  	pthread_t thread;
+	pthread_mutex_t *stdout_lock;
  	int wake_fd;
  
  	volatile bool done;
@@ -953,6 +954,7 @@ static void *traffic_monitor_thread(void *arg)
  		ifindex = ntohl(ifindex);
  		ptype = packet[10];
  
+		pthread_mutex_lock(ctx->stdout_lock);
  		if (proto == ETH_P_IPV6) {
  			show_ipv6_packet(payload, ifindex, ptype);
  		} else if (proto == ETH_P_IP) {
@@ -967,6 +969,7 @@ static void *traffic_monitor_thread(void *arg)
  			printf("%-7s %-3s Unknown network protocol type 0x%x\n",
  			       ifname, pkt_type_str(ptype), proto);
  		}
+		pthread_mutex_unlock(ctx->stdout_lock);
  	}
  
  	return NULL;
@@ -1055,7 +1058,8 @@ static void encode_test_name(char *buf, size_t len, const char *test_name, const
   * in the give network namespace.
   */
  struct tmonitor_ctx *traffic_monitor_start(const char *netns, const char *test_name,
-					   const char *subtest_name)
+					   const char *subtest_name,
+					   pthread_mutex_t *stdout_lock)

Thinking out loud here and see if the following will be better than passing a 
pthread_mutex_t.

How about passing a print function pointer instead and this function can do what 
is needed before printf(), i.e. lock mutex here. Something like the 
"libbpf_print_fn_t __libbpf_pr" in libbpf.c.

May be the test_progs.c can set it once by calling tm_set_print (similar to 
libbpf_set_print) instead of passing as an arg during every traffic_monitor_start().

wdyt?

  {
  	struct nstoken *nstoken = NULL;
  	struct tmonitor_ctx *ctx;
@@ -1109,6 +1113,8 @@ struct tmonitor_ctx *traffic_monitor_start(const char *netns, const char *test_n
  		goto fail_eventfd;
  	}
  
+	ctx->stdout_lock = stdout_lock;
+
  	r = pthread_create(&ctx->thread, NULL, traffic_monitor_thread, ctx);
  	if (r) {
  		log_err("Failed to create thread");
diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h
index 9f6e05d886c5..b80954eab8d8 100644
--- a/tools/testing/selftests/bpf/network_helpers.h
+++ b/tools/testing/selftests/bpf/network_helpers.h
@@ -251,11 +251,13 @@ struct tmonitor_ctx;
  
  #ifdef TRAFFIC_MONITOR
  struct tmonitor_ctx *traffic_monitor_start(const char *netns, const char *test_name,
-					   const char *subtest_name);
+					   const char *subtest_name,
+					   pthread_mutex_t *stdout_lock);
  void traffic_monitor_stop(struct tmonitor_ctx *ctx);
  #else
  static inline struct tmonitor_ctx *traffic_monitor_start(const char *netns, const char *test_name,
-							 const char *subtest_name)
+							 const char *subtest_name,
+							 pthread_mutex_t *stdout_lock)
  {
  	return NULL;
  }
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index 0cb759632225..db9ea69e8ba1 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -88,7 +88,9 @@ static void stdio_hijack(char **log_buf, size_t *log_cnt)
  #endif
  }
  
-static void stdio_restore_cleanup(void)
+static pthread_mutex_t stdout_lock = PTHREAD_MUTEX_INITIALIZER;
+
+static void stdio_restore_cleanup(bool restore_default)
  {
  #ifdef __GLIBC__
  	if (verbose() && env.worker_id == -1) {
@@ -98,15 +100,25 @@ static void stdio_restore_cleanup(void)
  
  	fflush(stdout);
  
+	pthread_mutex_lock(&stdout_lock);
+
  	if (env.subtest_state) {
  		fclose(env.subtest_state->stdout_saved);
  		env.subtest_state->stdout_saved = NULL;
-		stdout = env.test_state->stdout_saved;
-		stderr = env.test_state->stdout_saved;
  	} else {
  		fclose(env.test_state->stdout_saved);
  		env.test_state->stdout_saved = NULL;
  	}
+
+	if (restore_default) {

Why a new "bool restore_default" is needed? Testing env.subtest_state is not enough?

Thanks for debugging this.

+		stdout = env.stdout_saved;
+		stderr = env.stderr_saved;
+	} else if (env.subtest_state) {
+		stdout = env.test_state->stdout_saved;
+		stderr = env.test_state->stdout_saved;
+	}
+
+	pthread_mutex_unlock(&stdout_lock);
  #endif
  }
  
@@ -121,10 +133,7 @@ static void stdio_restore(void)
  	if (stdout == env.stdout_saved)
  		return;
  
-	stdio_restore_cleanup();
-
-	stdout = env.stdout_saved;
-	stderr = env.stderr_saved;
+	stdio_restore_cleanup(true);
  #endif
  }
  
@@ -541,7 +550,8 @@ void test__end_subtest(void)
  				   test_result(subtest_state->error_cnt,
  					       subtest_state->skipped));
  
-	stdio_restore_cleanup();
+	stdio_restore_cleanup(false);
+
  	env.subtest_state = NULL;
  }
  
@@ -779,7 +789,8 @@ struct netns_obj *netns_new(const char *nsname, bool open)
  	    (env.subtest_state && env.subtest_state->should_tmon)) {
  		test_name = env.test->test_name;
  		subtest_name = env.subtest_state ? env.subtest_state->name : NULL;
-		netns_obj->tmon = traffic_monitor_start(nsname, test_name, subtest_name);
+		netns_obj->tmon = traffic_monitor_start(nsname, test_name, subtest_name,
+							&stdout_lock);
  		if (!netns_obj->tmon) {
  			fprintf(stderr, "Failed to start traffic monitor for %s\n", nsname);
  			goto fail;