Re: [PATCH bpf-next v2] selftests/bpf: fix task_local_storage/exit_creds rcu usage

sdf@xxxxxxxxxx · Wed, 19 Oct 2022 12:57:48 -0700

On 10/19, Delyan Kratunov wrote:
BPF CI has revealed flakiness in the task_local_storage/exit_creds test.
The failure point in CI [1] is that null_ptr_count is equal to 0,
which indicates that the program hasn't run yet. This points to the
kern_sync_rcu (sys_membarrier -> synchronize_rcu underneath) not
waiting sufficiently.

Indeed, synchronize_rcu only waits for read-side sections that started
before the call. If the program execution starts *during* the
synchronize_rcu invocation (due to, say, preemption), the test won't
wait long enough.

As a speculative fix, make the synchronize_rcu calls in a loop until
an explicit run counter has gone up.

   [1]: https://github.com/kernel-patches/bpf/actions/runs/3268263235/jobs/5374940791

Signed-off-by: Delyan Kratunov <delyank@xxxxxx>
---
v1 -> v2:
Explicit loop counter and MAX_SYNC_RCU_CALLS guard.

  .../bpf/prog_tests/task_local_storage.c        | 18 +++++++++++++++---
  .../bpf/progs/task_local_storage_exit_creds.c  |  3 +++
  2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
index 035c263aab1b..99a42a2b6e14 100644
--- a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
+++ b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
@@ -39,7 +39,8 @@ static void test_sys_enter_exit(void)
  static void test_exit_creds(void)
  {
  	struct task_local_storage_exit_creds *skel;
-	int err;
+	int err, run_count, sync_rcu_calls = 0;
+	const int MAX_SYNC_RCU_CALLS = 1000;

  	skel = task_local_storage_exit_creds__open_and_load();
  	if (!ASSERT_OK_PTR(skel, "skel_open_and_load"))
@@ -53,8 +54,19 @@ static void test_exit_creds(void)
  	if (CHECK_FAIL(system("ls > /dev/null")))
  		goto out;

-	/* sync rcu to make sure exit_creds() is called for "ls" */
-	kern_sync_rcu();
+	/* kern_sync_rcu is not enough on its own as the read section we want
+	 * to wait for may start after we enter synchronize_rcu, so our call
+	 * won't wait for the section to finish. Loop on the run counter
+	 * as well to ensure the program has run.
+	 */
+	do {
+		kern_sync_rcu();
+		run_count = __atomic_load_n(&skel->bss->run_count, __ATOMIC_SEQ_CST);
+	} while (run_count == 0 && ++sync_rcu_calls < MAX_SYNC_RCU_CALLS);

Acked-by: Stanislav Fomichev <sdf@xxxxxxxxxx>

Might have been easier to do the following instead?

int sync_rcu_calls = 1000;
do {
} while (run_count == 0 && --sync_rcu_calls);


+
+	ASSERT_NEQ(sync_rcu_calls, MAX_SYNC_RCU_CALLS,
+		   "sync_rcu count too high");
+	ASSERT_NEQ(run_count, 0, "run_count");
  	ASSERT_EQ(skel->bss->valid_ptr_count, 0, "valid_ptr_count");
  	ASSERT_NEQ(skel->bss->null_ptr_count, 0, "null_ptr_count");
  out:
diff --git a/tools/testing/selftests/bpf/progs/task_local_storage_exit_creds.c b/tools/testing/selftests/bpf/progs/task_local_storage_exit_creds.c
index 81758c0aef99..41d88ed222ff 100644
--- a/tools/testing/selftests/bpf/progs/task_local_storage_exit_creds.c
+++ b/tools/testing/selftests/bpf/progs/task_local_storage_exit_creds.c
@@ -14,6 +14,7 @@ struct {
  	__type(value, __u64);
  } task_storage SEC(".maps");

+int run_count = 0;
  int valid_ptr_count = 0;
  int null_ptr_count = 0;

@@ -28,5 +29,7 @@ int BPF_PROG(trace_exit_creds, struct task_struct *task)
  		__sync_fetch_and_add(&valid_ptr_count, 1);
  	else
  		__sync_fetch_and_add(&null_ptr_count, 1);
+
+	__sync_fetch_and_add(&run_count, 1);
  	return 0;
  }
--
2.37.3