On Tue, Aug 17, 2021 at 10:20 AM Yonghong Song <yhs@xxxxxx> wrote: > > libbpf CI has reported send_signal test is flaky although > I am not able to reproduce it in my local environment. > But I am able to reproduce with on-demand libbpf CI ([1]). > > Through code analysis, the following is possible reason. > The failed subtest runs bpf program in softirq environment. > Since bpf_send_signal() only sends to a fork of "test_progs" > process. If the underlying current task is > not "test_progs", bpf_send_signal() will not be triggered > and the subtest will fail. > > To reduce the chances where the underlying process is not > the intended one, this patch boosted scheduling priority to > -20 (highest allowed by setpriority() call). And I did > 10 runs with on-demand libbpf CI with this patch and I > didn't observe any failures. > > [1] https://github.com/libbpf/libbpf/actions/workflows/ondemand.yml > > Signed-off-by: Yonghong Song <yhs@xxxxxx> > --- > .../selftests/bpf/prog_tests/send_signal.c | 33 +++++++++++++++---- > .../bpf/progs/test_send_signal_kern.c | 3 +- > 2 files changed, 28 insertions(+), 8 deletions(-) > > diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c > index 41e158ae888e..0701c97456da 100644 > --- a/tools/testing/selftests/bpf/prog_tests/send_signal.c > +++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c > @@ -1,5 +1,7 @@ > // SPDX-License-Identifier: GPL-2.0 > #include <test_progs.h> > +#include <sys/time.h> > +#include <sys/resource.h> > #include "test_send_signal_kern.skel.h" > > int sigusr1_received = 0; > @@ -10,7 +12,7 @@ static void sigusr1_handler(int signum) > } > > static void test_send_signal_common(struct perf_event_attr *attr, > - bool signal_thread) > + bool signal_thread, bool allow_skip) > { > struct test_send_signal_kern *skel; > int pipe_c2p[2], pipe_p2c[2]; > @@ -37,12 +39,23 @@ static void test_send_signal_common(struct perf_event_attr *attr, > } > > if (pid == 0) { > + int old_prio; > + > /* install signal handler and notify parent */ > signal(SIGUSR1, sigusr1_handler); > > close(pipe_c2p[0]); /* close read */ > close(pipe_p2c[1]); /* close write */ > > + /* boost with a high priority so we got a higher chance > + * that if an interrupt happens, the underlying task > + * is this process. > + */ > + errno = 0; > + old_prio = getpriority(PRIO_PROCESS, 0); > + ASSERT_OK(errno, "getpriority"); > + ASSERT_OK(setpriority(PRIO_PROCESS, 0, -20), "setpriority"); > + > /* notify parent signal handler is installed */ > ASSERT_EQ(write(pipe_c2p[1], buf, 1), 1, "pipe_write"); > > @@ -58,6 +71,9 @@ static void test_send_signal_common(struct perf_event_attr *attr, > /* wait for parent notification and exit */ > ASSERT_EQ(read(pipe_p2c[0], buf, 1), 1, "pipe_read"); > > + /* restore the old priority */ > + ASSERT_OK(setpriority(PRIO_PROCESS, 0, old_prio), "setpriority"); > + > close(pipe_c2p[1]); > close(pipe_p2c[0]); > exit(0); > @@ -110,11 +126,16 @@ static void test_send_signal_common(struct perf_event_attr *attr, > goto disable_pmu; > } > > - ASSERT_EQ(buf[0], '2', "incorrect result"); > - > /* notify child safe to exit */ > ASSERT_EQ(write(pipe_p2c[1], buf, 1), 1, "pipe_write"); > > + if (skel->bss->status == 0 && allow_skip) { > + printf("%s:SKIP\n", __func__); > + test__skip(); > + } else if (skel->bss->status != 1) { > + ASSERT_EQ(buf[0], '2', "incorrect result"); > + } > + > disable_pmu: > close(pmu_fd); > destroy_skel: > @@ -127,7 +148,7 @@ static void test_send_signal_common(struct perf_event_attr *attr, > > static void test_send_signal_tracepoint(bool signal_thread) > { > - test_send_signal_common(NULL, signal_thread); > + test_send_signal_common(NULL, signal_thread, false); > } > > static void test_send_signal_perf(bool signal_thread) > @@ -138,7 +159,7 @@ static void test_send_signal_perf(bool signal_thread) > .config = PERF_COUNT_SW_CPU_CLOCK, > }; > > - test_send_signal_common(&attr, signal_thread); > + test_send_signal_common(&attr, signal_thread, true); > } > > static void test_send_signal_nmi(bool signal_thread) > @@ -167,7 +188,7 @@ static void test_send_signal_nmi(bool signal_thread) > close(pmu_fd); > } > > - test_send_signal_common(&attr, signal_thread); > + test_send_signal_common(&attr, signal_thread, true); > } > > void test_send_signal(void) > diff --git a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c > index b4233d3efac2..59c05c422bbd 100644 > --- a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c > +++ b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c > @@ -18,8 +18,7 @@ static __always_inline int bpf_send_signal_test(void *ctx) > ret = bpf_send_signal_thread(sig); > else > ret = bpf_send_signal(sig); > - if (ret == 0) > - status = 1; > + status = (ret == 0) ? 1 : 2; This doesn't make sense to me. status == 0 is the default value, it will stay 0 even if nothing is triggered, no BPF program is called, etc. If we are doing the skipping of the test logic (which I'd honestly just not do right now to see if we actually fixed the test), then I'd set status = 3 for the case when signal was triggered, but the current task is not test_progs. And only skip test if we get status 3. That is, status 0 and status 2 are bad (either not triggered, or some error when sending signal), 1 is OK, 3 is SKIP. But really, skipping a test that we couldn't randomly run doesn't feel good. Can you please leave the priority boosting part and drop the skipping part for now? > } > > return 0; > -- > 2.30.2 >