On Fri, Nov 1, 2024 at 5:21 PM Eduard Zingerman <eddyz87@xxxxxxxxx> wrote: > > On Fri, 2024-11-01 at 14:18 +0100, Kumar Kartikeya Dwivedi wrote: > > [...] > > > I see that all selftests except one passed. The one that didn't > > appears to have been cancelled after running for an hour, and stalled > > after select_reuseport:OK. > > Looking at the LLVM 18 > > (https://github.com/kernel-patches/bpf/actions/runs/11621768944/job/32366412581?pr=7999) > > run instead of LLVM 17 > > (https://github.com/kernel-patches/bpf/actions/runs/11621768944/job/32366400714?pr=7999, > > which failed), it seems the next test send_signal_tracepoint. > > > > Is this known to be flaky? I'm guessing not and it is probably caused > > by my patch, but just want to confirm before I begin debugging. > > I suspect this is a test send_signal. > It started to hang for me yesterday w/o any apparent reason (on master branch). > I added workaround to avoid stalls, but this does not address the > issue with the test. Workaround follows. Hmm. Puranjay touched it last with extra logic. And before that David Vernet tried to address flakiness in commit 4a54de65964d. Yonghong also noticed lockups in paravirt and added workaround 7015843afc. Your additional timeout/workaround makes sense to me, but would be good to bisect whether Puranjay's change caused it. > --- > > diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c > index ee5a221b4103..4af127945417 100644 > --- a/tools/testing/selftests/bpf/prog_tests/send_signal.c > +++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c > @@ -18,6 +18,38 @@ static void sigusr1_siginfo_handler(int s, siginfo_t *i, void *v) > > static char log_buf[64 * 1024]; > > +static ssize_t read_with_timeout(int fd, void *buf, size_t count) > +{ > + struct timeval tv = { 1, 0 }; > + fd_set fds; > + int err; > + > + FD_ZERO(&fds); > + FD_SET(fd, &fds); > + err = select(fd + 1, &fds, NULL, NULL, &tv); > + if (!ASSERT_GE(err, 0, "read select")) > + return err; > + if (FD_ISSET(fd, &fds)) > + return read(fd, buf, count); > + return -EAGAIN; > +} > + > +static ssize_t write_with_timeout(int fd, void *buf, size_t count) > +{ > + struct timeval tv = { 1, 0 }; > + fd_set fds; > + int err; > + > + FD_ZERO(&fds); > + FD_SET(fd, &fds); > + err = select(fd + 1, NULL, &fds, NULL, &tv); > + if (!ASSERT_GE(err, 0, "write select")) > + return err; > + if (FD_ISSET(fd, &fds)) > + return write(fd, buf, count); > + return -EAGAIN; > +} > + > static void test_send_signal_common(struct perf_event_attr *attr, > bool signal_thread, bool remote) > { > @@ -75,10 +107,10 @@ static void test_send_signal_common(struct perf_event_attr *attr, > } > > /* notify parent signal handler is installed */ > - ASSERT_EQ(write(pipe_c2p[1], buf, 1), 1, "pipe_write"); > + ASSERT_EQ(write_with_timeout(pipe_c2p[1], buf, 1), 1, "pipe_write"); > > /* make sure parent enabled bpf program to send_signal */ > - ASSERT_EQ(read(pipe_p2c[0], buf, 1), 1, "pipe_read"); > + ASSERT_EQ(read_with_timeout(pipe_p2c[0], buf, 1), 1, "pipe_read"); > > /* wait a little for signal handler */ > for (int i = 0; i < 1000000000 && !sigusr1_received; i++) { > @@ -94,10 +126,10 @@ static void test_send_signal_common(struct perf_event_attr *attr, > buf[0] = sigusr1_received; > > ASSERT_EQ(sigusr1_received, 8, "sigusr1_received"); > - ASSERT_EQ(write(pipe_c2p[1], buf, 1), 1, "pipe_write"); > + ASSERT_EQ(write_with_timeout(pipe_c2p[1], buf, 1), 1, "pipe_write"); > > /* wait for parent notification and exit */ > - ASSERT_EQ(read(pipe_p2c[0], buf, 1), 1, "pipe_read"); > + ASSERT_EQ(read_with_timeout(pipe_p2c[0], buf, 1), 1, "pipe_read"); > > /* restore the old priority */ > if (!remote) > @@ -158,7 +190,7 @@ static void test_send_signal_common(struct perf_event_attr *attr, > } > > /* wait until child signal handler installed */ > - ASSERT_EQ(read(pipe_c2p[0], buf, 1), 1, "pipe_read"); > + ASSERT_EQ(read_with_timeout(pipe_c2p[0], buf, 1), 1, "pipe_read"); > > /* trigger the bpf send_signal */ > skel->bss->signal_thread = signal_thread; > @@ -172,7 +204,7 @@ static void test_send_signal_common(struct perf_event_attr *attr, > } > > /* notify child that bpf program can send_signal now */ > - ASSERT_EQ(write(pipe_p2c[1], buf, 1), 1, "pipe_write"); > + ASSERT_EQ(write_with_timeout(pipe_p2c[1], buf, 1), 1, "pipe_write"); > > /* For the remote test, the BPF program is triggered from this > * process but the other process/thread is signaled. > @@ -188,7 +220,7 @@ static void test_send_signal_common(struct perf_event_attr *attr, > } > > /* wait for result */ > - err = read(pipe_c2p[0], buf, 1); > + err = read_with_timeout(pipe_c2p[0], buf, 1); > if (!ASSERT_GE(err, 0, "reading pipe")) > goto disable_pmu; > if (!ASSERT_GT(err, 0, "reading pipe error: size 0")) { > @@ -199,7 +231,7 @@ static void test_send_signal_common(struct perf_event_attr *attr, > ASSERT_EQ(buf[0], 8, "incorrect result"); > > /* notify child safe to exit */ > - ASSERT_EQ(write(pipe_p2c[1], buf, 1), 1, "pipe_write"); > + ASSERT_EQ(write_with_timeout(pipe_p2c[1], buf, 1), 1, "pipe_write"); > > disable_pmu: > close(pmu_fd); >