Re: [PATCH bpf-next v4 9/9] selftests/bpf: Test switching TCP Congestion Control algorithms.

Andrii Nakryiko <andrii.nakryiko@xxxxxxxxx> · Wed, 8 Mar 2023 10:43:09 -0800

On Wed, Mar 8, 2023 at 10:10 AM Kui-Feng Lee <sinquersw@xxxxxxxxx> wrote:
>
>
>
> On 3/8/23 09:18, Andrii Nakryiko wrote:
> > On Wed, Mar 8, 2023 at 7:58 AM Kui-Feng Lee <sinquersw@xxxxxxxxx> wrote:
> >>
> >>
> >>
> >> On 3/7/23 17:10, Andrii Nakryiko wrote:
> >>> On Tue, Mar 7, 2023 at 3:34 PM Kui-Feng Lee <kuifeng@xxxxxxxx> wrote:
> >>>>
> >>>> Create a pair of sockets that utilize the congestion control algorithm
> >>>> under a particular name. Then switch up this congestion control
> >>>> algorithm to another implementation and check whether newly created
> >>>> connections using the same cc name now run the new implementation.
> >>>>
> >>>> Signed-off-by: Kui-Feng Lee <kuifeng@xxxxxxxx>
> >>>> ---
> >>>>    .../selftests/bpf/prog_tests/bpf_tcp_ca.c     | 38 ++++++++++++
> >>>>    .../selftests/bpf/progs/tcp_ca_update.c       | 62 +++++++++++++++++++
> >>>>    2 files changed, 100 insertions(+)
> >>>>    create mode 100644 tools/testing/selftests/bpf/progs/tcp_ca_update.c
> >>>>
> >>>> diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
> >>>> index e980188d4124..caaa9175ee36 100644
> >>>> --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
> >>>> +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
> >>>> @@ -8,6 +8,7 @@
> >>>>    #include "bpf_dctcp.skel.h"
> >>>>    #include "bpf_cubic.skel.h"
> >>>>    #include "bpf_tcp_nogpl.skel.h"
> >>>> +#include "tcp_ca_update.skel.h"
> >>>>    #include "bpf_dctcp_release.skel.h"
> >>>>    #include "tcp_ca_write_sk_pacing.skel.h"
> >>>>    #include "tcp_ca_incompl_cong_ops.skel.h"
> >>>> @@ -381,6 +382,41 @@ static void test_unsupp_cong_op(void)
> >>>>           libbpf_set_print(old_print_fn);
> >>>>    }
> >>>>
> >>>> +static void test_update_ca(void)
> >>>> +{
> >>>> +       struct tcp_ca_update *skel;
> >>>> +       struct bpf_link *link;
> >>>> +       int saved_ca1_cnt;
> >>>> +       int err;
> >>>> +
> >>>> +       skel = tcp_ca_update__open();
> >>>> +       if (!ASSERT_OK_PTR(skel, "open"))
> >>>> +               return;
> >>>> +
> >>>> +       err = tcp_ca_update__load(skel);
> >>>
> >>> tcp_ca_update__open_and_load()
> >>>
> >>>> +       if (!ASSERT_OK(err, "load")) {
> >>>> +               tcp_ca_update__destroy(skel);
> >>>> +               return;
> >>>> +       }
> >>>> +
> >>>> +       link = bpf_map__attach_struct_ops(skel->maps.ca_update_1);
> >>>
> >>> I think it's time to generate link holder for each struct_ops map to
> >>> the BPF skeleton, and support auto-attach of struct_ops skeleton.
> >>> Please do that as a follow up, once this patch set lands.
> >>
> >> Got it.
> >>
> >>>
> >>>> +       ASSERT_OK_PTR(link, "attach_struct_ops");
> >>>> +
> >>>> +       do_test("tcp_ca_update", NULL);
> >>>> +       saved_ca1_cnt = skel->bss->ca1_cnt;
> >>>> +       ASSERT_GT(saved_ca1_cnt, 0, "ca1_ca1_cnt");
> >>>> +
> >>>> +       err = bpf_link__update_map(link, skel->maps.ca_update_2);
> >>>> +       ASSERT_OK(err, "update_struct_ops");
> >>>> +
> >>>> +       do_test("tcp_ca_update", NULL);
> >>>> +       ASSERT_EQ(skel->bss->ca1_cnt, saved_ca1_cnt, "ca2_ca1_cnt");
> >>>> +       ASSERT_GT(skel->bss->ca2_cnt, 0, "ca2_ca2_cnt");
> >>>
> >>> how do we know that struct_ops programs were triggered? what
> >>> guarantees that? if nothing, we are just adding another flaky
> >>> networking test
> >>
> >> When an ack is received, cong_control of ca_update_1 and ca_update_2
> >> will be called if they are activated.  By checking ca1_cnt & ca2_cnt, we
> >> know which one is activated.  Here, we check if the ca1_cnt keeps the
> >> same and ca2_cnt increase to make that ca_update_2 have replaced
> >> ca_update_1.
> >
> > I just don't see anything in the test ensuring that ack is
> > sent/received, so it seems like we are relying on some background
> > system activity and proper timing (unless I miss something, which is
> > why I'm asking), so this is fragile, as in CI environment timings and
> > background activity would be very different and unpredictable, causing
> > flakiness of the test
>
>
> The do_test() function creates two sockets to form a direct connection
> that must receive at least one acknowledgment packet for the sockets to
> progress into an ESTABLISHED state.  If they don't, that means it fails
> to establish a connection.

yeah, my bad, I *completely* missed `do_test("tcp_ca_update", NULL)`
(even though I went specifically looking for something like that).
It's all good here.

>
> >
> >>
> >>>
> >>>> +
> >>>> +       bpf_link__destroy(link);
> >>>> +       tcp_ca_update__destroy(skel);
> >>>> +}
> >>>> +
> >>>>    void test_bpf_tcp_ca(void)
> >>>>    {
> >>>>           if (test__start_subtest("dctcp"))
> >>>> @@ -399,4 +435,6 @@ void test_bpf_tcp_ca(void)
> >>>>                   test_incompl_cong_ops();
> >>>>           if (test__start_subtest("unsupp_cong_op"))
> >>>>                   test_unsupp_cong_op();
> >>>> +       if (test__start_subtest("update_ca"))
> >>>> +               test_update_ca();
> >>>>    }
> >>>> diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_update.c b/tools/testing/selftests/bpf/progs/tcp_ca_update.c
> >>>> new file mode 100644
> >>>> index 000000000000..36a04be95df5
> >>>> --- /dev/null
> >>>> +++ b/tools/testing/selftests/bpf/progs/tcp_ca_update.c
> >>>> @@ -0,0 +1,62 @@
> >>>> +// SPDX-License-Identifier: GPL-2.0
> >>>> +
> >>>> +#include "vmlinux.h"
> >>>> +
> >>>> +#include <bpf/bpf_helpers.h>
> >>>> +#include <bpf/bpf_tracing.h>
> >>>> +
> >>>> +char _license[] SEC("license") = "GPL";
> >>>> +
> >>>> +int ca1_cnt = 0;
> >>>> +int ca2_cnt = 0;
> >>>> +
> >>>> +#define USEC_PER_SEC 1000000UL
> >>>> +
> >>>> +#define min(a, b) ((a) < (b) ? (a) : (b))
> >>>> +
> >>>> +static inline struct tcp_sock *tcp_sk(const struct sock *sk)
> >>>> +{
> >>>> +       return (struct tcp_sock *)sk;
> >>>> +}
> >>>> +
> >>>> +SEC("struct_ops/ca_update_1_cong_control")
> >>>> +void BPF_PROG(ca_update_1_cong_control, struct sock *sk,
> >>>> +             const struct rate_sample *rs)
> >>>> +{
> >>>> +       ca1_cnt++;
> >>>> +}
> >>>> +
> >>>> +SEC("struct_ops/ca_update_2_cong_control")
> >>>> +void BPF_PROG(ca_update_2_cong_control, struct sock *sk,
> >>>> +             const struct rate_sample *rs)
> >>>> +{
> >>>> +       ca2_cnt++;
> >>>> +}
> >>>> +
> >>>> +SEC("struct_ops/ca_update_ssthresh")
> >>>> +__u32 BPF_PROG(ca_update_ssthresh, struct sock *sk)
> >>>> +{
> >>>> +       return tcp_sk(sk)->snd_ssthresh;
> >>>> +}
> >>>> +
> >>>> +SEC("struct_ops/ca_update_undo_cwnd")
> >>>> +__u32 BPF_PROG(ca_update_undo_cwnd, struct sock *sk)
> >>>> +{
> >>>> +       return tcp_sk(sk)->snd_cwnd;
> >>>> +}
> >>>> +
> >>>> +SEC(".struct_ops.link")
> >>>> +struct tcp_congestion_ops ca_update_1 = {
> >>>> +       .cong_control = (void *)ca_update_1_cong_control,
> >>>> +       .ssthresh = (void *)ca_update_ssthresh,
> >>>> +       .undo_cwnd = (void *)ca_update_undo_cwnd,
> >>>> +       .name = "tcp_ca_update",
> >>>> +};
> >>>> +
> >>>> +SEC(".struct_ops.link")
> >>>> +struct tcp_congestion_ops ca_update_2 = {
> >>>> +       .cong_control = (void *)ca_update_2_cong_control,
> >>>> +       .ssthresh = (void *)ca_update_ssthresh,
> >>>> +       .undo_cwnd = (void *)ca_update_undo_cwnd,
> >>>> +       .name = "tcp_ca_update",
> >>>> +};
> >>>
> >>> please add a test where you combine both .struct_ops and
> >>> .struct_ops.link, it's an obvious potentially problematic combination
> >>>
> >>> as I mentioned in previous patches, let's also have a negative test
> >>> where bpf_link__update_map() fails
> >>
> >> Sure
> >>
> >>>
> >>>> --
> >>>> 2.34.1
> >>>>