On 2022-05-11 03:10, Andrii Nakryiko wrote:
On Tue, May 10, 2022 at 12:21 PM Maxim Mikityanskiy <maximmi@xxxxxxxxxx> wrote:
On 2022-05-07 00:34, Andrii Nakryiko wrote:
On Tue, May 3, 2022 at 10:15 AM Maxim Mikityanskiy <maximmi@xxxxxxxxxx> wrote:
This commit adds selftests for the new BPF helpers:
bpf_tcp_raw_{gen,check}_syncookie_ipv{4,6}.
xdp_synproxy_kern.c is a BPF program that generates SYN cookies on
allowed TCP ports and sends SYNACKs to clients, accelerating synproxy
iptables module.
xdp_synproxy.c is a userspace control application that allows to
configure the following options in runtime: list of allowed ports, MSS,
window scale, TTL.
A selftest is added to prog_tests that leverages the above programs to
test the functionality of the new helpers.
Signed-off-by: Maxim Mikityanskiy <maximmi@xxxxxxxxxx>
Reviewed-by: Tariq Toukan <tariqt@xxxxxxxxxx>
---
selftests should use "selftests/bpf: " subject prefix, not "bpf: ",
please update so it's more obvious that this patch touches selftests
and not kernel-side BPF functionality.
tools/testing/selftests/bpf/.gitignore | 1 +
tools/testing/selftests/bpf/Makefile | 5 +-
.../selftests/bpf/prog_tests/xdp_synproxy.c | 109 +++
.../selftests/bpf/progs/xdp_synproxy_kern.c | 750 ++++++++++++++++++
tools/testing/selftests/bpf/xdp_synproxy.c | 418 ++++++++++
5 files changed, 1281 insertions(+), 2 deletions(-)
create mode 100644 tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c
create mode 100644 tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c
create mode 100644 tools/testing/selftests/bpf/xdp_synproxy.c
diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index 595565eb68c0..ca2f47f45670 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -43,3 +43,4 @@ test_cpp
*.tmp
xdpxceiver
xdp_redirect_multi
+xdp_synproxy
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index bafdc5373a13..8ae602843b16 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -82,9 +82,9 @@ TEST_PROGS_EXTENDED := with_addr.sh \
TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \
flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \
test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \
- xdpxceiver xdp_redirect_multi
+ xdpxceiver xdp_redirect_multi xdp_synproxy
-TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read
+TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read $(OUTPUT)/xdp_synproxy
# Emit succinct information message describing current building step
# $1 - generic step name (e.g., CC, LINK, etc);
@@ -500,6 +500,7 @@ TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c trace_helpers.c \
cap_helpers.c
TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read $(OUTPUT)/bpf_testmod.ko \
$(OUTPUT)/liburandom_read.so \
+ $(OUTPUT)/xdp_synproxy \
this is the right way to make external binary available to test_progs
flavors, but is there anything inherently requiring external binary
instead of having a helper function doing the same? urandom_read has
to be a separate binary.
If you remember v1, it used to be a sample, but I was asked to convert
it to a selftest, because samples are deprecated. The intention of
having this separate binary is to have a sample reference implementation
that can be used in real-world scenarios with minor or no changes.
Ok, I'll let others chime in if they care enough about this. Selftests
are first and foremost a test and not an almost production-ready
collection of tools, but fine by me.
ima_setup.sh \
$(wildcard progs/btf_dump_test_case_*.c)
TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c b/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c
new file mode 100644
index 000000000000..e08b28e25047
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <network_helpers.h>
+
+#define SYS(cmd) ({ \
+ if (!ASSERT_OK(system(cmd), (cmd))) \
+ goto out; \
+})
+
+#define SYS_OUT(cmd) ({ \
+ FILE *f = popen((cmd), "r"); \
+ if (!ASSERT_OK_PTR(f, (cmd))) \
+ goto out; \
+ f; \
+})
+
+static bool expect_str(char *buf, size_t size, const char *str)
+{
+ if (size != strlen(str))
+ return false;
+ return !memcmp(buf, str, size);
+}
+
+void test_xdp_synproxy(void)
+{
+ int server_fd = -1, client_fd = -1, accept_fd = -1;
+ struct nstoken *ns = NULL;
+ FILE *ctrl_file = NULL;
+ char buf[1024];
+ size_t size;
+
+ SYS("ip netns add synproxy");
+
+ SYS("ip link add tmp0 type veth peer name tmp1");
+ SYS("ip link set tmp1 netns synproxy");
+ SYS("ip link set tmp0 up");
+ SYS("ip addr replace 198.18.0.1/24 dev tmp0");
+
+ // When checksum offload is enabled, the XDP program sees wrong
+ // checksums and drops packets.
+ SYS("ethtool -K tmp0 tx off");
+ // Workaround required for veth.
don't use C++ comments, please stick to /* */
+ SYS("ip link set tmp0 xdp object xdp_dummy.o section xdp 2> /dev/null");
+
+ ns = open_netns("synproxy");
+ if (!ASSERT_OK_PTR(ns, "setns"))
+ goto out;
+
+ SYS("ip link set lo up");
+ SYS("ip link set tmp1 up");
+ SYS("ip addr replace 198.18.0.2/24 dev tmp1");
+ SYS("sysctl -w net.ipv4.tcp_syncookies=2");
+ SYS("sysctl -w net.ipv4.tcp_timestamps=1");
+ SYS("sysctl -w net.netfilter.nf_conntrack_tcp_loose=0");
+ SYS("iptables -t raw -I PREROUTING \
+ -i tmp1 -p tcp -m tcp --syn --dport 8080 -j CT --notrack");
+ SYS("iptables -t filter -A INPUT \
+ -i tmp1 -p tcp -m tcp --dport 8080 -m state --state INVALID,UNTRACKED \
+ -j SYNPROXY --sack-perm --timestamp --wscale 7 --mss 1460");
+ SYS("iptables -t filter -A INPUT \
+ -i tmp1 -m state --state INVALID -j DROP");
+
+ ctrl_file = SYS_OUT("./xdp_synproxy --iface tmp1 --ports 8080 --single \
+ --mss4 1460 --mss6 1440 --wscale 7 --ttl 64");
+ size = fread(buf, 1, sizeof(buf), ctrl_file);
buf is uninitialized so if fread fail strlen() can cause SIGSEGV or
some other failure mode
No, it will exit on the assert below (size won't be equal to strlen(str)).
it's better to use ASSERT_STREQ() which will also emit expected and
actual strings if they don't match. So maybe check size first, and
then ASSERT_STREQ() instead of custom expect_str() "helper"?
See below, the command's output is not a string.
If I extend my expect_str function to print the expected and actual
outputs, would that work for you?
+ pclose(ctrl_file);
+ if (!ASSERT_TRUE(expect_str(buf, size, "Total SYNACKs generated: 0\n"),
+ "initial SYNACKs"))
+ goto out;
+
+ server_fd = start_server(AF_INET, SOCK_STREAM, "198.18.0.2", 8080, 0);
+ if (!ASSERT_GE(server_fd, 0, "start_server"))
+ goto out;
+
+ close_netns(ns);
+ ns = NULL;
+
+ client_fd = connect_to_fd(server_fd, 10000);
+ if (!ASSERT_GE(client_fd, 0, "connect_to_fd"))
+ goto out;
+
+ accept_fd = accept(server_fd, NULL, NULL);
+ if (!ASSERT_GE(accept_fd, 0, "accept"))
+ goto out;
+
+ ns = open_netns("synproxy");
+ if (!ASSERT_OK_PTR(ns, "setns"))
+ goto out;
+
+ ctrl_file = SYS_OUT("./xdp_synproxy --iface tmp1 --single");
+ size = fread(buf, 1, sizeof(buf), ctrl_file);
+ pclose(ctrl_file);
+ if (!ASSERT_TRUE(expect_str(buf, size, "Total SYNACKs generated: 1\n"),
+ "SYNACKs after connection"))
please use ASSERT_STREQ instead, same above
It doesn't fit here for two reasons:
* It doesn't consider size (and ignoring size will cause a UB on errors
because of the uninitialized buf).
* buf is not '\0'-terminated, and ASSERT_STREQ uses strcmp.
can it be non-zero-terminated in normal case? see above about checking
for errors separately
fread(buf, x, y, file) just reads up to x*y bytes into buf. It doesn't
treat it as a zero-terminated string, it can be any binary sequence of
bytes. So, yes, in normal case it's going to be some printable
characters WITHOUT any terminating '\0' (no one prints a '\0' to the
terminal in normal cases). In bad cases it could be anything, including
'\0' in the middle of the buffer, which would terminate strcmp early and
can cause false positives.
+ goto out;
+
+out:
+ if (accept_fd >= 0)
+ close(accept_fd);
+ if (client_fd >= 0)
+ close(client_fd);
+ if (server_fd >= 0)
+ close(server_fd);
+ if (ns)
+ close_netns(ns);
+
+ system("ip link del tmp0");
+ system("ip netns del synproxy");
+}
diff --git a/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c
new file mode 100644
index 000000000000..9ae85b189072
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c
@@ -0,0 +1,750 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
Can you please elaborate on what Linux-OpenIB license is and why
GPL-2.0 isn't enough? We usually have GPL-2.0 or LGPL-2.1 OR
BSD-2-Clause
That's the license boilerplate we use in the mlx5e driver. I'll check
with the relevant people whether we can submit it as GPL-2.0 solely.
ok
+/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
+
+#include "vmlinux.h"
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+#include <asm/errno.h>
+
[...]
+
+static __always_inline __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
+ __u32 len, __u8 proto,
+ __u32 csum)
+{
+ __u64 s = csum;
+
+ s += (__u32)saddr;
+ s += (__u32)daddr;
+#if defined(__BIG_ENDIAN__)
+ s += proto + len;
+#elif defined(__LITTLE_ENDIAN__)
I've got few nudges in libbpf code base previously to use
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
instead (I don't remember the exact reason now, but there was a
reason). Let's do the same here for consistency?
OK.
samples/bpf/xdpsock_user.c also still uses __BIG_ENDIAN__.
+ s += (proto + len) << 8;
+#else
+#error Unknown endian
+#endif
+ s = (s & 0xffffffff) + (s >> 32);
+ s = (s & 0xffffffff) + (s >> 32);
+
+ return csum_fold((__u32)s);
+}
+
+static __always_inline __u16 csum_ipv6_magic(const struct in6_addr *saddr,
+ const struct in6_addr *daddr,
+ __u32 len, __u8 proto, __u32 csum)
+{
+ __u64 sum = csum;
+ int i;
+
+#pragma unroll
+ for (i = 0; i < 4; i++)
+ sum += (__u32)saddr->in6_u.u6_addr32[i];
+
+#pragma unroll
why unroll? BPF verifier handles such loops just fine, even if
compiler decides to not unroll them
Optimization, see csum_ipv6_magic in net/ipv6/ip6_checksum.c that has
this loop unrolled manually.
+ for (i = 0; i < 4; i++)
+ sum += (__u32)daddr->in6_u.u6_addr32[i];
+
+ // Don't combine additions to avoid 32-bit overflow.
+ sum += bpf_htonl(len);
+ sum += bpf_htonl(proto);
+
+ sum = (sum & 0xffffffff) + (sum >> 32);
+ sum = (sum & 0xffffffff) + (sum >> 32);
+
+ return csum_fold((__u32)sum);
+}
+
+static __always_inline __u64 tcp_clock_ns(void)
__always_inline isn't mandatory, you can just have static __u64
tcp_clock_ns() here and let compiler decide on inlining? same for
below
Do you mean just these three functions, or all functions below, or
actually all functions in this file?
It's not mandatory, but these are simple one-liners, it would be
unpleasant to waste an extra call in performance-critical code if the
compiler decides not to inline them.
my point was that it's not mandatory anymore. Given this is a hybrid
high-performance sample and selftest, I don't care. If it was just a
test, there is no point in micro-optimizing this (similar for loop
unrolling).
+{
+ return bpf_ktime_get_ns();
+}
+
+static __always_inline __u32 tcp_ns_to_ts(__u64 ns)
+{
+ return ns / (NSEC_PER_SEC / TCP_TS_HZ);
+}
+
+static __always_inline __u32 tcp_time_stamp_raw(void)
+{
+ return tcp_ns_to_ts(tcp_clock_ns());
+}
+
[...]
[...]