On 3/24/20 10:57 PM, Joe Stringer wrote:
From: Lorenz Bauer <lmb@xxxxxxxxxxxxxx>
Attach a tc direct-action classifier to lo in a fresh network
namespace, and rewrite all connection attempts to localhost:4321
to localhost:1234 (for port tests) and connections to unreachable
IPv4/IPv6 IPs to the local socket (for address tests).
Keep in mind that both client to server and server to client traffic
passes the classifier.
Signed-off-by: Lorenz Bauer <lmb@xxxxxxxxxxxxxx>
Co-authored-by: Joe Stringer <joe@xxxxxxxxxxx>
Signed-off-by: Joe Stringer <joe@xxxxxxxxxxx>
---
v2: Rebase onto test_progs infrastructure
v1: Initial commit
---
tools/testing/selftests/bpf/Makefile | 2 +-
.../selftests/bpf/prog_tests/sk_assign.c | 244 ++++++++++++++++++
.../selftests/bpf/progs/test_sk_assign.c | 127 +++++++++
3 files changed, 372 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/selftests/bpf/prog_tests/sk_assign.c
create mode 100644 tools/testing/selftests/bpf/progs/test_sk_assign.c
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 7729892e0b04..4f7f83d059ca 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -76,7 +76,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \
# Compile but not part of 'make run_tests'
TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \
flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \
- test_lirc_mode2_user xdping test_cpp runqslower
+ test_lirc_mode2_user xdping test_cpp runqslower test_sk_assign
No test_sk_assign any more as the test is integrated into test_progs, right?
TEST_CUSTOM_PROGS = urandom_read
diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c
new file mode 100644
index 000000000000..1f0afcc20c48
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c
@@ -0,0 +1,244 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+// Copyright (c) 2019 Cloudflare
+// Copyright (c) 2020 Isovalent, Inc.
+/*
+ * Test that the socket assign program is able to redirect traffic towards a
+ * socket, regardless of whether the port or address destination of the traffic
+ * matches the port.
+ */
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "test_progs.h"
+
+#define TEST_DPORT 4321
+#define TEST_DADDR (0xC0A80203)
+#define NS_SELF "/proc/self/ns/net"
+
+static __u32 duration;
+
+static bool configure_stack(int self_net)
self_net parameter is not used.
+{
+ /* Move to a new networking namespace */
+ if (CHECK_FAIL(unshare(CLONE_NEWNET)))
+ return false;
You can use CHECK to encode better error messages. Thhis is what
most test_progs tests are using.
+ /* Configure necessary links, routes */
+ if (CHECK_FAIL(system("ip link set dev lo up")))
+ return false;
+ if (CHECK_FAIL(system("ip route add local default dev lo")))
+ return false;
+ if (CHECK_FAIL(system("ip -6 route add local default dev lo")))
+ return false;
+
+ /* Load qdisc, BPF program */
+ if (CHECK_FAIL(system("tc qdisc add dev lo clsact")))
+ return false;
+ if (CHECK_FAIL(system("tc filter add dev lo ingress bpf direct-action "
+ "object-file ./test_sk_assign.o section sk_assign_test")))
+ return false;
+
+ return true;
+}
+
+static int start_server(const struct sockaddr *addr, socklen_t len)
+{
+ int fd;
+
+ fd = socket(addr->sa_family, SOCK_STREAM, 0);
+ if (CHECK_FAIL(fd == -1))
+ goto out;
+ if (CHECK_FAIL(bind(fd, addr, len) == -1))
+ goto close_out;
+ if (CHECK_FAIL(listen(fd, 128) == -1))
+ goto close_out;
+
+ goto out;
+
+close_out:
+ close(fd);
+ fd = -1;
+out:
+ return fd;
+}
+
+static void handle_timeout(int signum)
+{
+ if (signum == SIGALRM)
+ fprintf(stderr, "Timed out while connecting to server\n");
+ kill(0, SIGKILL);
+}
+
+static struct sigaction timeout_action = {
+ .sa_handler = handle_timeout,
+};
+
+static int connect_to_server(const struct sockaddr *addr, socklen_t len)
+{
+ int fd = -1;
+
+ fd = socket(addr->sa_family, SOCK_STREAM, 0);
+ if (CHECK_FAIL(fd == -1))
+ goto out;
+ if (CHECK_FAIL(sigaction(SIGALRM, &timeout_action, NULL)))
+ goto out;
should this goto close_out?
+ alarm(3);
+ if (CHECK_FAIL(connect(fd, addr, len) == -1))
+ goto close_out;
+
+ goto out;
+
+close_out:
+ close(fd);
+ fd = -1;
+out:
+ return fd;
+}
+
+static in_port_t get_port(int fd)
+{
+ struct sockaddr_storage name;
+ socklen_t len;
+ in_port_t port = 0;
+
+ len = sizeof(name);
+ if (CHECK_FAIL(getsockname(fd, (struct sockaddr *)&name, &len)))
+ return port;
+
+ switch (name.ss_family) {
+ case AF_INET:
+ port = ((struct sockaddr_in *)&name)->sin_port;
+ break;
+ case AF_INET6:
+ port = ((struct sockaddr_in6 *)&name)->sin6_port;
+ break;
+ default:
+ CHECK(1, "Invalid address family", "%d\n", name.ss_family);
+ }
+ return port;
+}
+
+static int run_test(int server_fd, const struct sockaddr *addr, socklen_t len)
+{
+ int client = -1, srv_client = -1;
+ char buf[] = "testing";
+ in_port_t port;
+ int ret = 1;
+
+ client = connect_to_server(addr, len);
+ if (client == -1) {
+ perror("Cannot connect to server");
+ goto out;
+ }
+
+ srv_client = accept(server_fd, NULL, NULL);
+ if (CHECK_FAIL(srv_client == -1)) {
+ perror("Can't accept connection");
+ goto out;
+ }
+ if (CHECK_FAIL(write(client, buf, sizeof(buf)) != sizeof(buf))) {
+ perror("Can't write on client");
+ goto out;
+ }
+ if (CHECK_FAIL(read(srv_client, buf, sizeof(buf)) != sizeof(buf))) {
+ perror("Can't read on server");
+ goto out;
+ }
+
+ port = get_port(srv_client);
+ if (CHECK_FAIL(!port))
+ goto out;
+ if (CHECK(port != htons(TEST_DPORT), "Expected", "port %u but got %u",
+ TEST_DPORT, ntohs(port)))
+ goto out;
+
+ ret = 0;
+out:
+ close(client);
+ close(srv_client);
+ return ret;
+}
+
+static int do_sk_assign(void)
+{
+ struct sockaddr_in addr4;
+ struct sockaddr_in6 addr6;
+ int server = -1;
+ int server_v6 = -1;
+ int err = 1;
+
+ memset(&addr4, 0, sizeof(addr4));
+ addr4.sin_family = AF_INET;
+ addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+ addr4.sin_port = htons(1234);
+
+ memset(&addr6, 0, sizeof(addr6));
+ addr6.sin6_family = AF_INET6;
+ addr6.sin6_addr = in6addr_loopback;
+ addr6.sin6_port = htons(1234);
+
+ server = start_server((const struct sockaddr *)&addr4, sizeof(addr4));
+ if (server == -1)
+ goto out;
+
+ server_v6 = start_server((const struct sockaddr *)&addr6,
+ sizeof(addr6));
+ if (server_v6 == -1)
+ goto out;
+
+ /* Connect to unbound ports */
+ addr4.sin_port = htons(TEST_DPORT);
+ addr6.sin6_port = htons(TEST_DPORT);
+
+ test__start_subtest("ipv4 port redir");
+ if (run_test(server, (const struct sockaddr *)&addr4, sizeof(addr4)))
+ goto out;
+
+ test__start_subtest("ipv6 port redir");
+ if (run_test(server_v6, (const struct sockaddr *)&addr6, sizeof(addr6)))
+ goto out;
+
+ /* Connect to unbound addresses */
+ addr4.sin_addr.s_addr = htonl(TEST_DADDR);
+ addr6.sin6_addr.s6_addr32[3] = htonl(TEST_DADDR);
+
+ test__start_subtest("ipv4 addr redir");
+ if (run_test(server, (const struct sockaddr *)&addr4, sizeof(addr4)))
+ goto out;
+
+ test__start_subtest("ipv6 addr redir");
+ if (run_test(server_v6, (const struct sockaddr *)&addr6, sizeof(addr6)))
+ goto out;
+
+ err = 0;
+out:
+ close(server);
+ close(server_v6);
+ return err;
+}
+
+void test_sk_assign(void)
+{
+ int self_net;
+
+ self_net = open(NS_SELF, O_RDONLY);
+ if (CHECK_FAIL(self_net < 0)) {
+ perror("Unable to open "NS_SELF);
+ return;
+ }
+
+ if (!configure_stack(self_net)) {
+ perror("configure_stack");
+ goto cleanup;
+ }
+
+ do_sk_assign();
+
+cleanup:
+ close(self_net);
Did we exit the newly unshared net namespace and restored the previous
namespace?
+}
diff --git a/tools/testing/selftests/bpf/progs/test_sk_assign.c b/tools/testing/selftests/bpf/progs/test_sk_assign.c
new file mode 100644
index 000000000000..7de30ad3f594
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_sk_assign.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Cloudflare Ltd.
+
+#include <stddef.h>
+#include <stdbool.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/pkt_cls.h>
+#include <linux/tcp.h>
+#include <sys/socket.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+int _version SEC("version") = 1;
+char _license[] SEC("license") = "GPL";
+
+/* Fill 'tuple' with L3 info, and attempt to find L4. On fail, return NULL. */
+static struct bpf_sock_tuple *get_tuple(void *data, __u64 nh_off,
+ void *data_end, __u16 eth_proto,
+ bool *ipv4)
+{
+ struct bpf_sock_tuple *result;
+ __u8 proto = 0;
+ __u64 ihl_len;
+
+ if (eth_proto == bpf_htons(ETH_P_IP)) {
+ struct iphdr *iph = (struct iphdr *)(data + nh_off);
+
+ if (iph + 1 > data_end)
+ return NULL;
+ if (iph->ihl != 5)
+ /* Options are not supported */
+ return NULL;
+ ihl_len = iph->ihl * 4;
+ proto = iph->protocol;
+ *ipv4 = true;
+ result = (struct bpf_sock_tuple *)&iph->saddr;
+ } else if (eth_proto == bpf_htons(ETH_P_IPV6)) {
+ struct ipv6hdr *ip6h = (struct ipv6hdr *)(data + nh_off);
+
+ if (ip6h + 1 > data_end)
+ return NULL;
+ ihl_len = sizeof(*ip6h);
+ proto = ip6h->nexthdr;
+ *ipv4 = false;
+ result = (struct bpf_sock_tuple *)&ip6h->saddr;
+ } else {
+ return NULL;
+ }
+
+ if (result + 1 > data_end || proto != IPPROTO_TCP)
+ return NULL;
+
+ return result;
+}
+
+SEC("sk_assign_test")
+int bpf_sk_assign_test(struct __sk_buff *skb)
+{
+ void *data_end = (void *)(long)skb->data_end;
+ void *data = (void *)(long)skb->data;
+ struct ethhdr *eth = (struct ethhdr *)(data);
+ struct bpf_sock_tuple *tuple, ln = {0};
+ struct bpf_sock *sk;
+ int tuple_len;
+ bool ipv4;
+ int ret;
+
+ if (eth + 1 > data_end)
+ return TC_ACT_SHOT;
+
+ tuple = get_tuple(data, sizeof(*eth), data_end, eth->h_proto, &ipv4);
+ if (!tuple)
+ return TC_ACT_SHOT;
+
+ tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6);
+ sk = bpf_skc_lookup_tcp(skb, tuple, tuple_len, BPF_F_CURRENT_NETNS, 0);
You can get rid of tuple_len with
if (ipv4)
sk = bpf_skc_lookup_tcp(..., sizeof(tuple->ipv4), ...);
else
sk = bpf_skc_lookup_tcp(..., sizeof(tuple->ipv6), ...);
and later on you can do common bpf_skc_lookup_tcp.
But it may not be worthwhile to do it, as you have two separate calls
in the above instead.
+ if (sk) {
+ if (sk->state != BPF_TCP_LISTEN)
+ goto assign;
+
+ bpf_sk_release(sk);
+ }
+
+ if (ipv4) {
+ if (tuple->ipv4.dport != bpf_htons(4321))
+ return TC_ACT_OK;
+
+ ln.ipv4.daddr = bpf_htonl(0x7f000001);
+ ln.ipv4.dport = bpf_htons(1234);
+
+ sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv4),
+ BPF_F_CURRENT_NETNS, 0);
+ } else {
+ if (tuple->ipv6.dport != bpf_htons(4321))
+ return TC_ACT_OK;
+
+ /* Upper parts of daddr are already zero. */
+ ln.ipv6.daddr[3] = bpf_htonl(0x1);
+ ln.ipv6.dport = bpf_htons(1234);
+
+ sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv6),
+ BPF_F_CURRENT_NETNS, 0);
+ }
+
+ /* We can't do a single skc_lookup_tcp here, because then the compiler
+ * will likely spill tuple_len to the stack. This makes it lose all
+ * bounds information in the verifier, which then rejects the call as
+ * unsafe.
+ */
This is a known issue. For scalars, only constant is restored properly
in verifier at this moment. I did some hacking before to enable any
scalars. The fear is this will make pruning performs worse. More
study is needed here.
+ if (!sk)
+ return TC_ACT_SHOT;
+
+ if (sk->state != BPF_TCP_LISTEN) {
+ bpf_sk_release(sk);
+ return TC_ACT_SHOT;
+ }
+
+assign:
+ ret = bpf_sk_assign(skb, sk, 0);
+ bpf_sk_release(sk);
+ return ret == 0 ? TC_ACT_OK : TC_ACT_SHOT;
+}