This commit adds a sample for the new BPF helpers: bpf_ct_lookup_tcp, bpf_tcp_raw_gen_syncookie and bpf_tcp_raw_check_syncookie. samples/bpf/syncookie_kern.c is a BPF program that generates SYN cookies on allowed TCP ports and sends SYNACKs to clients, accelerating synproxy iptables module. samples/bpf/syncookie_user.c is a userspace control application that allows to configure the following options in runtime: list of allowed ports, MSS, window scale, TTL. samples/bpf/syncookie_test.sh is a script that demonstrates the setup of synproxy with XDP acceleration. Signed-off-by: Maxim Mikityanskiy <maximmi@xxxxxxxxxx> Reviewed-by: Tariq Toukan <tariqt@xxxxxxxxxx> --- samples/bpf/.gitignore | 1 + samples/bpf/Makefile | 3 + samples/bpf/syncookie_kern.c | 591 ++++++++++++++++++++++++++++++++++ samples/bpf/syncookie_test.sh | 55 ++++ samples/bpf/syncookie_user.c | 388 ++++++++++++++++++++++ 5 files changed, 1038 insertions(+) create mode 100644 samples/bpf/syncookie_kern.c create mode 100755 samples/bpf/syncookie_test.sh create mode 100644 samples/bpf/syncookie_user.c diff --git a/samples/bpf/.gitignore b/samples/bpf/.gitignore index 0e7bfdbff80a..6b74e835d323 100644 --- a/samples/bpf/.gitignore +++ b/samples/bpf/.gitignore @@ -61,3 +61,4 @@ iperf.* /vmlinux.h /bpftool/ /libbpf/ +syncookie diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 4c5ad15f8d28..59d90c76bfea 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -53,6 +53,7 @@ tprogs-y += task_fd_query tprogs-y += xdp_sample_pkts tprogs-y += ibumad tprogs-y += hbm +tprogs-y += syncookie tprogs-y += xdp_redirect_cpu tprogs-y += xdp_redirect_map_multi @@ -118,6 +119,7 @@ task_fd_query-objs := task_fd_query_user.o $(TRACE_HELPERS) xdp_sample_pkts-objs := xdp_sample_pkts_user.o ibumad-objs := ibumad_user.o hbm-objs := hbm.o $(CGROUP_HELPERS) +syncookie-objs := syncookie_user.o xdp_redirect_map_multi-objs := xdp_redirect_map_multi_user.o $(XDP_SAMPLE) xdp_redirect_cpu-objs := xdp_redirect_cpu_user.o $(XDP_SAMPLE) @@ -181,6 +183,7 @@ always-y += ibumad_kern.o always-y += hbm_out_kern.o always-y += hbm_edt_kern.o always-y += xdpsock_kern.o +always-y += syncookie_kern.o ifeq ($(ARCH), arm) # Strip all except -D__LINUX_ARM_ARCH__ option needed to handle linux diff --git a/samples/bpf/syncookie_kern.c b/samples/bpf/syncookie_kern.c new file mode 100644 index 000000000000..b581ae30b650 --- /dev/null +++ b/samples/bpf/syncookie_kern.c @@ -0,0 +1,591 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#include <stdbool.h> +#include <stddef.h> + +#include <uapi/linux/errno.h> +#include <uapi/linux/bpf.h> +#include <uapi/linux/pkt_cls.h> +#include <uapi/linux/if_ether.h> +#include <uapi/linux/in.h> +#include <uapi/linux/ip.h> +#include <uapi/linux/ipv6.h> +#include <uapi/linux/tcp.h> +#include <uapi/linux/netfilter/nf_conntrack_common.h> +#include <linux/minmax.h> + +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +#define DEFAULT_MSS4 1460 +#define DEFAULT_MSS6 1440 +#define DEFAULT_WSCALE 7 +#define DEFAULT_TTL 64 +#define MAX_ALLOWED_PORTS 8 + +struct bpf_map_def SEC("maps") values = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(__u64), + .max_entries = 2, +}; + +struct bpf_map_def SEC("maps") allowed_ports = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(__u16), + .max_entries = MAX_ALLOWED_PORTS, +}; + +#define IP_DF 0x4000 +#define IP_MF 0x2000 +#define IP_OFFSET 0x1fff + +#define NEXTHDR_TCP 6 + +#define TCPOPT_NOP 1 +#define TCPOPT_EOL 0 +#define TCPOPT_MSS 2 +#define TCPOPT_WINDOW 3 +#define TCPOPT_SACK_PERM 4 +#define TCPOPT_TIMESTAMP 8 + +#define TCPOLEN_MSS 4 +#define TCPOLEN_WINDOW 3 +#define TCPOLEN_SACK_PERM 2 +#define TCPOLEN_TIMESTAMP 10 + +#define IPV4_MAXLEN 60 +#define TCP_MAXLEN 60 + +static __always_inline void swap_eth_addr(__u8 *a, __u8 *b) +{ + __u8 tmp[ETH_ALEN]; + + __builtin_memcpy(tmp, a, ETH_ALEN); + __builtin_memcpy(a, b, ETH_ALEN); + __builtin_memcpy(b, tmp, ETH_ALEN); +} + +static __always_inline __u16 csum_fold(__u32 csum) +{ + csum = (csum & 0xffff) + (csum >> 16); + csum = (csum & 0xffff) + (csum >> 16); + return (__u16)~csum; +} + +static __always_inline __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, + __u32 len, __u8 proto, + __u32 csum) +{ + __u64 s = csum; + + s += (__u32)saddr; + s += (__u32)daddr; +#if defined(__BIG_ENDIAN__) + s += proto + len; +#elif defined(__LITTLE_ENDIAN__) + s += (proto + len) << 8; +#else +#error Unknown endian +#endif + s = (s & 0xffffffff) + (s >> 32); + s = (s & 0xffffffff) + (s >> 32); + + return csum_fold((__u32)s); +} + +static __always_inline __u16 csum_ipv6_magic(const struct in6_addr *saddr, + const struct in6_addr *daddr, + __u32 len, __u8 proto, __u32 csum) +{ + __u64 sum = csum; + int i; + +#pragma unroll + for (i = 0; i < 4; i++) + sum += (__u32)saddr->s6_addr32[i]; + +#pragma unroll + for (i = 0; i < 4; i++) + sum += (__u32)daddr->s6_addr32[i]; + + // Don't combine additions to avoid 32-bit overflow. + sum += bpf_htonl(len); + sum += bpf_htonl(proto); + + sum = (sum & 0xffffffff) + (sum >> 32); + sum = (sum & 0xffffffff) + (sum >> 32); + + return csum_fold((__u32)sum); +} + +static __always_inline void values_get_tcpipopts(__u16 *mss, __u8 *wscale, + __u8 *ttl, bool ipv6) +{ + __u32 key = 0; + __u64 *value; + + value = bpf_map_lookup_elem(&values, &key); + if (value && *value != 0) { + if (ipv6) + *mss = (*value >> 32) & 0xffff; + else + *mss = *value & 0xffff; + *wscale = (*value >> 16) & 0xf; + *ttl = (*value >> 24) & 0xff; + return; + } + + *mss = ipv6 ? DEFAULT_MSS6 : DEFAULT_MSS4; + *wscale = DEFAULT_WSCALE; + *ttl = DEFAULT_TTL; +} + +static __always_inline void values_inc_synacks(void) +{ + __u32 key = 1; + __u32 *value; + + value = bpf_map_lookup_elem(&values, &key); + if (value) + __sync_fetch_and_add(value, 1); +} + +static __always_inline bool check_port_allowed(__u16 port) +{ + __u32 i; + + for (i = 0; i < MAX_ALLOWED_PORTS; i++) { + __u32 key = i; + __u16 *value; + + value = bpf_map_lookup_elem(&allowed_ports, &key); + + if (!value) + break; + // 0 is a terminator value. Check it first to avoid matching on + // a forbidden port == 0 and returning true. + if (*value == 0) + break; + + if (*value == port) + return true; + } + + return false; +} + +struct header_pointers { + struct ethhdr *eth; + struct iphdr *ipv4; + struct ipv6hdr *ipv6; + struct tcphdr *tcp; + __u16 tcp_len; +}; + +static __always_inline int tcp_dissect(void *data, void *data_end, + struct header_pointers *hdr) +{ + hdr->eth = data; + if (hdr->eth + 1 > data_end) + return XDP_DROP; + + switch (bpf_ntohs(hdr->eth->h_proto)) { + case ETH_P_IP: + hdr->ipv6 = NULL; + + hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth); + if (hdr->ipv4 + 1 > data_end) + return XDP_DROP; + if (hdr->ipv4->ihl * 4 < sizeof(*hdr->ipv4)) + return XDP_DROP; + if (hdr->ipv4->version != 4) + return XDP_DROP; + + if (hdr->ipv4->protocol != IPPROTO_TCP) + return XDP_PASS; + + hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4; + break; + case ETH_P_IPV6: + hdr->ipv4 = NULL; + + hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth); + if (hdr->ipv6 + 1 > data_end) + return XDP_DROP; + if (hdr->ipv6->version != 6) + return XDP_DROP; + + // XXX: Extension headers are not supported and could circumvent + // XDP SYN flood protection. + if (hdr->ipv6->nexthdr != NEXTHDR_TCP) + return XDP_PASS; + + hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6); + break; + default: + // XXX: VLANs will circumvent XDP SYN flood protection. + return XDP_PASS; + } + + if (hdr->tcp + 1 > data_end) + return XDP_DROP; + hdr->tcp_len = hdr->tcp->doff * 4; + if (hdr->tcp_len < sizeof(*hdr->tcp)) + return XDP_DROP; + + return XDP_TX; +} + +static __always_inline __u8 tcp_mkoptions(__be32 *buf, __be32 *tsopt, __u16 mss, + __u8 wscale) +{ + __be32 *start = buf; + + *buf++ = bpf_htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss); + + if (!tsopt) + return buf - start; + + if (tsopt[0] & bpf_htonl(1 << 4)) + *buf++ = bpf_htonl((TCPOPT_SACK_PERM << 24) | + (TCPOLEN_SACK_PERM << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); + else + *buf++ = bpf_htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); + *buf++ = tsopt[0]; + *buf++ = tsopt[1]; + + if ((tsopt[0] & bpf_htonl(0xf)) != bpf_htonl(0xf)) + *buf++ = bpf_htonl((TCPOPT_NOP << 24) | + (TCPOPT_WINDOW << 16) | + (TCPOLEN_WINDOW << 8) | + wscale); + + return buf - start; +} + +static __always_inline void tcp_gen_synack(struct tcphdr *tcp_header, + __u32 cookie, __be32 *tsopt, + __u16 mss, __u8 wscale) +{ + void *tcp_options; + + tcp_flag_word(tcp_header) = TCP_FLAG_SYN | TCP_FLAG_ACK; + if (tsopt && (tsopt[0] & bpf_htonl(1 << 5))) + tcp_flag_word(tcp_header) |= TCP_FLAG_ECE; + tcp_header->doff = 5; // doff is part of tcp_flag_word. + swap(tcp_header->source, tcp_header->dest); + tcp_header->ack_seq = bpf_htonl(bpf_ntohl(tcp_header->seq) + 1); + tcp_header->seq = bpf_htonl(cookie); + tcp_header->window = 0; + tcp_header->urg_ptr = 0; + tcp_header->check = 0; // Rely on hardware checksum offload. + + tcp_options = (void *)(tcp_header + 1); + tcp_header->doff += tcp_mkoptions(tcp_options, tsopt, mss, wscale); +} + +static __always_inline void tcpv4_gen_synack(struct header_pointers *hdr, + __u32 cookie, __be32 *tsopt) +{ + __u8 wscale; + __u16 mss; + __u8 ttl; + + values_get_tcpipopts(&mss, &wscale, &ttl, false); + + swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest); + + swap(hdr->ipv4->saddr, hdr->ipv4->daddr); + hdr->ipv4->check = 0; // Rely on hardware checksum offload. + hdr->ipv4->tos = 0; + hdr->ipv4->id = 0; + hdr->ipv4->ttl = ttl; + + tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale); + + hdr->tcp_len = hdr->tcp->doff * 4; + hdr->ipv4->tot_len = bpf_htons(sizeof(*hdr->ipv4) + hdr->tcp_len); +} + +static __always_inline void tcpv6_gen_synack(struct header_pointers *hdr, + __u32 cookie, __be32 *tsopt) +{ + __u8 wscale; + __u16 mss; + __u8 ttl; + + values_get_tcpipopts(&mss, &wscale, &ttl, true); + + swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest); + + swap(hdr->ipv6->saddr, hdr->ipv6->daddr); + *(__be32 *)hdr->ipv6 = bpf_htonl(0x60000000); + hdr->ipv6->hop_limit = ttl; + + tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale); + + hdr->tcp_len = hdr->tcp->doff * 4; + hdr->ipv6->payload_len = bpf_htons(hdr->tcp_len); +} + +static __always_inline int syncookie_handle_syn(struct header_pointers *hdr, + struct xdp_md *ctx, + void *data, void *data_end) +{ + __u32 old_pkt_size, new_pkt_size; + // Unlike clang 10, clang 11 and 12 generate code that doesn't pass the + // BPF verifier if tsopt is not volatile. Volatile forces it to store + // the pointer value and use it directly, otherwise tcp_mkoptions is + // (mis)compiled like this: + // if (!tsopt) + // return buf - start; + // reg = stored_return_value_of_bpf_tcp_raw_gen_tscookie; + // if (reg == 0) + // tsopt = tsopt_buf; + // else + // tsopt = NULL; + // ... + // *buf++ = tsopt[1]; + // It creates a dead branch where tsopt is assigned NULL, but the + // verifier can't prove it's dead and blocks the program. + __be32 * volatile tsopt = NULL; + __be32 tsopt_buf[2]; + void *ip_header; + __u16 ip_len; + __u32 cookie; + __s64 value; + + if (hdr->ipv4) { + // Check the IPv4 and TCP checksums before creating a SYNACK. + value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, hdr->ipv4->ihl * 4, 0); + if (value < 0) + return XDP_ABORTED; + if (csum_fold(value) != 0) + return XDP_DROP; // Bad IPv4 checksum. + + value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0); + if (value < 0) + return XDP_ABORTED; + if (csum_tcpudp_magic(hdr->ipv4->saddr, hdr->ipv4->daddr, + hdr->tcp_len, IPPROTO_TCP, value) != 0) + return XDP_DROP; // Bad TCP checksum. + + ip_header = hdr->ipv4; + ip_len = sizeof(*hdr->ipv4); + } else if (hdr->ipv6) { + // Check the TCP checksum before creating a SYNACK. + value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0); + if (value < 0) + return XDP_ABORTED; + if (csum_ipv6_magic(&hdr->ipv6->saddr, &hdr->ipv6->daddr, + hdr->tcp_len, IPPROTO_TCP, value) != 0) + return XDP_DROP; // Bad TCP checksum. + + ip_header = hdr->ipv6; + ip_len = sizeof(*hdr->ipv6); + } else { + return XDP_ABORTED; + } + + // Issue SYN cookies on allowed ports, drop SYN packets on + // blocked ports. + if (!check_port_allowed(bpf_ntohs(hdr->tcp->dest))) + return XDP_DROP; + + value = bpf_tcp_raw_gen_syncookie(ip_header, ip_len, + (void *)hdr->tcp, hdr->tcp_len); + if (value < 0) + return XDP_ABORTED; + cookie = (__u32)value; + + if (bpf_tcp_raw_gen_tscookie((void *)hdr->tcp, hdr->tcp_len, + tsopt_buf, sizeof(tsopt_buf)) == 0) + tsopt = tsopt_buf; + + // Check that there is enough space for a SYNACK. It also covers + // the check that the destination of the __builtin_memmove below + // doesn't overflow. + if (data + sizeof(*hdr->eth) + ip_len + TCP_MAXLEN > data_end) + return XDP_ABORTED; + + if (hdr->ipv4) { + if (hdr->ipv4->ihl * 4 > sizeof(*hdr->ipv4)) { + struct tcphdr *new_tcp_header; + + new_tcp_header = data + sizeof(*hdr->eth) + sizeof(*hdr->ipv4); + __builtin_memmove(new_tcp_header, hdr->tcp, sizeof(*hdr->tcp)); + hdr->tcp = new_tcp_header; + + hdr->ipv4->ihl = sizeof(*hdr->ipv4) / 4; + } + + tcpv4_gen_synack(hdr, cookie, tsopt); + } else if (hdr->ipv6) { + tcpv6_gen_synack(hdr, cookie, tsopt); + } else { + return XDP_ABORTED; + } + + // Recalculate checksums. + hdr->tcp->check = 0; + value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0); + if (value < 0) + return XDP_ABORTED; + if (hdr->ipv4) { + hdr->tcp->check = csum_tcpudp_magic(hdr->ipv4->saddr, + hdr->ipv4->daddr, + hdr->tcp_len, + IPPROTO_TCP, + value); + + hdr->ipv4->check = 0; + value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, sizeof(*hdr->ipv4), 0); + if (value < 0) + return XDP_ABORTED; + hdr->ipv4->check = csum_fold(value); + } else if (hdr->ipv6) { + hdr->tcp->check = csum_ipv6_magic(&hdr->ipv6->saddr, + &hdr->ipv6->daddr, + hdr->tcp_len, + IPPROTO_TCP, + value); + } else { + return XDP_ABORTED; + } + + // Set the new packet size. + old_pkt_size = data_end - data; + new_pkt_size = sizeof(*hdr->eth) + ip_len + hdr->tcp->doff * 4; + if (bpf_xdp_adjust_tail(ctx, new_pkt_size - old_pkt_size)) + return XDP_ABORTED; + + values_inc_synacks(); + + return XDP_TX; +} + +static __always_inline int syncookie_handle_ack(struct header_pointers *hdr) +{ + int err; + + if (hdr->ipv4) + err = bpf_tcp_raw_check_syncookie(hdr->ipv4, sizeof(*hdr->ipv4), + (void *)hdr->tcp, hdr->tcp_len); + else if (hdr->ipv6) + err = bpf_tcp_raw_check_syncookie(hdr->ipv6, sizeof(*hdr->ipv6), + (void *)hdr->tcp, hdr->tcp_len); + else + return XDP_ABORTED; + if (err) + return XDP_DROP; + + return XDP_PASS; +} + +SEC("xdp/syncookie") +int syncookie_xdp(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct header_pointers hdr; + struct bpf_sock_tuple tup; + struct bpf_nf_conn *ct; + __u32 tup_size; + __s64 value; + int ret; + + ret = tcp_dissect(data, data_end, &hdr); + if (ret != XDP_TX) + return ret; + + if (hdr.ipv4) { + // TCP doesn't normally use fragments, and XDP can't reassemble them. + if ((hdr.ipv4->frag_off & bpf_htons(IP_DF | IP_MF | IP_OFFSET)) != bpf_htons(IP_DF)) + return XDP_DROP; + + tup.ipv4.saddr = hdr.ipv4->saddr; + tup.ipv4.daddr = hdr.ipv4->daddr; + tup.ipv4.sport = hdr.tcp->source; + tup.ipv4.dport = hdr.tcp->dest; + tup_size = sizeof(tup.ipv4); + } else if (hdr.ipv6) { + __builtin_memcpy(tup.ipv6.saddr, &hdr.ipv6->saddr, sizeof(tup.ipv6.saddr)); + __builtin_memcpy(tup.ipv6.daddr, &hdr.ipv6->daddr, sizeof(tup.ipv6.daddr)); + tup.ipv6.sport = hdr.tcp->source; + tup.ipv6.dport = hdr.tcp->dest; + tup_size = sizeof(tup.ipv6); + } else { + // The verifier can't track that either ipv4 or ipv6 is not NULL. + return XDP_ABORTED; + } + + value = 0; // Flags. + ct = bpf_ct_lookup_tcp(ctx, &tup, tup_size, BPF_F_CURRENT_NETNS, &value); + if (ct) { + unsigned long status = ct->status; + + bpf_ct_release(ct); + if (status & IPS_CONFIRMED_BIT) + return XDP_PASS; + } else if (value != -ENOENT) { + return XDP_ABORTED; + } + + // value == -ENOENT || !(status & IPS_CONFIRMED_BIT) + + if ((hdr.tcp->syn ^ hdr.tcp->ack) != 1) + return XDP_DROP; + + // Grow the TCP header to TCP_MAXLEN to be able to pass any hdr.tcp_len + // to bpf_tcp_raw_gen_syncookie and pass the verifier. + if (bpf_xdp_adjust_tail(ctx, TCP_MAXLEN - hdr.tcp_len)) + return XDP_ABORTED; + + data_end = (void *)(long)ctx->data_end; + data = (void *)(long)ctx->data; + + if (hdr.ipv4) { + hdr.eth = data; + hdr.ipv4 = (void *)hdr.eth + sizeof(*hdr.eth); + // IPV4_MAXLEN is needed when calculating checksum. + // At least sizeof(struct iphdr) is needed here to access ihl. + if ((void *)hdr.ipv4 + IPV4_MAXLEN > data_end) + return XDP_ABORTED; + hdr.tcp = (void *)hdr.ipv4 + hdr.ipv4->ihl * 4; + } else if (hdr.ipv6) { + hdr.eth = data; + hdr.ipv6 = (void *)hdr.eth + sizeof(*hdr.eth); + hdr.tcp = (void *)hdr.ipv6 + sizeof(*hdr.ipv6); + } else { + return XDP_ABORTED; + } + + if ((void *)hdr.tcp + TCP_MAXLEN > data_end) + return XDP_ABORTED; + + // We run out of registers, tcp_len gets spilled to the stack, and the + // verifier forgets its min and max values checked above in tcp_dissect. + hdr.tcp_len = hdr.tcp->doff * 4; + if (hdr.tcp_len < sizeof(*hdr.tcp)) + return XDP_ABORTED; + + return hdr.tcp->syn ? syncookie_handle_syn(&hdr, ctx, data, data_end) : + syncookie_handle_ack(&hdr); +} + +SEC("xdp/dummy") +int dummy_xdp(struct xdp_md *ctx) +{ + // veth requires XDP programs to be set on both sides. + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/syncookie_test.sh b/samples/bpf/syncookie_test.sh new file mode 100755 index 000000000000..923f94a7d6f6 --- /dev/null +++ b/samples/bpf/syncookie_test.sh @@ -0,0 +1,55 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +set -e + +PORT=8080 + +DIR="$(dirname "$0")" +SERVER_PID= +MONITOR_PID= + +cleanup() { + set +e + [ -n "$SERVER_PID" ] && kill "$SERVER_PID" + [ -n "$MONITOR_PID" ] && kill "$MONITOR_PID" + ip link del tmp0 + ip netns del synproxy +} + +trap cleanup EXIT + +ip netns add synproxy +ip netns exec synproxy ip link set lo up +ip link add tmp0 type veth peer name tmp1 +sleep 1 # Wait, otherwise the IP address is not applied to tmp0. +ip link set tmp1 netns synproxy +ip link set tmp0 up +ip addr replace 198.18.0.1/24 dev tmp0 +ip netns exec synproxy ip link set tmp1 up +ip netns exec synproxy ip addr replace 198.18.0.2/24 dev tmp1 +ip netns exec synproxy sysctl -w net.ipv4.tcp_syncookies=2 +ip netns exec synproxy sysctl -w net.ipv4.tcp_timestamps=1 +ip netns exec synproxy sysctl -w net.netfilter.nf_conntrack_tcp_loose=0 +ip netns exec synproxy iptables -t raw -I PREROUTING \ + -i tmp1 -p tcp -m tcp --syn --dport "$PORT" -j CT --notrack +ip netns exec synproxy iptables -t filter -A INPUT \ + -i tmp1 -p tcp -m tcp --dport "$PORT" -m state --state INVALID,UNTRACKED \ + -j SYNPROXY --sack-perm --timestamp --wscale 7 --mss 1460 +ip netns exec synproxy iptables -t filter -A INPUT \ + -i tmp1 -m state --state INVALID -j DROP +# When checksum offload is enabled, the XDP program sees wrong checksums and +# drops packets. +ethtool -K tmp0 tx off +# Workaround required for veth. +ip link set tmp0 xdp object "$DIR/syncookie_kern.o" section xdp/dummy +ip netns exec synproxy "$DIR/syncookie" --iface tmp1 --ports "$PORT" \ + --mss4 1460 --mss6 1440 --wscale 7 --ttl 64 & +MONITOR_PID="$!" +ip netns exec synproxy python3 -m http.server "$PORT" & +SERVER_PID="$!" +echo "Waiting a few seconds for the server to start..." +sleep 5 +wget 'http://198.18.0.2:8080/' -O /dev/null -o /dev/null +sleep 1 # Wait for stats to appear. diff --git a/samples/bpf/syncookie_user.c b/samples/bpf/syncookie_user.c new file mode 100644 index 000000000000..dcb074405691 --- /dev/null +++ b/samples/bpf/syncookie_user.c @@ -0,0 +1,388 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#include <stdnoreturn.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <unistd.h> +#include <getopt.h> +#include <signal.h> +#include <sys/types.h> +#include <bpf/bpf.h> +#include <bpf/libbpf.h> +#include <net/if.h> +#include <linux/if_link.h> +#include <linux/limits.h> + +static unsigned int ifindex; +static __u32 attached_prog_id; + +static void noreturn cleanup(int sig) +{ + DECLARE_LIBBPF_OPTS(bpf_xdp_set_link_opts, opts); + int prog_fd; + int err; + + if (attached_prog_id == 0) + exit(0); + + prog_fd = bpf_prog_get_fd_by_id(attached_prog_id); + if (prog_fd < 0) { + fprintf(stderr, "Error: bpf_prog_get_fd_by_id: %s\n", strerror(-prog_fd)); + err = bpf_set_link_xdp_fd(ifindex, -1, 0); + if (err < 0) { + fprintf(stderr, "Error: bpf_set_link_xdp_fd: %s\n", strerror(-err)); + fprintf(stderr, "Failed to detach XDP program\n"); + exit(1); + } + } else { + opts.old_fd = prog_fd; + err = bpf_set_link_xdp_fd_opts(ifindex, -1, XDP_FLAGS_REPLACE, &opts); + close(prog_fd); + if (err < 0) { + fprintf(stderr, "Error: bpf_set_link_xdp_fd_opts: %s\n", strerror(-err)); + // Not an error if already replaced by someone else. + if (err != -EEXIST) { + fprintf(stderr, "Failed to detach XDP program\n"); + exit(1); + } + } + } + exit(0); +} + +static noreturn void usage(const char *progname) +{ + fprintf(stderr, "Usage: %s [--iface <iface>|--prog <prog_id>] [--mss4 <mss ipv4> --mss6 <mss ipv6> --wscale <wscale> --ttl <ttl>] [--ports <port1>,<port2>,...]\n", + progname); + exit(1); +} + +static unsigned long parse_arg_ul(const char *progname, const char *arg, unsigned long limit) +{ + unsigned long res; + char *endptr; + + errno = 0; + res = strtoul(arg, &endptr, 10); + if (errno != 0 || *endptr != '\0' || arg[0] == '\0' || res > limit) + usage(progname); + + return res; +} + +static void parse_options(int argc, char *argv[], unsigned int *ifindex, __u32 *prog_id, + __u64 *tcpipopts, char **ports) +{ + static struct option long_options[] = { + { "help", no_argument, NULL, 'h' }, + { "iface", required_argument, NULL, 'i' }, + { "prog", required_argument, NULL, 'x' }, + { "mss4", required_argument, NULL, 4 }, + { "mss6", required_argument, NULL, 6 }, + { "wscale", required_argument, NULL, 'w' }, + { "ttl", required_argument, NULL, 't' }, + { "ports", required_argument, NULL, 'p' }, + { NULL, 0, NULL, 0 }, + }; + unsigned long mss4, mss6, wscale, ttl; + unsigned int tcpipopts_mask = 0; + + if (argc < 2) + usage(argv[0]); + + *ifindex = 0; + *prog_id = 0; + *tcpipopts = 0; + *ports = 0; + + while (true) { + int opt; + + opt = getopt_long(argc, argv, "", long_options, NULL); + if (opt == -1) + break; + + switch (opt) { + case 'h': + usage(argv[0]); + break; + case 'i': + *ifindex = if_nametoindex(optarg); + if (*ifindex == 0) + usage(argv[0]); + break; + case 'x': + *prog_id = parse_arg_ul(argv[0], optarg, UINT32_MAX); + if (*prog_id == 0) + usage(argv[0]); + break; + case 4: + mss4 = parse_arg_ul(argv[0], optarg, UINT16_MAX); + tcpipopts_mask |= 1 << 0; + break; + case 6: + mss6 = parse_arg_ul(argv[0], optarg, UINT16_MAX); + tcpipopts_mask |= 1 << 1; + break; + case 'w': + wscale = parse_arg_ul(argv[0], optarg, 14); + tcpipopts_mask |= 1 << 2; + break; + case 't': + ttl = parse_arg_ul(argv[0], optarg, UINT8_MAX); + tcpipopts_mask |= 1 << 3; + break; + case 'p': + *ports = optarg; + break; + default: + usage(argv[0]); + } + } + if (optind < argc) + usage(argv[0]); + + if (tcpipopts_mask == 0xf) { + if (mss4 == 0 || mss6 == 0 || wscale == 0 || ttl == 0) + usage(argv[0]); + *tcpipopts = (mss6 << 32) | (ttl << 24) | (wscale << 16) | mss4; + } else if (tcpipopts_mask != 0) { + usage(argv[0]); + } + + if (*ifindex != 0 && *prog_id != 0) + usage(argv[0]); + if (*ifindex == 0 && *prog_id == 0) + usage(argv[0]); +} + +static int syncookie_attach(const char *argv0, unsigned int ifindex) +{ + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); + char xdp_filename[PATH_MAX]; + struct bpf_object *obj; + int prog_fd; + int err; + + snprintf(xdp_filename, sizeof(xdp_filename), "%s_kern.o", argv0); + err = bpf_prog_load(xdp_filename, BPF_PROG_TYPE_XDP, &obj, &prog_fd); + if (err < 0) { + fprintf(stderr, "Error: bpf_prog_load: %s\n", strerror(-err)); + return err; + } + err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); + if (err < 0) { + fprintf(stderr, "Error: bpf_obj_get_info_by_fd: %s\n", strerror(-err)); + goto out; + } + attached_prog_id = info.id; + signal(SIGINT, cleanup); + signal(SIGTERM, cleanup); + err = bpf_set_link_xdp_fd(ifindex, prog_fd, XDP_FLAGS_UPDATE_IF_NOEXIST); + if (err < 0) { + fprintf(stderr, "Error: bpf_set_link_xdp_fd: %s\n", strerror(-err)); + signal(SIGINT, SIG_DFL); + signal(SIGTERM, SIG_DFL); + attached_prog_id = 0; + goto out; + } + err = 0; +out: + bpf_object__close(obj); + return err; +} + +static int syncookie_open_bpf_maps(__u32 prog_id, int *values_map_fd, int *ports_map_fd) +{ + struct bpf_prog_info prog_info; + __u32 map_ids[3]; + __u32 info_len; + int prog_fd; + int err; + int i; + + *values_map_fd = -1; + *ports_map_fd = -1; + + prog_fd = bpf_prog_get_fd_by_id(prog_id); + if (prog_fd < 0) { + fprintf(stderr, "Error: bpf_prog_get_fd_by_id: %s\n", strerror(-prog_fd)); + return prog_fd; + } + + prog_info = (struct bpf_prog_info) { + .nr_map_ids = 3, + .map_ids = (__u64)map_ids, + }; + info_len = sizeof(prog_info); + + err = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &info_len); + if (err != 0) { + fprintf(stderr, "Error: bpf_obj_get_info_by_fd: %s\n", strerror(-err)); + goto out; + } + + if (prog_info.type != BPF_PROG_TYPE_XDP) { + fprintf(stderr, "Error: BPF prog type is not BPF_PROG_TYPE_XDP\n"); + err = -ENOENT; + goto out; + } + if (prog_info.nr_map_ids != 2) { + fprintf(stderr, "Error: Found %u BPF maps, expected 2\n", + prog_info.nr_map_ids); + err = -ENOENT; + goto out; + } + + for (i = 0; i < prog_info.nr_map_ids; i++) { + struct bpf_map_info map_info = {}; + int map_fd; + + err = bpf_map_get_fd_by_id(map_ids[i]); + if (err < 0) { + fprintf(stderr, "Error: bpf_map_get_fd_by_id: %s\n", strerror(-err)); + goto err_close_map_fds; + } + map_fd = err; + + info_len = sizeof(map_info); + err = bpf_obj_get_info_by_fd(map_fd, &map_info, &info_len); + if (err != 0) { + fprintf(stderr, "Error: bpf_obj_get_info_by_fd: %s\n", strerror(-err)); + close(map_fd); + goto err_close_map_fds; + } + if (strcmp(map_info.name, "values") == 0) { + *values_map_fd = map_fd; + continue; + } + if (strcmp(map_info.name, "allowed_ports") == 0) { + *ports_map_fd = map_fd; + continue; + } + close(map_fd); + goto err_close_map_fds; + } + + err = 0; + goto out; + +err_close_map_fds: + if (*values_map_fd != -1) + close(*values_map_fd); + if (*ports_map_fd != -1) + close(*ports_map_fd); + *values_map_fd = -1; + *ports_map_fd = -1; + +out: + close(prog_fd); + return err; +} + +int main(int argc, char *argv[]) +{ + int values_map_fd, ports_map_fd; + __u64 tcpipopts; + bool firstiter; + __u64 prevcnt; + __u32 prog_id; + char *ports; + int err = 0; + + parse_options(argc, argv, &ifindex, &prog_id, &tcpipopts, &ports); + + if (prog_id == 0) { + err = bpf_get_link_xdp_id(ifindex, &prog_id, 0); + if (err < 0) { + fprintf(stderr, "Error: bpf_get_link_xdp_id: %s\n", strerror(-err)); + goto out; + } + if (prog_id == 0) { + err = syncookie_attach(argv[0], ifindex); + if (err < 0) + goto out; + prog_id = attached_prog_id; + } + } + + err = syncookie_open_bpf_maps(prog_id, &values_map_fd, &ports_map_fd); + if (err < 0) + goto out; + + if (ports) { + __u16 port_last = 0; + __u32 port_idx = 0; + char *p = ports; + + fprintf(stderr, "Replacing allowed ports\n"); + + while (p && *p != '\0') { + char *token = strsep(&p, ","); + __u16 port; + + port = parse_arg_ul(argv[0], token, UINT16_MAX); + err = bpf_map_update_elem(ports_map_fd, &port_idx, &port, BPF_ANY); + if (err != 0) { + fprintf(stderr, "Error: bpf_map_update_elem: %s\n", strerror(-err)); + fprintf(stderr, "Failed to add port %u (index %u)\n", + port, port_idx); + goto out_close_maps; + } + fprintf(stderr, "Added port %u\n", port); + port_idx++; + } + err = bpf_map_update_elem(ports_map_fd, &port_idx, &port_last, BPF_ANY); + if (err != 0) { + fprintf(stderr, "Error: bpf_map_update_elem: %s\n", strerror(-err)); + fprintf(stderr, "Failed to add the terminator value 0 (index %u)\n", + port_idx); + goto out_close_maps; + } + } + + if (tcpipopts) { + __u32 key = 0; + + fprintf(stderr, "Replacing TCP/IP options\n"); + + err = bpf_map_update_elem(values_map_fd, &key, &tcpipopts, BPF_ANY); + if (err != 0) { + fprintf(stderr, "Error: bpf_map_update_elem: %s\n", strerror(-err)); + goto out_close_maps; + } + } + + if ((ports || tcpipopts) && attached_prog_id == 0) + goto out_close_maps; + + prevcnt = 0; + firstiter = true; + while (true) { + __u32 key = 1; + __u64 value; + + err = bpf_map_lookup_elem(values_map_fd, &key, &value); + if (err != 0) { + fprintf(stderr, "Error: bpf_map_lookup_elem: %s\n", strerror(-err)); + goto out_close_maps; + } + if (firstiter) { + prevcnt = value; + firstiter = false; + } + printf("SYNACKs generated: %llu (total %llu)\n", value - prevcnt, value); + prevcnt = value; + sleep(1); + } + +out_close_maps: + close(values_map_fd); + close(ports_map_fd); +out: + return err == 0 ? 0 : 1; +} -- 2.30.2