This patch adds a new kfunc available at the TC hook to support arbitrary
SYN Cookies.

The basic usage is as follows:

	struct tcp_cookie_attributes attr = {
		.tcp_opt = {
			.mss_clamp = mss,
			.wscale_ok = wscale_ok,
			.snd_wscale = send_scale, /* < 15 */
			.tstamp_ok = tstamp_ok,
			.sack_ok = sack_ok,
		},
		.ecn_ok = ecn_ok,
		.usec_ts_ok = usec_ts_ok,
	};

	skc = bpf_skc_lookup_tcp(...);
	sk = (struct sock *)bpf_skc_to_tcp_sock(skc);
	bpf_sk_assign_tcp_reqsk(skb, sk, &attr, sizeof(attr));
	bpf_sk_release(skc);

bpf_sk_assign_tcp_reqsk() takes an skb, a listener sk, and struct
tcp_cookie_attributes, then allocates a reqsk and configures it.  After
that, bpf_sk_assign_tcp_reqsk() links the reqsk with the skb and the
listener.

The notable point here is that we do not hold a refcnt for either the
reqsk or the listener.  To mark that, we set reqsk->syncookie, which is
only used on the TX path for now.  So, if reqsk->syncookie is 1 on the
RX path, it means that the reqsk was allocated by the kfunc.

When the skb is freed, sock_pfree() checks whether reqsk->syncookie is 1,
and in that case, we set reqsk->rsk_listener to NULL before calling
reqsk_free(), as the reqsk does not hold a refcnt on the listener.

When the TCP stack looks up a socket from the skb, we return
inet_reqsk(skb->sk)->rsk_listener in inet6?_steal_sock().  However, we do
not clear skb->sk and skb->destructor so that we can carry the reqsk to
cookie_v[46]_check().

The refcnt of the reqsk is finally set to 1 in tcp_get_cookie_sock()
after a full sk is created.

Note that we can extend struct tcp_cookie_attributes in the future if we
add a new attribute that is determined during the 3WHS.

Signed-off-by: Kuniyuki Iwashima <kuniyu@xxxxxxxxxx>
---
 include/net/inet6_hashtables.h | 16 +++++-
 include/net/inet_hashtables.h  | 16 +++++-
 include/net/tcp.h              |  6 +++
 net/core/filter.c              | 98 +++++++++++++++++++++++++++++++++-
 net/core/sock.c                | 14 ++++-
 5 files changed, 144 insertions(+), 6 deletions(-)

diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index 533a7337865a..9a67f47a5e64 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -116,9 +116,23 @@ struct sock *inet6_steal_sock(struct net *net, struct sk_buff *skb, int doff,
 	if (!sk)
 		return NULL;
 
-	if (!prefetched || !sk_fullsock(sk))
+	if (!prefetched)
 		return sk;
 
+	if (sk->sk_state == TCP_NEW_SYN_RECV) {
+#if IS_ENABLED(CONFIG_SYN_COOKIE)
+		if (inet_reqsk(sk)->syncookie) {
+			*refcounted = false;
+			skb->sk = sk;
+			skb->destructor = sock_pfree;
+			return inet_reqsk(sk)->rsk_listener;
+		}
+#endif
+		return sk;
+	} else if (sk->sk_state == TCP_TIME_WAIT) {
+		return sk;
+	}
+
 	if (sk->sk_protocol == IPPROTO_TCP) {
 		if (sk->sk_state != TCP_LISTEN)
 			return sk;
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 3ecfeadbfa06..36609656a047 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -462,9 +462,23 @@ struct sock *inet_steal_sock(struct net *net, struct sk_buff *skb, int doff,
 	if (!sk)
 		return NULL;
 
-	if (!prefetched || !sk_fullsock(sk))
+	if (!prefetched)
 		return sk;
 
+	if (sk->sk_state == TCP_NEW_SYN_RECV) {
+#if IS_ENABLED(CONFIG_SYN_COOKIE)
+		if (inet_reqsk(sk)->syncookie) {
+			*refcounted = false;
+			skb->sk = sk;
+			skb->destructor = sock_pfree;
+			return inet_reqsk(sk)->rsk_listener;
+		}
+#endif
+		return sk;
+	} else if (sk->sk_state == TCP_TIME_WAIT) {
+		return sk;
+	}
+
 	if (sk->sk_protocol == IPPROTO_TCP) {
 		if (sk->sk_state != TCP_LISTEN)
 			return sk;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 842791997f30..373afcfaefa6 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -591,6 +591,12 @@ static inline bool cookie_ecn_ok(const struct net *net, const struct dst_entry *
 }
 
 #if IS_ENABLED(CONFIG_BPF)
+struct tcp_cookie_attributes {
+	struct tcp_options_received tcp_opt;
+	bool ecn_ok;
+	bool usec_ts_ok;
+} __packed;
+
 static inline bool cookie_bpf_ok(struct sk_buff *skb)
 {
 	return skb->sk;
diff --git a/net/core/filter.c b/net/core/filter.c
index d64baa7ac6cd..7beba469e8a7 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -11807,6 +11807,90 @@ __bpf_kfunc int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern,
 
 	return 0;
 }
+
+#if IS_ENABLED(CONFIG_SYN_COOKIE)
+__bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct sk_buff *skb, struct sock *sk,
+					struct tcp_cookie_attributes *attr,
+					int attr__sz)
+{
+	const struct request_sock_ops *ops;
+	struct inet_request_sock *ireq;
+	struct tcp_request_sock *treq;
+	struct request_sock *req;
+	__u16 min_mss;
+
+	if (attr__sz != sizeof(*attr))
+		return -EINVAL;
+
+	if (!sk)
+		return -EINVAL;
+
+	if (!skb_at_tc_ingress(skb))
+		return -EINVAL;
+
+	if (dev_net(skb->dev) != sock_net(sk))
+		return -ENETUNREACH;
+
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		ops = &tcp_request_sock_ops;
+		min_mss = 536;
+		break;
+#if IS_BUILTIN(CONFIG_IPV6)
+	case htons(ETH_P_IPV6):
+		ops = &tcp6_request_sock_ops;
+		min_mss = IPV6_MIN_MTU - 60;
+		break;
+#endif
+	default:
+		return -EINVAL;
+	}
+
+	if (sk->sk_type != SOCK_STREAM || sk->sk_state != TCP_LISTEN)
+		return -EINVAL;
+
+	if (attr->tcp_opt.mss_clamp < min_mss) {
+		__NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED);
+		return -EINVAL;
+	}
+
+	if (attr->tcp_opt.wscale_ok &&
+	    attr->tcp_opt.snd_wscale > TCP_MAX_WSCALE) {
+		__NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED);
+		return -EINVAL;
+	}
+
+	if (sk_is_mptcp(sk))
+		req = mptcp_subflow_reqsk_alloc(ops, sk, false);
+	else
+		req = inet_reqsk_alloc(ops, sk, false);
+
+	if (!req)
+		return -ENOMEM;
+
+	ireq = inet_rsk(req);
+	treq = tcp_rsk(req);
+
+	req->syncookie = 1;
+	req->rsk_listener = sk;
+	req->mss = attr->tcp_opt.mss_clamp;
+
+	ireq->snd_wscale = attr->tcp_opt.snd_wscale;
+	ireq->wscale_ok = attr->tcp_opt.wscale_ok;
+	ireq->tstamp_ok = attr->tcp_opt.tstamp_ok;
+	ireq->sack_ok = attr->tcp_opt.sack_ok;
+	ireq->ecn_ok = attr->ecn_ok;
+
+	treq->req_usec_ts = attr->usec_ts_ok;
+
+	skb_orphan(skb);
+	skb->sk = req_to_sk(req);
+	skb->destructor = sock_pfree;
+
+	return 0;
+}
+#endif
+
 __bpf_kfunc_end_defs();
 
 int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags,
@@ -11835,6 +11919,10 @@ BTF_SET8_START(bpf_kfunc_check_set_sock_addr)
 BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path)
 BTF_SET8_END(bpf_kfunc_check_set_sock_addr)
 
+BTF_SET8_START(bpf_kfunc_check_set_tcp_reqsk)
+BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk)
+BTF_SET8_END(bpf_kfunc_check_set_tcp_reqsk)
+
 static const struct btf_kfunc_id_set bpf_kfunc_set_skb = {
 	.owner = THIS_MODULE,
 	.set = &bpf_kfunc_check_set_skb,
@@ -11850,6 +11938,11 @@ static const struct btf_kfunc_id_set bpf_kfunc_set_sock_addr = {
 	.set = &bpf_kfunc_check_set_sock_addr,
 };
 
+static const struct btf_kfunc_id_set bpf_kfunc_set_tcp_reqsk = {
+	.owner = THIS_MODULE,
+	.set = &bpf_kfunc_check_set_tcp_reqsk,
+};
+
 static int __init bpf_kfunc_init(void)
 {
 	int ret;
@@ -11865,8 +11958,9 @@ static int __init bpf_kfunc_init(void)
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
-	return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
-					       &bpf_kfunc_set_sock_addr);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+					       &bpf_kfunc_set_sock_addr);
+	return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk);
 }
 late_initcall(bpf_kfunc_init);
 
diff --git a/net/core/sock.c b/net/core/sock.c
index fef349dd72fa..998950e97dfe 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2579,8 +2579,18 @@ EXPORT_SYMBOL(sock_efree);
 #ifdef CONFIG_INET
 void sock_pfree(struct sk_buff *skb)
 {
-	if (sk_is_refcounted(skb->sk))
-		sock_gen_put(skb->sk);
+	struct sock *sk = skb->sk;
+
+	if (!sk_is_refcounted(sk))
+		return;
+
+	if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
+		inet_reqsk(sk)->rsk_listener = NULL;
+		reqsk_free(inet_reqsk(sk));
+		return;
+	}
+
+	sock_gen_put(sk);
 }
 EXPORT_SYMBOL(sock_pfree);
 #endif /* CONFIG_INET */
-- 
2.30.2
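
For illustration only (not part of this patch), below is a minimal sketch of
an IPv4-only tc-ingress program that calls the new kfunc, following the usage
outlined in the commit message.  The cookie validation/decoding step is
omitted, and the hardcoded attr values, SERVER_PORT, and the program name are
assumptions made for the example; it also assumes struct tcp_cookie_attributes
is visible through vmlinux.h once this patch is applied.

	// SPDX-License-Identifier: GPL-2.0
	/* Hypothetical usage sketch: accept the bare ACK that completes a SYN
	 * Cookie handshake on an assumed port and hand the packet to the kernel
	 * with a kfunc-allocated reqsk.
	 */
	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_endian.h>

	#define ETH_P_IP	0x0800
	#define TC_ACT_OK	0
	#define SERVER_PORT	443	/* assumed example port */

	extern int bpf_sk_assign_tcp_reqsk(struct __sk_buff *skb, struct sock *sk,
					   struct tcp_cookie_attributes *attr,
					   int attr__sz) __ksym;

	SEC("tc")
	int assign_cookie_reqsk(struct __sk_buff *skb)
	{
		void *data_end = (void *)(long)skb->data_end;
		void *data = (void *)(long)skb->data;
		struct tcp_cookie_attributes attr = {};
		struct bpf_sock_tuple tuple = {};
		struct ethhdr *eth = data;
		struct bpf_sock *skc;
		struct tcp_sock *tp;
		struct iphdr *iph;
		struct tcphdr *th;

		if ((void *)(eth + 1) > data_end || eth->h_proto != bpf_htons(ETH_P_IP))
			return TC_ACT_OK;

		iph = (void *)(eth + 1);
		if ((void *)(iph + 1) > data_end || iph->protocol != IPPROTO_TCP ||
		    iph->ihl != 5)
			return TC_ACT_OK;

		th = (void *)(iph + 1);
		if ((void *)(th + 1) > data_end)
			return TC_ACT_OK;

		/* Only the bare ACK that finishes the 3WHS is of interest. */
		if (th->syn || !th->ack || th->rst || th->fin ||
		    th->dest != bpf_htons(SERVER_PORT))
			return TC_ACT_OK;

		tuple.ipv4.saddr = iph->saddr;
		tuple.ipv4.daddr = iph->daddr;
		tuple.ipv4.sport = th->source;
		tuple.ipv4.dport = th->dest;

		/* Find the listener; no established socket exists for this flow yet. */
		skc = bpf_skc_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
					 BPF_F_CURRENT_NETNS, 0);
		if (!skc)
			return TC_ACT_OK;

		tp = bpf_skc_to_tcp_sock(skc);
		if (!tp)
			goto release;

		/* Placeholder values: a real program decodes these from the cookie
		 * carried in th->ack_seq (and the TS ecr value, if timestamps are used).
		 */
		attr.tcp_opt.mss_clamp = 1460;
		attr.tcp_opt.wscale_ok = 1;
		attr.tcp_opt.snd_wscale = 7;	/* < 15 */
		attr.tcp_opt.tstamp_ok = 0;
		attr.tcp_opt.sack_ok = 0;
		attr.ecn_ok = 0;
		attr.usec_ts_ok = 0;

		/* On success the skb now carries the reqsk and the TCP stack picks it
		 * up in cookie_v4_check(); on failure the packet continues unmodified.
		 */
		bpf_sk_assign_tcp_reqsk(skb, (struct sock *)tp, &attr, sizeof(attr));

	release:
		bpf_sk_release(skc);
		return TC_ACT_OK;
	}

	char _license[] SEC("license") = "GPL";

A complete implementation would also answer incoming SYNs with a SYN+ACK
carrying the encoded cookie (e.g. from XDP or an earlier tc program) and
validate the cookie in the ACK before calling the kfunc.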