[PATCH bpf-next 07/10] bpf: Add helpers to query conntrack info

Maxim Mikityanskiy <maximmi@xxxxxxxxxx> · Tue, 19 Oct 2021 17:46:52 +0300

The new helpers (bpf_ct_lookup_tcp and bpf_ct_lookup_udp) allow querying
connection tracking information of TCP and UDP connections based on
source and destination IP address and port. Each helper returns a pointer
to struct nf_conn (if the conntrack entry was found), which needs to be
released with bpf_ct_release.

Signed-off-by: Maxim Mikityanskiy <maximmi@xxxxxxxxxx>
Reviewed-by: Tariq Toukan <tariqt@xxxxxxxxxx>
---
 include/uapi/linux/bpf.h       |  81 +++++++++++++
 kernel/bpf/verifier.c          |   9 +-
 net/core/filter.c              | 205 +++++++++++++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h |  81 +++++++++++++
 4 files changed, 373 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a10a44c4f79b..883de3f1bb8b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4925,6 +4925,79 @@ union bpf_attr {
  *	Return
  *		The number of bytes written to the buffer, or a negative error
  *		in case of failure.
+ *
+ * struct bpf_nf_conn *bpf_ct_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 *flags_err)
+ *	Description
+ *		Look for conntrack info for a TCP connection matching *tuple*,
+ *		optionally in a child network namespace *netns*.
+ *
+ *		The *flags_err* argument is used as an input parameter for flags
+ *		and output parameter for the error code. The flags can be a
+ *		combination of one or more of the following values:
+ *
+ *		**BPF_F_CT_DIR_REPLY**
+ *			When set, the conntrack direction is IP_CT_DIR_REPLY,
+ *			otherwise IP_CT_DIR_ORIGINAL.
+ *
+ *		If the function returns **NULL**, *flags_err* is overwritten
+ *		with one of the following negative error codes:
+ *
+ *		**-EAFNOSUPPORT**
+ *			*tuple_size* doesn't match supported address families
+ *			(AF_INET; AF_INET6 when CONFIG_IPV6 is enabled).
+ *
+ *		**-EINVAL**
+ *			Input arguments are not valid.
+ *
+ *		**-ENOENT**
+ *			Connection tracking entry for *tuple* wasn't found.
+ *
+ *		This helper is available only if the kernel was compiled with
+ *		**CONFIG_NF_CONNTRACK** configuration option as built-in.
+ *	Return
+ *		Pointer to **struct bpf_nf_conn** that must be released with
+ *		**bpf_ct_release**\ (), or **NULL** in case of failure or if
+ *		there is no conntrack entry for this tuple.
+ *
+ * struct bpf_nf_conn *bpf_ct_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 *flags_err)
+ *	Description
+ *		Look for conntrack info for a UDP connection matching *tuple*,
+ *		optionally in a child network namespace *netns*.
+ *
+ *		The *flags_err* argument is used as an input parameter for flags
+ *		and output parameter for the error code. The flags can be a
+ *		combination of one or more of the following values:
+ *
+ *		**BPF_F_CT_DIR_REPLY**
+ *			When set, the conntrack direction is IP_CT_DIR_REPLY,
+ *			otherwise IP_CT_DIR_ORIGINAL.
+ *
+ *		If the function returns **NULL**, *flags_err* is overwritten
+ *		with one of the following negative error codes:
+ *
+ *		**-EAFNOSUPPORT**
+ *			*tuple_size* doesn't match supported address families
+ *			(AF_INET; AF_INET6 when CONFIG_IPV6 is enabled).
+ *
+ *		**-EINVAL**
+ *			Input arguments are not valid.
+ *
+ *		**-ENOENT**
+ *			Connection tracking entry for *tuple* wasn't found.
+ *
+ *		This helper is available only if the kernel was compiled with
+ *		**CONFIG_NF_CONNTRACK** configuration option as built-in.
+ *	Return
+ *		Pointer to **struct bpf_nf_conn** that must be released with
+ *		**bpf_ct_release**\ (), or **NULL** in case of failure or if
+ *		there is no conntrack entry for this tuple.
+ *
+ * long bpf_ct_release(void *ct)
+ *	Description
+ *		Release the reference held by *ct*. *ct* must be a non-**NULL**
+ *		pointer that was returned from **bpf_ct_lookup_xxx**\ ().
+ *	Return
+ *		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5105,6 +5178,9 @@ union bpf_attr {
 	FN(task_pt_regs),		\
 	FN(get_branch_snapshot),	\
 	FN(trace_vprintk),		\
+	FN(ct_lookup_tcp),		\
+	FN(ct_lookup_udp),		\
+	FN(ct_release),			\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -5288,6 +5364,11 @@ enum {
 	BPF_F_EXCLUDE_INGRESS	= (1ULL << 4),
 };
 
+/* Flags for bpf_ct_lookup_{tcp,udp} helpers. */
+enum {
+	BPF_F_CT_DIR_REPLY	= (1ULL << 0),
+};
+
 #define __bpf_md_ptr(type, name)	\
 union {					\
 	type name;			\
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6eafef35e247..23e2a23ca9c4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -506,7 +506,8 @@ static bool is_release_function(enum bpf_func_id func_id)
 {
 	return func_id == BPF_FUNC_sk_release ||
 	       func_id == BPF_FUNC_ringbuf_submit ||
-	       func_id == BPF_FUNC_ringbuf_discard;
+	       func_id == BPF_FUNC_ringbuf_discard ||
+	       func_id == BPF_FUNC_ct_release;
 }
 
 static bool may_be_acquire_function(enum bpf_func_id func_id)
@@ -515,7 +516,9 @@ static bool may_be_acquire_function(enum bpf_func_id func_id)
 		func_id == BPF_FUNC_sk_lookup_udp ||
 		func_id == BPF_FUNC_skc_lookup_tcp ||
 		func_id == BPF_FUNC_map_lookup_elem ||
-	        func_id == BPF_FUNC_ringbuf_reserve;
+		func_id == BPF_FUNC_ringbuf_reserve ||
+		func_id == BPF_FUNC_ct_lookup_tcp ||
+		func_id == BPF_FUNC_ct_lookup_udp;
 }
 
 static bool is_acquire_function(enum bpf_func_id func_id,
@@ -526,7 +529,9 @@ static bool is_acquire_function(enum bpf_func_id func_id,
 	if (func_id == BPF_FUNC_sk_lookup_tcp ||
 	    func_id == BPF_FUNC_sk_lookup_udp ||
 	    func_id == BPF_FUNC_skc_lookup_tcp ||
-	    func_id == BPF_FUNC_ringbuf_reserve)
+	    func_id == BPF_FUNC_ringbuf_reserve ||
+	    func_id == BPF_FUNC_ct_lookup_tcp ||
+	    func_id == BPF_FUNC_ct_lookup_udp)
 		return true;
 
 	if (func_id == BPF_FUNC_map_lookup_elem &&
diff --git a/net/core/filter.c b/net/core/filter.c
index d2d07ccae599..f913851c97f7 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -79,6 +79,7 @@
 #include <net/tls.h>
 #include <net/xdp.h>
 #include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
 
 static const struct bpf_func_proto *
 bpf_sk_base_func_proto(enum bpf_func_id func_id);
@@ -7096,6 +7097,194 @@ static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+#if IS_BUILTIN(CONFIG_NF_CONNTRACK)
+static struct nf_conn *bpf_ct_lookup(struct net *caller_net,
+				     struct bpf_sock_tuple *tuple,
+				     u32 tuple_len,
+				     u8 protonum,
+				     u64 netns_id,
+				     u64 flags)
+{
+	struct nf_conntrack_tuple ct_tuple = {};
+	struct nf_conntrack_tuple_hash *found;
+	struct net *net;
+	u8 direction;
+
+	direction = flags & BPF_F_CT_DIR_REPLY ? IP_CT_DIR_REPLY :
+						 IP_CT_DIR_ORIGINAL;
+
+	if (flags & ~BPF_F_CT_DIR_REPLY)
+		return ERR_PTR(-EINVAL);
+
+	if (tuple_len == sizeof(tuple->ipv4)) {
+		ct_tuple.src.l3num = AF_INET;
+		ct_tuple.src.u3.ip = tuple->ipv4.saddr;
+		ct_tuple.src.u.tcp.port = tuple->ipv4.sport;
+		ct_tuple.dst.u3.ip = tuple->ipv4.daddr;
+		ct_tuple.dst.u.tcp.port = tuple->ipv4.dport;
+#if IS_ENABLED(CONFIG_IPV6)
+	} else if (tuple_len == sizeof(tuple->ipv6)) {
+		ct_tuple.src.l3num = AF_INET6;
+		memcpy(ct_tuple.src.u3.ip6, tuple->ipv6.saddr,
+		       sizeof(tuple->ipv6.saddr));
+		ct_tuple.src.u.tcp.port = tuple->ipv6.sport;
+		memcpy(ct_tuple.dst.u3.ip6, tuple->ipv6.daddr,
+		       sizeof(tuple->ipv6.daddr));
+		ct_tuple.dst.u.tcp.port = tuple->ipv6.dport;
+#endif
+	} else {
+		return ERR_PTR(-EAFNOSUPPORT);
+	}
+
+	ct_tuple.dst.protonum = protonum;
+	ct_tuple.dst.dir = direction;
+
+	net = caller_net;
+	if ((s32)netns_id >= 0) {
+		if (unlikely(netns_id > S32_MAX))
+			return ERR_PTR(-EINVAL);
+		net = get_net_ns_by_id(net, netns_id);
+		if (!net)
+			return ERR_PTR(-EINVAL);
+	}
+
+	found = nf_conntrack_find_get(net, &nf_ct_zone_dflt, &ct_tuple);
+
+	if ((s32)netns_id >= 0)
+		put_net(net);
+
+	if (!found)
+		return ERR_PTR(-ENOENT);
+	return nf_ct_tuplehash_to_ctrack(found);
+}
+
+BPF_CALL_5(bpf_xdp_ct_lookup_tcp, struct xdp_buff *, ctx,
+	   struct bpf_sock_tuple *, tuple, u32, tuple_len,
+	   u64, netns_id, u64 *, flags_err)
+{
+	struct nf_conn *ct;
+
+	ct = bpf_ct_lookup(dev_net(ctx->rxq->dev), tuple, tuple_len,
+			   IPPROTO_TCP, netns_id, *flags_err);
+	if (IS_ERR(ct)) {
+		*flags_err = PTR_ERR(ct);
+		return (unsigned long)NULL;
+	}
+	return (unsigned long)ct;
+}
+
+static const struct bpf_func_proto bpf_xdp_ct_lookup_tcp_proto = {
+	.func		= bpf_xdp_ct_lookup_tcp,
+	.gpl_only	= true, /* nf_conntrack_find_get is GPL */
+	.pkt_access	= true,
+	.ret_type	= RET_PTR_TO_NF_CONN_OR_NULL,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+	.arg5_type	= ARG_PTR_TO_LONG,
+};
+
+BPF_CALL_5(bpf_xdp_ct_lookup_udp, struct xdp_buff *, ctx,
+	   struct bpf_sock_tuple *, tuple, u32, tuple_len,
+	   u64, netns_id, u64 *, flags_err)
+{
+	struct nf_conn *ct;
+
+	ct = bpf_ct_lookup(dev_net(ctx->rxq->dev), tuple, tuple_len,
+			   IPPROTO_UDP, netns_id, *flags_err);
+	if (IS_ERR(ct)) {
+		*flags_err = PTR_ERR(ct);
+		return (unsigned long)NULL;
+	}
+	return (unsigned long)ct;
+}
+
+static const struct bpf_func_proto bpf_xdp_ct_lookup_udp_proto = {
+	.func		= bpf_xdp_ct_lookup_udp,
+	.gpl_only	= true, /* nf_conntrack_find_get is GPL */
+	.pkt_access	= true,
+	.ret_type	= RET_PTR_TO_NF_CONN_OR_NULL,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+	.arg5_type	= ARG_PTR_TO_LONG,
+};
+
+BPF_CALL_5(bpf_skb_ct_lookup_tcp, struct sk_buff *, skb,
+	   struct bpf_sock_tuple *, tuple, u32, tuple_len,
+	   u64, netns_id, u64 *, flags_err)
+{
+	struct net *caller_net;
+	struct nf_conn *ct;
+
+	caller_net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk);
+	ct = bpf_ct_lookup(caller_net, tuple, tuple_len, IPPROTO_TCP,
+			   netns_id, *flags_err);
+	if (IS_ERR(ct)) {
+		*flags_err = PTR_ERR(ct);
+		return (unsigned long)NULL;
+	}
+	return (unsigned long)ct;
+}
+
+static const struct bpf_func_proto bpf_skb_ct_lookup_tcp_proto = {
+	.func		= bpf_skb_ct_lookup_tcp,
+	.gpl_only	= true, /* nf_conntrack_find_get is GPL */
+	.pkt_access	= true,
+	.ret_type	= RET_PTR_TO_NF_CONN_OR_NULL,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+	.arg5_type	= ARG_PTR_TO_LONG,
+};
+
+BPF_CALL_5(bpf_skb_ct_lookup_udp, struct sk_buff *, skb,
+	   struct bpf_sock_tuple *, tuple, u32, tuple_len,
+	   u64, netns_id, u64 *, flags_err)
+{
+	struct net *caller_net;
+	struct nf_conn *ct;
+
+	caller_net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk);
+	ct = bpf_ct_lookup(caller_net, tuple, tuple_len, IPPROTO_UDP,
+			   netns_id, *flags_err);
+	if (IS_ERR(ct)) {
+		*flags_err = PTR_ERR(ct);
+		return (unsigned long)NULL;
+	}
+	return (unsigned long)ct;
+}
+
+static const struct bpf_func_proto bpf_skb_ct_lookup_udp_proto = {
+	.func		= bpf_skb_ct_lookup_udp,
+	.gpl_only	= true, /* nf_conntrack_find_get is GPL */
+	.pkt_access	= true,
+	.ret_type	= RET_PTR_TO_NF_CONN_OR_NULL,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+	.arg5_type	= ARG_PTR_TO_LONG,
+};
+
+BPF_CALL_1(bpf_ct_release, struct nf_conn *, ct)
+{
+	nf_ct_put(ct);
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_ct_release_proto = {
+	.func		= bpf_ct_release,
+	.gpl_only	= false,
+	.pkt_access	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_NF_CONN,
+};
+#endif
+
 #endif /* CONFIG_INET */
 
 bool bpf_helper_changes_pkt_data(void *func)
@@ -7455,6 +7644,14 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_tcp_gen_syncookie_proto;
 	case BPF_FUNC_sk_assign:
 		return &bpf_sk_assign_proto;
+#if IS_BUILTIN(CONFIG_NF_CONNTRACK)
+	case BPF_FUNC_ct_lookup_tcp:
+		return &bpf_skb_ct_lookup_tcp_proto;
+	case BPF_FUNC_ct_lookup_udp:
+		return &bpf_skb_ct_lookup_udp_proto;
+	case BPF_FUNC_ct_release:
+		return &bpf_ct_release_proto;
+#endif
 #endif
 	default:
 		return bpf_sk_base_func_proto(func_id);
@@ -7498,6 +7695,14 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_tcp_check_syncookie_proto;
 	case BPF_FUNC_tcp_gen_syncookie:
 		return &bpf_tcp_gen_syncookie_proto;
+#if IS_BUILTIN(CONFIG_NF_CONNTRACK)
+	case BPF_FUNC_ct_lookup_tcp:
+		return &bpf_xdp_ct_lookup_tcp_proto;
+	case BPF_FUNC_ct_lookup_udp:
+		return &bpf_xdp_ct_lookup_udp_proto;
+	case BPF_FUNC_ct_release:
+		return &bpf_ct_release_proto;
+#endif
 #endif
 	default:
 		return bpf_sk_base_func_proto(func_id);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index a10a44c4f79b..883de3f1bb8b 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -4925,6 +4925,79 @@ union bpf_attr {
  *	Return
  *		The number of bytes written to the buffer, or a negative error
  *		in case of failure.
+ *
+ * struct bpf_nf_conn *bpf_ct_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 *flags_err)
+ *	Description
+ *		Look for conntrack info for a TCP connection matching *tuple*,
+ *		optionally in a child network namespace *netns*.
+ *
+ *		The *flags_err* argument is used as an input parameter for flags
+ *		and output parameter for the error code. The flags can be a
+ *		combination of one or more of the following values:
+ *
+ *		**BPF_F_CT_DIR_REPLY**
+ *			When set, the conntrack direction is IP_CT_DIR_REPLY,
+ *			otherwise IP_CT_DIR_ORIGINAL.
+ *
+ *		If the function returns **NULL**, *flags_err* is overwritten
+ *		with one of the following negative error codes:
+ *
+ *		**-EAFNOSUPPORT**
+ *			*tuple_size* doesn't match supported address families
+ *			(AF_INET; AF_INET6 when CONFIG_IPV6 is enabled).
+ *
+ *		**-EINVAL**
+ *			Input arguments are not valid.
+ *
+ *		**-ENOENT**
+ *			Connection tracking entry for *tuple* wasn't found.
+ *
+ *		This helper is available only if the kernel was compiled with
+ *		**CONFIG_NF_CONNTRACK** configuration option as built-in.
+ *	Return
+ *		Pointer to **struct bpf_nf_conn** that must be released with
+ *		**bpf_ct_release**\ (), or **NULL** in case of failure or if
+ *		there is no conntrack entry for this tuple.
+ *
+ * struct bpf_nf_conn *bpf_ct_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 *flags_err)
+ *	Description
+ *		Look for conntrack info for a UDP connection matching *tuple*,
+ *		optionally in a child network namespace *netns*.
+ *
+ *		The *flags_err* argument is used as an input parameter for flags
+ *		and output parameter for the error code. The flags can be a
+ *		combination of one or more of the following values:
+ *
+ *		**BPF_F_CT_DIR_REPLY**
+ *			When set, the conntrack direction is IP_CT_DIR_REPLY,
+ *			otherwise IP_CT_DIR_ORIGINAL.
+ *
+ *		If the function returns **NULL**, *flags_err* is overwritten
+ *		with one of the following negative error codes:
+ *
+ *		**-EAFNOSUPPORT**
+ *			*tuple_size* doesn't match supported address families
+ *			(AF_INET; AF_INET6 when CONFIG_IPV6 is enabled).
+ *
+ *		**-EINVAL**
+ *			Input arguments are not valid.
+ *
+ *		**-ENOENT**
+ *			Connection tracking entry for *tuple* wasn't found.
+ *
+ *		This helper is available only if the kernel was compiled with
+ *		**CONFIG_NF_CONNTRACK** configuration option as built-in.
+ *	Return
+ *		Pointer to **struct bpf_nf_conn** that must be released with
+ *		**bpf_ct_release**\ (), or **NULL** in case of failure or if
+ *		there is no conntrack entry for this tuple.
+ *
+ * long bpf_ct_release(void *ct)
+ *	Description
+ *		Release the reference held by *ct*. *ct* must be a non-**NULL**
+ *		pointer that was returned from **bpf_ct_lookup_xxx**\ ().
+ *	Return
+ *		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5105,6 +5178,9 @@ union bpf_attr {
 	FN(task_pt_regs),		\
 	FN(get_branch_snapshot),	\
 	FN(trace_vprintk),		\
+	FN(ct_lookup_tcp),		\
+	FN(ct_lookup_udp),		\
+	FN(ct_release),			\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -5288,6 +5364,11 @@ enum {
 	BPF_F_EXCLUDE_INGRESS	= (1ULL << 4),
 };
 
+/* Flags for bpf_ct_lookup_{tcp,udp} helpers. */
+enum {
+	BPF_F_CT_DIR_REPLY	= (1ULL << 0),
+};
+
 #define __bpf_md_ptr(type, name)	\
 union {					\
 	type name;			\
-- 
2.30.2