[PATCH RFC bpf-next v1 5/6] net: netfilter: Add unstable CT lookup helper for XDP and TC-BPF

Kumar Kartikeya Dwivedi <memxor@xxxxxxxxx> · Sat, 30 Oct 2021 20:16:08 +0530

This change adds conntrack lookup helpers using the unstable kfunc call
interface for the XDP and TC-BPF hooks.

Also add acquire/release functions (randomly returning NULL), and also
exercise the RET_PTR_TO_BTF_ID_OR_NULL path so that BPF program caller
has to check for NULL before dereferencing the pointer, for the TC hook.
These will be used in selftest.

Export get_net_ns_by_id and btf_type_by_id as nf_conntrack needs to call
them.

[ NOTE: Currently the btf_type check does not work, due to the problem
described in the thread mentioned in the comments ]

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@xxxxxxxxx>
---
 include/linux/bpf.h               |  22 +++
 include/linux/btf.h               |   1 +
 kernel/bpf/btf.c                  |   2 +
 net/bpf/test_run.c                |  55 +++++++
 net/core/filter.c                 |  56 +++++++
 net/core/net_namespace.c          |   1 +
 net/netfilter/nf_conntrack_core.c | 255 ++++++++++++++++++++++++++++++
 7 files changed, 392 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 6fb34f0a2758..b4f82aaf68bd 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1647,6 +1647,10 @@ int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog,
 				const union bpf_attr *kattr,
 				union bpf_attr __user *uattr);
 bool bpf_prog_test_check_kfunc_call(u32 kfunc_id, struct module *owner);
+bool bpf_prog_test_is_acquire_kfunc(u32 kfunc_id, struct module *owner);
+bool bpf_prog_test_is_release_kfunc(u32 kfunc_id, struct module *owner);
+enum bpf_return_type bpf_prog_test_get_kfunc_return_type(u32 kfunc_id,
+							 struct module *owner);
 bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 		    const struct bpf_prog *prog,
 		    struct bpf_insn_access_aux *info);
@@ -1874,6 +1878,24 @@ static inline bool bpf_prog_test_check_kfunc_call(u32 kfunc_id,
 	return false;
 }
 
+static inline bool bpf_prog_test_is_acquire_kfunc(u32 kfunc_id,
+						  struct module *owner)
+{
+	return false;
+}
+
+static inline bool bpf_prog_test_is_release_kfunc(u32 kfunc_id,
+						  struct module *owner)
+{
+	return false;
+}
+
+static inline enum bpf_return_type
+bpf_prog_test_get_kfunc_return_type(u32 kfunc_id, struct module *owner)
+{
+	return __BPF_RET_TYPE_MAX;
+}
+
 static inline void bpf_map_put(struct bpf_map *map)
 {
 }
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 464f22bf7d5f..87019548e37c 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -321,5 +321,6 @@ static inline int bpf_btf_mod_struct_access(struct kfunc_btf_id_list *klist,
 
 extern struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list;
 extern struct kfunc_btf_id_list prog_test_kfunc_list;
+extern struct kfunc_btf_id_list xdp_kfunc_list;
 
 #endif
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 8b3c15f4359d..d0e5101b8c2d 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -735,6 +735,7 @@ const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
 		return NULL;
 	return btf->types[type_id];
 }
+EXPORT_SYMBOL_GPL(btf_type_by_id);
 
 /*
  * Regular int is not a bit field and it must be either
@@ -6502,3 +6503,4 @@ int bpf_btf_mod_struct_access(struct kfunc_btf_id_list *klist,
 
 DEFINE_KFUNC_BTF_ID_LIST(bpf_tcp_ca_kfunc_list);
 DEFINE_KFUNC_BTF_ID_LIST(prog_test_kfunc_list);
+DEFINE_KFUNC_BTF_ID_LIST(xdp_kfunc_list);
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 46dd95755967..a678ddc97e0f 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -232,6 +232,28 @@ struct sock * noinline bpf_kfunc_call_test3(struct sock *sk)
 	return sk;
 }
 
+struct prog_test_ref_kfunc {
+	int a;
+	int b;
+};
+
+static struct prog_test_ref_kfunc prog_test_struct;
+
+noinline struct prog_test_ref_kfunc *bpf_kfunc_call_test_acquire(char *ptr)
+{
+	/* randomly return NULL */
+	if (get_jiffies_64() % 2)
+		return NULL;
+	prog_test_struct.a = 42;
+	prog_test_struct.b = 108;
+	return &prog_test_struct;
+}
+
+noinline void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p)
+{
+	return;
+}
+
 __diag_pop();
 
 ALLOW_ERROR_INJECTION(bpf_modify_return_test, ERRNO);
@@ -240,8 +262,14 @@ BTF_SET_START(test_sk_kfunc_ids)
 BTF_ID(func, bpf_kfunc_call_test1)
 BTF_ID(func, bpf_kfunc_call_test2)
 BTF_ID(func, bpf_kfunc_call_test3)
+BTF_ID(func, bpf_kfunc_call_test_acquire)
+BTF_ID(func, bpf_kfunc_call_test_release)
 BTF_SET_END(test_sk_kfunc_ids)
 
+BTF_ID_LIST(test_sk_acq_rel)
+BTF_ID(func, bpf_kfunc_call_test_acquire)
+BTF_ID(func, bpf_kfunc_call_test_release)
+
 bool bpf_prog_test_check_kfunc_call(u32 kfunc_id, struct module *owner)
 {
 	if (btf_id_set_contains(&test_sk_kfunc_ids, kfunc_id))
@@ -249,6 +277,33 @@ bool bpf_prog_test_check_kfunc_call(u32 kfunc_id, struct module *owner)
 	return bpf_check_mod_kfunc_call(&prog_test_kfunc_list, kfunc_id, owner);
 }
 
+bool bpf_prog_test_is_acquire_kfunc(u32 kfunc_id, struct module *owner)
+{
+	if (!owner) /* bpf_kfunc_call_test_acquire */
+		return kfunc_id == test_sk_acq_rel[0];
+	return bpf_is_mod_acquire_kfunc(&prog_test_kfunc_list, kfunc_id, owner);
+}
+
+bool bpf_prog_test_is_release_kfunc(u32 kfunc_id, struct module *owner)
+{
+	if (!owner) /* bpf_kfunc_call_test_release */
+		return kfunc_id == test_sk_acq_rel[1];
+	return bpf_is_mod_release_kfunc(&prog_test_kfunc_list, kfunc_id, owner);
+}
+
+enum bpf_return_type bpf_prog_test_get_kfunc_return_type(u32 kfunc_id,
+							 struct module *owner)
+{
+	if (!owner) {
+		if (kfunc_id == test_sk_acq_rel[0])
+			return RET_PTR_TO_BTF_ID_OR_NULL;
+		else
+			return __BPF_RET_TYPE_MAX;
+	}
+	return bpf_get_mod_kfunc_return_type(&prog_test_kfunc_list, kfunc_id,
+					     owner);
+}
+
 static void *bpf_test_init(const union bpf_attr *kattr, u32 size,
 			   u32 headroom, u32 tailroom)
 {
diff --git a/net/core/filter.c b/net/core/filter.c
index 8e8d3b49c297..4e320de4472d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -9948,6 +9948,12 @@ const struct bpf_prog_ops sk_filter_prog_ops = {
 	.test_run		= bpf_prog_test_run_skb,
 };
 
+static int xdp_btf_struct_access(struct bpf_verifier_log *log,
+				 const struct btf *btf,
+				 const struct btf_type *t, int off,
+				 int size, enum bpf_access_type atype,
+				 u32 *next_btf_id);
+
 const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
 	.get_func_proto		= tc_cls_act_func_proto,
 	.is_valid_access	= tc_cls_act_is_valid_access,
@@ -9955,17 +9961,67 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
 	.gen_prologue		= tc_cls_act_prologue,
 	.gen_ld_abs		= bpf_gen_ld_abs,
 	.check_kfunc_call	= bpf_prog_test_check_kfunc_call,
+	.is_acquire_kfunc	= bpf_prog_test_is_acquire_kfunc,
+	.is_release_kfunc	= bpf_prog_test_is_release_kfunc,
+	.get_kfunc_return_type  = bpf_prog_test_get_kfunc_return_type,
+	/* resuse the callback, there is nothing xdp specific in it */
+	.btf_struct_access      = xdp_btf_struct_access,
 };
 
 const struct bpf_prog_ops tc_cls_act_prog_ops = {
 	.test_run		= bpf_prog_test_run_skb,
 };
 
+static bool xdp_is_acquire_kfunc(u32 kfunc_id, struct module *owner)
+{
+	return bpf_is_mod_acquire_kfunc(&xdp_kfunc_list, kfunc_id, owner);
+}
+
+static bool xdp_is_release_kfunc(u32 kfunc_id, struct module *owner)
+{
+	return bpf_is_mod_release_kfunc(&xdp_kfunc_list, kfunc_id, owner);
+}
+
+static enum bpf_return_type xdp_get_kfunc_return_type(u32 kfunc_id,
+						      struct module *owner)
+{
+	return bpf_get_mod_kfunc_return_type(&xdp_kfunc_list, kfunc_id, owner);
+}
+
+static int xdp_btf_struct_access(struct bpf_verifier_log *log,
+				 const struct btf *btf,
+				 const struct btf_type *t, int off,
+				 int size, enum bpf_access_type atype,
+				 u32 *next_btf_id)
+{
+	int ret = __BPF_REG_TYPE_MAX;
+	struct module *mod;
+
+	if (atype != BPF_READ)
+		return -EACCES;
+
+	if (btf_is_module(btf)) {
+		mod = btf_try_get_module(btf);
+		if (!mod)
+			return -ENXIO;
+		ret = bpf_btf_mod_struct_access(&xdp_kfunc_list, mod, log, btf, t, off, size,
+						atype, next_btf_id);
+		module_put(mod);
+	}
+	if (ret == __BPF_REG_TYPE_MAX)
+		return btf_struct_access(log, btf, t, off, size, atype, next_btf_id);
+	return ret;
+}
+
 const struct bpf_verifier_ops xdp_verifier_ops = {
 	.get_func_proto		= xdp_func_proto,
 	.is_valid_access	= xdp_is_valid_access,
 	.convert_ctx_access	= xdp_convert_ctx_access,
 	.gen_prologue		= bpf_noop_prologue,
+	.is_acquire_kfunc	= xdp_is_acquire_kfunc,
+	.is_release_kfunc	= xdp_is_release_kfunc,
+	.get_kfunc_return_type  = xdp_get_kfunc_return_type,
+	.btf_struct_access	= xdp_btf_struct_access,
 };
 
 const struct bpf_prog_ops xdp_prog_ops = {
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 202fa5eacd0f..7b4bfe793002 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -299,6 +299,7 @@ struct net *get_net_ns_by_id(const struct net *net, int id)
 
 	return peer;
 }
+EXPORT_SYMBOL_GPL(get_net_ns_by_id);
 
 /*
  * setup_net runs the initializers for the network namespace object.
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 770a63103c7a..69450b5c32f0 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -11,6 +11,9 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
 #include <linux/types.h>
 #include <linux/netfilter.h>
 #include <linux/module.h>
@@ -2451,6 +2454,252 @@ static int kill_all(struct nf_conn *i, void *data)
 	return net_eq(nf_ct_net(i), data);
 }
 
+/* Unstable Kernel Helpers for XDP hook */
+static struct nf_conn *__bpf_nf_ct_lookup(struct net *net,
+					  struct bpf_sock_tuple *bpf_tuple,
+					  u32 tuple_len, u8 protonum,
+					  u64 netns_id, u64 flags)
+{
+	struct nf_conntrack_tuple_hash *hash;
+	struct nf_conntrack_tuple tuple;
+
+	if (flags != IP_CT_DIR_ORIGINAL && flags != IP_CT_DIR_REPLY)
+		return ERR_PTR(-EINVAL);
+
+	memset(&tuple, 0, sizeof(tuple));
+
+	switch (tuple_len) {
+	case sizeof(bpf_tuple->ipv4):
+		tuple.src.l3num = AF_INET;
+		tuple.src.u3.ip = bpf_tuple->ipv4.saddr;
+		tuple.src.u.tcp.port = bpf_tuple->ipv4.sport;
+		tuple.dst.u3.ip = bpf_tuple->ipv4.daddr;
+		tuple.dst.u.tcp.port = bpf_tuple->ipv4.dport;
+		break;
+	case sizeof(bpf_tuple->ipv6):
+		tuple.src.l3num = AF_INET6;
+		memcpy(tuple.src.u3.ip6, bpf_tuple->ipv6.saddr, sizeof(bpf_tuple->ipv6.saddr));
+		tuple.src.u.tcp.port = bpf_tuple->ipv6.sport;
+		memcpy(tuple.dst.u3.ip6, bpf_tuple->ipv6.daddr, sizeof(bpf_tuple->ipv6.daddr));
+		tuple.dst.u.tcp.port = bpf_tuple->ipv6.dport;
+		break;
+	default:
+		return ERR_PTR(-EAFNOSUPPORT);
+	}
+
+	tuple.dst.protonum = protonum;
+	tuple.dst.dir = flags;
+
+	if ((s32)netns_id >= 0) {
+		if ((s32)netns_id > S32_MAX)
+			return ERR_PTR(-EINVAL);
+		net = get_net_ns_by_id(net, netns_id);
+		if (!net)
+			return ERR_PTR(-EINVAL);
+	}
+
+	hash = nf_conntrack_find_get(net, &nf_ct_zone_dflt, &tuple);
+	if ((s32)netns_id >= 0)
+		put_net(net);
+	if (!hash)
+		return ERR_PTR(-ENOENT);
+	return nf_ct_tuplehash_to_ctrack(hash);
+}
+
+static struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx,
+					 struct bpf_sock_tuple *bpf_tuple,
+					 u32 tuple_len, u8 protonum,
+					 u64 netns_id, u64 *flags_err)
+{
+	struct xdp_buff *ctx = (struct xdp_buff *)xdp_ctx;
+	struct net *caller_net;
+	struct nf_conn *nfct;
+
+	if (!flags_err)
+		return NULL;
+	if (!bpf_tuple) {
+		*flags_err = -EINVAL;
+		return NULL;
+	}
+	caller_net = dev_net(ctx->rxq->dev);
+	nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple_len, protonum,
+				  netns_id, *flags_err);
+	if (IS_ERR(nfct)) {
+		*flags_err = PTR_ERR(nfct);
+		return NULL;
+	}
+	return nfct;
+}
+
+static struct nf_conn *bpf_skb_ct_lookup(struct __sk_buff *skb_ctx,
+					 struct bpf_sock_tuple *bpf_tuple,
+					 u32 tuple_len, u8 protonum,
+					 u64 netns_id, u64 *flags_err)
+{
+	struct sk_buff *skb = (struct sk_buff *)skb_ctx;
+	struct net *caller_net;
+	struct nf_conn *nfct;
+
+	if (!flags_err)
+		return NULL;
+	if (!bpf_tuple) {
+		*flags_err = -EINVAL;
+		return NULL;
+	}
+	caller_net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk);
+	nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple_len, protonum,
+				  netns_id, *flags_err);
+	if (IS_ERR(nfct)) {
+		*flags_err = PTR_ERR(nfct);
+		return NULL;
+	}
+	return nfct;
+}
+
+struct nf_conn *bpf_xdp_ct_lookup_tcp(struct xdp_md *xdp_ctx,
+				      struct bpf_sock_tuple *bpf_tuple,
+				      u32 tuple_len, u64 netns_id,
+				      u64 *flags_err)
+{
+	return bpf_xdp_ct_lookup(xdp_ctx, bpf_tuple, tuple_len, IPPROTO_TCP,
+				 netns_id, flags_err);
+}
+
+struct nf_conn *bpf_xdp_ct_lookup_udp(struct xdp_md *xdp_ctx,
+				      struct bpf_sock_tuple *bpf_tuple,
+				      u32 tuple_len, u64 netns_id,
+				      u64 *flags_err)
+{
+	return bpf_xdp_ct_lookup(xdp_ctx, bpf_tuple, tuple_len, IPPROTO_UDP,
+				 netns_id, flags_err);
+}
+
+struct nf_conn *bpf_skb_ct_lookup_tcp(struct __sk_buff *skb_ctx,
+				      struct bpf_sock_tuple *bpf_tuple,
+				      u32 tuple_len, u64 netns_id,
+				      u64 *flags_err)
+{
+	return bpf_skb_ct_lookup(skb_ctx, bpf_tuple, tuple_len, IPPROTO_TCP,
+				 netns_id, flags_err);
+}
+
+struct nf_conn *bpf_skb_ct_lookup_udp(struct __sk_buff *skb_ctx,
+				      struct bpf_sock_tuple *bpf_tuple,
+				      u32 tuple_len, u64 netns_id,
+				      u64 *flags_err)
+{
+	return bpf_skb_ct_lookup(skb_ctx, bpf_tuple, tuple_len, IPPROTO_UDP,
+				 netns_id, flags_err);
+}
+
+void bpf_ct_release(struct nf_conn *nfct)
+{
+	if (!nfct)
+		return;
+	nf_ct_put(nfct);
+}
+
+BTF_SET_START(nf_conntrack_xdp_ids)
+BTF_ID(func, bpf_xdp_ct_lookup_tcp)
+BTF_ID(func, bpf_xdp_ct_lookup_udp)
+BTF_ID(func, bpf_ct_release)
+BTF_SET_END(nf_conntrack_xdp_ids)
+
+BTF_SET_START(nf_conntrack_skb_ids)
+BTF_ID(func, bpf_skb_ct_lookup_tcp)
+BTF_ID(func, bpf_skb_ct_lookup_udp)
+BTF_ID(func, bpf_ct_release)
+BTF_SET_END(nf_conntrack_skb_ids)
+
+BTF_ID_LIST(nf_conntrack_ids)
+BTF_ID(func, bpf_xdp_ct_lookup_tcp)
+BTF_ID(func, bpf_xdp_ct_lookup_udp)
+BTF_ID(func, bpf_skb_ct_lookup_tcp)
+BTF_ID(func, bpf_skb_ct_lookup_udp)
+BTF_ID(func, bpf_ct_release)
+BTF_ID(struct, nf_conn)
+
+bool nf_is_acquire_kfunc(u32 kfunc_id)
+{
+	return kfunc_id == nf_conntrack_ids[0] ||
+	       kfunc_id == nf_conntrack_ids[1] ||
+	       kfunc_id == nf_conntrack_ids[2] ||
+	       kfunc_id == nf_conntrack_ids[3];
+}
+
+bool nf_is_release_kfunc(u32 kfunc_id)
+{
+	return kfunc_id == nf_conntrack_ids[4];
+}
+
+enum bpf_return_type nf_get_kfunc_return_type(u32 kfunc_id)
+{
+	if (kfunc_id == nf_conntrack_ids[0] ||
+	    kfunc_id == nf_conntrack_ids[1] ||
+	    kfunc_id == nf_conntrack_ids[2] ||
+	    kfunc_id == nf_conntrack_ids[3])
+		return RET_PTR_TO_BTF_ID_OR_NULL;
+	return __BPF_RET_TYPE_MAX;
+}
+
+static int nf_btf_struct_access(struct bpf_verifier_log *log,
+				const struct btf *btf,
+				const struct btf_type *t, int off,
+				int size, enum bpf_access_type atype,
+				u32 *next_btf_id)
+{
+	const struct btf_type *nf_conn_type;
+	size_t end;
+
+	nf_conn_type = btf_type_by_id(btf, nf_conntrack_ids[5]);
+	if (!nf_conn_type)
+		return -EACCES;
+	/* This won't work (not even with btf_struct_ids_match for off == 0),
+	 * see below for the reason:
+	 * https://lore.kernel.org/bpf/20211028014428.rsuq6rkfvqzq23tg@apollo.localdomain
+	 */
+	if (t != nf_conn_type) /* skip */
+		return __BPF_REG_TYPE_MAX;
+
+	if (atype != BPF_READ)
+		return -EACCES;
+
+	switch (off) {
+	case offsetof(struct nf_conn, status):
+		end = offsetofend(struct nf_conn, status);
+		break;
+	/* TODO(v2): We should do it per field offset */
+	case bpf_ctx_range(struct nf_conn, proto):
+		end = offsetofend(struct nf_conn, proto);
+		break;
+	default:
+		return -EACCES;
+	}
+
+	if (off + size > end)
+		return -EACCES;
+
+	return NOT_INIT;
+}
+
+static struct kfunc_btf_id_set nf_ct_xdp_kfunc_set = {
+	.owner                 = THIS_MODULE,
+	.set                   = &nf_conntrack_xdp_ids,
+	.is_acquire_kfunc      = nf_is_acquire_kfunc,
+	.is_release_kfunc      = nf_is_release_kfunc,
+	.get_kfunc_return_type = nf_get_kfunc_return_type,
+	.btf_struct_access     = nf_btf_struct_access,
+};
+
+static struct kfunc_btf_id_set nf_ct_skb_kfunc_set = {
+	.owner                 = THIS_MODULE,
+	.set                   = &nf_conntrack_skb_ids,
+	.is_acquire_kfunc      = nf_is_acquire_kfunc,
+	.is_release_kfunc      = nf_is_release_kfunc,
+	.get_kfunc_return_type = nf_get_kfunc_return_type,
+	.btf_struct_access     = nf_btf_struct_access,
+};
+
 void nf_conntrack_cleanup_start(void)
 {
 	conntrack_gc_work.exiting = true;
@@ -2459,6 +2708,9 @@ void nf_conntrack_cleanup_start(void)
 
 void nf_conntrack_cleanup_end(void)
 {
+	unregister_kfunc_btf_id_set(&xdp_kfunc_list, &nf_ct_xdp_kfunc_set);
+	unregister_kfunc_btf_id_set(&prog_test_kfunc_list, &nf_ct_skb_kfunc_set);
+
 	RCU_INIT_POINTER(nf_ct_hook, NULL);
 	cancel_delayed_work_sync(&conntrack_gc_work.dwork);
 	kvfree(nf_conntrack_hash);
@@ -2745,6 +2997,9 @@ int nf_conntrack_init_start(void)
 	conntrack_gc_work_init(&conntrack_gc_work);
 	queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ);
 
+	register_kfunc_btf_id_set(&prog_test_kfunc_list, &nf_ct_skb_kfunc_set);
+	register_kfunc_btf_id_set(&xdp_kfunc_list, &nf_ct_xdp_kfunc_set);
+
 	return 0;
 
 err_proto:
-- 
2.33.1