This change adds conntrack lookup helpers using the unstable kfunc call
interface for the XDP and TC-BPF hooks. The primary use case is
implementing a synproxy in XDP; see Maxim's patchset at [0].

Export get_net_ns_by_id, as nf_conntrack needs to call it.

Note that acquire, release, and null-returning kfuncs are searched for
in the intersection of the corresponding set and the main set. This
implies that the kfunc_btf_id_list acq_set, rel_set, and null_set may
contain BTF IDs that are not in the main set. This is explicitly allowed
and recommended (to save on defining more and more sets), since the
check_kfunc_call verifier operation filters out invalid BTF IDs fairly
early, so the later acquire, release, and ret_type_null kfunc checks
only consider BTF IDs that are allowed in the main set for that program.
This is why the bpf_nf_ct_acq_ids set has BTF IDs for both the XDP and
TC hook kfuncs.

  [0]: https://lore.kernel.org/bpf/20211019144655.3483197-1-maximmi@xxxxxxxxxx

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@xxxxxxxxx>
---
 include/linux/btf.h               |   2 +
 kernel/bpf/btf.c                  |   1 +
 net/core/filter.c                 |  24 +++
 net/core/net_namespace.c          |   1 +
 net/netfilter/nf_conntrack_core.c | 278 ++++++++++++++++++++++++++++++
 5 files changed, 306 insertions(+)

diff --git a/include/linux/btf.h b/include/linux/btf.h
index 64c3784799c5..289d9db6748b 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -363,6 +363,7 @@ bool bpf_is_mod_kfunc_ret_type_null(struct kfunc_btf_id_list *klist,
 
 extern struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list;
 extern struct kfunc_btf_id_list prog_test_kfunc_list;
+extern struct kfunc_btf_id_list xdp_kfunc_list;
 #else
 static inline void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
 					     struct kfunc_btf_id_set *s)
@@ -396,6 +397,7 @@ bpf_is_mod_kfunc_ret_type_null(struct kfunc_btf_id_list *klist, u32 kfunc_id,
 
 static struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list __maybe_unused;
 static struct kfunc_btf_id_list prog_test_kfunc_list __maybe_unused;
+static struct kfunc_btf_id_list xdp_kfunc_list __maybe_unused;
 #endif
 
 #endif
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 4983b54c1d81..bce1f98177b9 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6558,6 +6558,7 @@ bool bpf_is_mod_kfunc_ret_type_null(struct kfunc_btf_id_list *klist,
 
 DEFINE_KFUNC_BTF_ID_LIST(bpf_tcp_ca_kfunc_list);
 DEFINE_KFUNC_BTF_ID_LIST(prog_test_kfunc_list);
+DEFINE_KFUNC_BTF_ID_LIST(xdp_kfunc_list);
 
 #endif
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 3f656391af7e..e5efacaa6175 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -10008,11 +10008,35 @@ const struct bpf_prog_ops tc_cls_act_prog_ops = {
 	.test_run		= bpf_prog_test_run_skb,
 };
 
+static bool xdp_check_kfunc_call(u32 kfunc_id, struct module *owner)
+{
+	return bpf_check_mod_kfunc_call(&xdp_kfunc_list, kfunc_id, owner);
+}
+
+static bool xdp_is_acquire_kfunc(u32 kfunc_id, struct module *owner)
+{
+	return bpf_is_mod_acquire_kfunc(&xdp_kfunc_list, kfunc_id, owner);
+}
+
+static bool xdp_is_release_kfunc(u32 kfunc_id, struct module *owner)
+{
+	return bpf_is_mod_release_kfunc(&xdp_kfunc_list, kfunc_id, owner);
+}
+
+static bool xdp_is_kfunc_ret_type_null(u32 kfunc_id, struct module *owner)
+{
+	return bpf_is_mod_kfunc_ret_type_null(&xdp_kfunc_list, kfunc_id, owner);
+}
+
 const struct bpf_verifier_ops xdp_verifier_ops = {
 	.get_func_proto		= xdp_func_proto,
 	.is_valid_access	= xdp_is_valid_access,
 	.convert_ctx_access	= xdp_convert_ctx_access,
 	.gen_prologue		= bpf_noop_prologue,
+	.check_kfunc_call	= xdp_check_kfunc_call,
+	.is_acquire_kfunc	= xdp_is_acquire_kfunc,
+	.is_release_kfunc	= xdp_is_release_kfunc,
+	.is_kfunc_ret_type_null	= xdp_is_kfunc_ret_type_null,
 };
 
 const struct bpf_prog_ops xdp_prog_ops = {
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 9b7171c40434..3b471781327f 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -299,6 +299,7 @@ struct net *get_net_ns_by_id(const struct net *net, int id)
 
 	return peer;
 }
+EXPORT_SYMBOL_GPL(get_net_ns_by_id);
 
 /*
  * setup_net runs the initializers for the network namespace object.
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 9fbce31baf75..116e1384e446 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -11,6 +11,9 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
 #include <linux/types.h>
 #include <linux/netfilter.h>
 #include <linux/module.h>
@@ -2457,8 +2460,280 @@ void nf_conntrack_cleanup_start(void)
 	RCU_INIT_POINTER(ip_ct_attach, NULL);
 }
 
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+
+/* Unstable Conntrack Helpers for XDP and TC-BPF hook
+ *
+ * These are called from the XDP and SCHED_CLS BPF programs. Note that it is
+ * allowed to break compatibility for these functions since the interface they
+ * are exposed through to BPF programs is explicitly unstable.
+ */
+
+/* bpf_ct_opts - Options for CT lookup helpers
+ *
+ * Members:
+ * @error    - Out parameter, set for any errors encountered
+ *             Values:
+ *               -EINVAL - Passed NULL for bpf_tuple pointer
+ *               -EINVAL - opts->reserved is not 0
+ *               -EINVAL - netns_id is less than -1
+ *               -EINVAL - len__opts isn't NF_BPF_CT_OPTS_SZ (12)
+ *               -EPROTO - l4proto isn't one of IPPROTO_TCP or IPPROTO_UDP
+ *               -ENONET - No network namespace found for netns_id
+ *               -ENOENT - Conntrack lookup could not find entry for tuple
+ *               -EAFNOSUPPORT - len__tuple isn't one of sizeof(tuple->ipv4)
+ *                               or sizeof(tuple->ipv6)
+ * @l4proto  - Layer 4 protocol
+ *             Values:
+ *               IPPROTO_TCP, IPPROTO_UDP
+ * @reserved - Reserved member, will be reused for more options in future
+ *             Values:
+ *               0
+ * @netns_id - Specify the network namespace for lookup
+ *             Values:
+ *               BPF_F_CURRENT_NETNS (-1)
+ *                 Use namespace associated with ctx (xdp_md, __sk_buff)
+ *               [0, S32_MAX]
+ *                 Network Namespace ID
+ */
+struct bpf_ct_opts {
+	s32 netns_id;
+	s32 error;
+	u8 l4proto;
+	u8 reserved[3];
+};
+
+enum {
+	NF_BPF_CT_OPTS_SZ = 12,
+};
+
+static struct nf_conn *__bpf_nf_ct_lookup(struct net *net,
+					  struct bpf_sock_tuple *bpf_tuple,
+					  u32 tuple_len, u8 protonum,
+					  s32 netns_id)
+{
+	struct nf_conntrack_tuple_hash *hash;
+	struct nf_conntrack_tuple tuple;
+
+	if (unlikely(protonum != IPPROTO_TCP && protonum != IPPROTO_UDP))
+		return ERR_PTR(-EPROTO);
+	if (unlikely(netns_id < BPF_F_CURRENT_NETNS))
+		return ERR_PTR(-EINVAL);
+
+	memset(&tuple, 0, sizeof(tuple));
+	switch (tuple_len) {
+	case sizeof(bpf_tuple->ipv4):
+		tuple.src.l3num = AF_INET;
+		tuple.src.u3.ip = bpf_tuple->ipv4.saddr;
+		tuple.src.u.tcp.port = bpf_tuple->ipv4.sport;
+		tuple.dst.u3.ip = bpf_tuple->ipv4.daddr;
+		tuple.dst.u.tcp.port = bpf_tuple->ipv4.dport;
+		break;
+	case sizeof(bpf_tuple->ipv6):
+		tuple.src.l3num = AF_INET6;
+		memcpy(tuple.src.u3.ip6, bpf_tuple->ipv6.saddr, sizeof(bpf_tuple->ipv6.saddr));
+		tuple.src.u.tcp.port = bpf_tuple->ipv6.sport;
+		memcpy(tuple.dst.u3.ip6, bpf_tuple->ipv6.daddr, sizeof(bpf_tuple->ipv6.daddr));
+		tuple.dst.u.tcp.port = bpf_tuple->ipv6.dport;
+		break;
+	default:
+		return ERR_PTR(-EAFNOSUPPORT);
+	}
+
+	tuple.dst.protonum = protonum;
+
+	if (netns_id >= 0) {
+		net = get_net_ns_by_id(net, netns_id);
+		if (unlikely(!net))
+			return ERR_PTR(-ENONET);
+	}
+
+	hash = nf_conntrack_find_get(net, &nf_ct_zone_dflt, &tuple);
+	if (netns_id >= 0)
+		put_net(net);
+	if (!hash)
+		return ERR_PTR(-ENOENT);
+	return nf_ct_tuplehash_to_ctrack(hash);
+}
+
+__diag_push();
+__diag_ignore(GCC, 8, "-Wmissing-prototypes",
+	      "Global functions as their definitions will be in nf_conntrack BTF");
+
+/* bpf_xdp_ct_lookup - Lookup CT entry for the given tuple, and acquire a
+ *                     reference to it
+ *
+ * Parameters:
+ * @xdp_ctx    - Pointer to ctx (xdp_md) in XDP program
+ *               Cannot be NULL
+ * @bpf_tuple  - Pointer to memory representing the tuple to look up
+ *               Cannot be NULL
+ * @len__tuple - Length of the tuple structure
+ *               Must be one of sizeof(bpf_tuple->ipv4) or
+ *               sizeof(bpf_tuple->ipv6)
+ * @opts       - Additional options for lookup (documented above)
+ *               Cannot be NULL
+ * @len__opts  - Length of the bpf_ct_opts structure
+ *               Must be NF_BPF_CT_OPTS_SZ (12)
+ */
+struct nf_conn *
+bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
+		  u32 len__tuple, struct bpf_ct_opts *opts, u32 len__opts)
+{
+	struct xdp_buff *ctx = (struct xdp_buff *)xdp_ctx;
+	struct net *caller_net;
+	struct nf_conn *nfct;
+
+	BUILD_BUG_ON(sizeof(struct bpf_ct_opts) != NF_BPF_CT_OPTS_SZ);
+
+	if (!opts)
+		return NULL;
+	if (!bpf_tuple || opts->reserved[0] || opts->reserved[1] ||
+	    opts->reserved[2] || len__opts != NF_BPF_CT_OPTS_SZ) {
+		opts->error = -EINVAL;
+		return NULL;
+	}
+	caller_net = dev_net(ctx->rxq->dev);
+	nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, len__tuple, opts->l4proto,
+				  opts->netns_id);
+	if (IS_ERR(nfct)) {
+		opts->error = PTR_ERR(nfct);
+		return NULL;
+	}
+	return nfct;
+}
+
+/* bpf_skb_ct_lookup - Lookup CT entry for the given tuple, and acquire a
+ *                     reference to it
+ *
+ * Parameters:
+ * @skb_ctx    - Pointer to ctx (__sk_buff) in TC program
+ *               Cannot be NULL
+ * @bpf_tuple  - Pointer to memory representing the tuple to look up
+ *               Cannot be NULL
+ * @len__tuple - Length of the tuple structure
+ *               Must be one of sizeof(bpf_tuple->ipv4) or
+ *               sizeof(bpf_tuple->ipv6)
+ * @opts       - Additional options for lookup (documented above)
+ *               Cannot be NULL
+ * @len__opts  - Length of the bpf_ct_opts structure
+ *               Must be NF_BPF_CT_OPTS_SZ (12)
+ */
+struct nf_conn *
+bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
+		  u32 len__tuple, struct bpf_ct_opts *opts, u32 len__opts)
+{
+	struct sk_buff *skb = (struct sk_buff *)skb_ctx;
+	struct net *caller_net;
+	struct nf_conn *nfct;
+
+	BUILD_BUG_ON(sizeof(struct bpf_ct_opts) != NF_BPF_CT_OPTS_SZ);
+
+	if (!opts)
+		return NULL;
+	if (!bpf_tuple || opts->reserved[0] || opts->reserved[1] ||
+	    opts->reserved[2] || len__opts != NF_BPF_CT_OPTS_SZ) {
+		opts->error = -EINVAL;
+		return NULL;
+	}
+	caller_net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk);
+	nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, len__tuple, opts->l4proto,
+				  opts->netns_id);
+	if (IS_ERR(nfct)) {
+		opts->error = PTR_ERR(nfct);
+		return NULL;
+	}
+	return nfct;
+}
+
+/* bpf_ct_release - Release acquired nf_conn object
+ *
+ * This must be invoked for referenced PTR_TO_BTF_ID, and the verifier rejects
+ * the program if any references remain in the program in all of the explored
+ * states.
+ *
+ * Parameters:
+ * @nf_conn    - Pointer to referenced nf_conn object, obtained using
+ *               bpf_xdp_ct_lookup or bpf_skb_ct_lookup.
+ */
+void bpf_ct_release(struct nf_conn *nfct)
+{
+	if (!nfct)
+		return;
+	nf_ct_put(nfct);
+}
+
+__diag_pop()
+
+/* XDP hook allowed kfuncs */
+BTF_SET_START(bpf_nf_ct_xdp_ids)
+BTF_ID(func, bpf_xdp_ct_lookup)
+BTF_ID(func, bpf_ct_release)
+BTF_SET_END(bpf_nf_ct_xdp_ids)
+
+/* TC-BPF hook allowed kfuncs */
+BTF_SET_START(bpf_nf_ct_skb_ids)
+BTF_ID(func, bpf_skb_ct_lookup)
+BTF_ID(func, bpf_ct_release)
+BTF_SET_END(bpf_nf_ct_skb_ids)
+
+/* XDP and TC-BPF hook acquire kfuncs */
+BTF_SET_START(bpf_nf_ct_acq_ids)
+BTF_ID(func, bpf_xdp_ct_lookup)
+BTF_ID(func, bpf_skb_ct_lookup)
+BTF_SET_END(bpf_nf_ct_acq_ids)
+
+/* XDP and TC-BPF hook release kfuncs */
+BTF_SET_START(bpf_nf_ct_rel_ids)
+BTF_ID(func, bpf_ct_release)
+BTF_SET_END(bpf_nf_ct_rel_ids)
+
+/* kfuncs that may return NULL PTR_TO_BTF_ID */
+BTF_SET_START(bpf_nf_ct_null_ids)
+BTF_ID(func, bpf_xdp_ct_lookup)
+BTF_ID(func, bpf_skb_ct_lookup)
+BTF_SET_END(bpf_nf_ct_null_ids)
+
+#else
+
+BTF_SET_START(bpf_nf_ct_xdp_ids)
+BTF_SET_END(bpf_nf_ct_xdp_ids)
+
+BTF_SET_START(bpf_nf_ct_skb_ids)
+BTF_SET_END(bpf_nf_ct_skb_ids)
+
+BTF_SET_START(bpf_nf_ct_acq_ids)
+BTF_SET_END(bpf_nf_ct_acq_ids)
+
+BTF_SET_START(bpf_nf_ct_rel_ids)
+BTF_SET_END(bpf_nf_ct_rel_ids)
+
+BTF_SET_START(bpf_nf_ct_null_ids)
+BTF_SET_END(bpf_nf_ct_null_ids)
+
+#endif /* CONFIG_DEBUG_INFO_BTF_MODULES */
+
+static struct kfunc_btf_id_set nf_ct_xdp_kfunc_set = {
+	.owner	  = THIS_MODULE,
+	.set	  = &bpf_nf_ct_xdp_ids,
+	.acq_set  = &bpf_nf_ct_acq_ids,
+	.rel_set  = &bpf_nf_ct_rel_ids,
+	.null_set = &bpf_nf_ct_null_ids,
+};
+
+static struct kfunc_btf_id_set nf_ct_skb_kfunc_set = {
+	.owner	  = THIS_MODULE,
+	.set	  = &bpf_nf_ct_skb_ids,
+	.acq_set  = &bpf_nf_ct_acq_ids,
+	.rel_set  = &bpf_nf_ct_rel_ids,
+	.null_set = &bpf_nf_ct_null_ids,
+};
+
 void nf_conntrack_cleanup_end(void)
 {
+	unregister_kfunc_btf_id_set(&xdp_kfunc_list, &nf_ct_xdp_kfunc_set);
+	unregister_kfunc_btf_id_set(&prog_test_kfunc_list, &nf_ct_skb_kfunc_set);
+
 	RCU_INIT_POINTER(nf_ct_hook, NULL);
 	cancel_delayed_work_sync(&conntrack_gc_work.dwork);
 	kvfree(nf_conntrack_hash);
@@ -2745,6 +3020,9 @@ int nf_conntrack_init_start(void)
 	conntrack_gc_work_init(&conntrack_gc_work);
 	queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ);
 
+	register_kfunc_btf_id_set(&prog_test_kfunc_list, &nf_ct_skb_kfunc_set);
+	register_kfunc_btf_id_set(&xdp_kfunc_list, &nf_ct_xdp_kfunc_set);
+
 	return 0;
 
 err_proto:
-- 
2.34.1
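P.S. (not part of the patch): for reviewers who want the BPF-program-side view,
here is a minimal, illustrative sketch of how an XDP program could call these
kfuncs. It is untested and only a sketch: it assumes a vmlinux.h generated from
a kernel with this change, a libbpf recent enough to resolve module kfuncs
declared with __ksym, and it redeclares bpf_ct_opts locally to mirror the
struct added above (rename it if your vmlinux.h already carries the type). The
program name ct_lookup_example and the hard-coded tuple values are made up for
illustration.

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

struct nf_conn;

/* Local mirror of the kernel's struct bpf_ct_opts (NF_BPF_CT_OPTS_SZ == 12). */
struct bpf_ct_opts {
	__s32 netns_id;
	__s32 error;
	__u8 l4proto;
	__u8 reserved[3];
};

extern struct nf_conn *
bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
		  __u32 len__tuple, struct bpf_ct_opts *opts,
		  __u32 len__opts) __ksym;
extern void bpf_ct_release(struct nf_conn *nfct) __ksym;

SEC("xdp")
int ct_lookup_example(struct xdp_md *ctx)
{
	struct bpf_ct_opts opts = {
		.netns_id = -1,			/* BPF_F_CURRENT_NETNS */
		.l4proto  = IPPROTO_TCP,
	};
	struct bpf_sock_tuple tup = {};
	struct nf_conn *ct;

	/* A real program would parse these out of the packet headers. */
	tup.ipv4.saddr = bpf_htonl(0x0a000001);	/* 10.0.0.1 */
	tup.ipv4.daddr = bpf_htonl(0x0a000002);	/* 10.0.0.2 */
	tup.ipv4.sport = bpf_htons(12345);
	tup.ipv4.dport = bpf_htons(80);

	ct = bpf_xdp_ct_lookup(ctx, &tup, sizeof(tup.ipv4), &opts, sizeof(opts));
	if (ct)
		bpf_ct_release(ct);	/* every acquired reference must be released */
	/* On failure, opts.error holds one of the codes documented above. */

	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";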