This adds a JIT helper infrastructure to translate nft expressions to ebpf programs. From the commit phase, we spawn the jit module (a userspace program), and then provide the rules that came in this transaction to that program via a pipe (in nf_tables netlink format). The userspace helper translates the rules if possible, and installs the program(s) via the bpf syscall. For each rule, a small response containing the corresponding file descriptor (can be -1 on failure) and an attribute count (how many expressions were jitted) gets sent back to the kernel via the pipe. If translation fails, the rule will be processed by the nf_tables interpreter (as before this patch). If translation succeeds, nf_tables fetches the bpf program using the file descriptor identifier and allocates a new rule blob containing the new 'ebpf' expression (and possibly trailing un-translated expressions). It then replaces the original rule in the transaction log with the new 'ebpf-rule'. The original rule is retained in a private area inside the ebpf expression to be able to present the original expressions to userspace when 'nft list ruleset' is called. For easier review, this contains the kernel side only. nf_tables_jit_work() will not do anything yet. Unresolved issues: - maps and sets. It might be possible to add a new ebpf map type that just wraps the nft set infrastructure for lookups. This would allow nft userspace to continue to work as-is while not requiring a new ebpf helper. - we should eventually support translating multiple (adjacent) rules into a single program. If we do this, the kernel will need to track the mapping of rules to programs (to re-jit when a rule is changed). This isn't implemented so far, but can be added later. We will also need to dump the 'next' generation of the to-be-translated table. The kernel has this information, so it's only a matter of serializing it back to userspace from the commit phase. 
Signed-off-by: Florian Westphal <fw@xxxxxxxxx> --- include/net/netfilter/nf_tables_core.h | 12 ++ net/netfilter/Kconfig | 7 ++ net/netfilter/Makefile | 8 +- net/netfilter/nf_tables_api.c | 5 + net/netfilter/nf_tables_core.c | 31 ++++- net/netfilter/nf_tables_jit.c | 139 +++++++++++++++++++++++ net/netfilter/nf_tables_jit/Makefile | 18 +++ net/netfilter/nf_tables_jit/main.c | 21 ++++ net/netfilter/nf_tables_jit/nf_tables_jit_kern.c | 33 ++++++ 9 files changed, 270 insertions(+), 4 deletions(-) create mode 100644 net/netfilter/nf_tables_jit/Makefile create mode 100644 net/netfilter/nf_tables_jit/main.c create mode 100644 net/netfilter/nf_tables_jit/nf_tables_jit_kern.c diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 90087a84f127..e9b5cc20ec45 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -71,6 +71,18 @@ struct nft_ebpf { extern const struct nft_expr_ops nft_ebpf_fast_ops; +struct nft_jit_data_from_user { + int ebpf_fd; /* fd to get program from, or < 0 if jitter error */ + u32 expr_count; /* number of translated expressions */ +}; + +#if IS_ENABLED(CONFIG_NF_TABLES_JIT) +int nft_jit_commit(struct net *net); +#else +static inline int nft_jit_commit(struct net *net) { return 0; } +#endif +int nf_tables_jit_work(const struct sk_buff *nlskb, struct nft_ebpf *e); + extern struct static_key_false nft_counters_enabled; extern struct static_key_false nft_trace_enabled; diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 3ec8886850b2..82162fe931bb 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -473,6 +473,13 @@ config NF_TABLES_NETDEV help This option enables support for the "netdev" table. +config NF_TABLES_JIT + bool "Netfilter nf_tables jit infrastructure" + depends on BPF + help + This option enables support for translation of nf_tables + expressions to ebpf. 
+ config NFT_NUMGEN tristate "Netfilter nf_tables number generator module" help diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 49c6e0a535f9..ecb371160cf7 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -76,8 +76,12 @@ obj-$(CONFIG_NF_DUP_NETDEV) += nf_dup_netdev.o nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \ nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \ nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \ - nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o \ - nf_tables_jit.o + nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o + +obj-$(CONFIG_NF_TABLES_JIT) += nf_tables_jit/ +nf_tables-$(CONFIG_NF_TABLES_JIT) += nf_tables_jit.o +nf_tables-$(CONFIG_NF_TABLES_JIT) += nf_tables_jit/nf_tables_jit_kern.o +nf_tables-$(CONFIG_NF_TABLES_JIT) += nf_tables_jit/nf_tables_jit_umh.o obj-$(CONFIG_NF_TABLES) += nf_tables.o obj-$(CONFIG_NFT_COMPAT) += nft_compat.o diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 89e61b2d048b..40c2de230400 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6092,6 +6092,11 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) struct nft_trans_elem *te; struct nft_chain *chain; struct nft_table *table; + int ret; + + ret = nft_jit_commit(net); + if (ret < 0) + return ret; /* 1. Allocate space for next generation rules_gen_X[] */ list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 038a15243508..5557b2709f98 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -93,19 +93,46 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr, return true; } +/* Dirty hack: pass nft_pktinfo in skb->cb[] */ +struct nft_jit_args_inet_cb { + /* cb[0] */ + u16 thoff; /* 0: unset */ + u16 lloff; /* 0: unset */ + + /* cb[1] */ + u16 l4proto; /* thoff = 0? 
unset */ + u16 reserved; + + /* 12 bytes left */ +}; + static void nft_ebpf_fast_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_ebpf *priv = nft_expr_priv(expr); + struct nft_jit_args_inet_cb *jit_args; struct bpf_skb_data_end cb_saved; int ret; + BUILD_BUG_ON(sizeof(struct nft_jit_args_inet_cb) > QDISC_CB_PRIV_LEN); + memcpy(&cb_saved, pkt->skb->cb, sizeof(cb_saved)); + + jit_args = (void *)bpf_skb_cb(pkt->skb); + memset(jit_args, 0, sizeof(*jit_args)); + + if (skb_mac_header_was_set(pkt->skb)) + jit_args->lloff = skb_mac_header_len(pkt->skb); + + if (pkt->tprot_set) { + jit_args->thoff = pkt->xt.thoff; + jit_args->l4proto = pkt->tprot; + } + bpf_compute_data_pointers(pkt->skb); ret = BPF_PROG_RUN(priv->prog, pkt->skb); - memcpy(pkt->skb->cb, &cb_saved, sizeof(cb_saved)); switch (ret) { @@ -119,9 +146,9 @@ static void nft_ebpf_fast_eval(const struct nft_expr *expr, default: pr_debug("Unknown verdict %d\n", ret); regs->verdict.code = NF_DROP; - break; } } + DEFINE_STATIC_KEY_FALSE(nft_counters_enabled); static noinline void nft_update_chain_stats(const struct nft_chain *chain, diff --git a/net/netfilter/nf_tables_jit.c b/net/netfilter/nf_tables_jit.c index 415c2acfa471..a8f4696249bf 100644 --- a/net/netfilter/nf_tables_jit.c +++ b/net/netfilter/nf_tables_jit.c @@ -1,13 +1,152 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/bpf.h> +#include <linux/filter.h> #include <linux/netfilter.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> +#include <linux/file.h> + +static int nft_jit_dump_ruleinfo(struct sk_buff *skb, + const struct nft_ctx *ctx, const struct nft_rule *rule) +{ + const struct nft_expr *expr, *next; + struct nfgenmsg *nfmsg; + struct nlmsghdr *nlh; + struct nlattr *list; + int ret; + u16 type = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWRULE); + + nlh = nlmsg_put(skb, ctx->portid, ctx->seq, type, sizeof(struct nfgenmsg), 0); + if (nlh == NULL) + return 
-EMSGSIZE; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = ctx->family; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = htons(ctx->net->nft.base_seq & 0xffff); + + ret = nla_put_string(skb, NFTA_RULE_TABLE, ctx->table->name); + if (ret < 0) + return ret; + ret = nla_put_string(skb, NFTA_RULE_CHAIN, ctx->chain->name); + if (ret < 0) + return ret; + ret = nla_put_be64(skb, NFTA_RULE_HANDLE, cpu_to_be64(rule->handle), + NFTA_RULE_PAD); + if (ret < 0) + return ret; + + list = nla_nest_start(skb, NFTA_RULE_EXPRESSIONS); + if (list == NULL) + return -EMSGSIZE; + + nft_rule_for_each_expr(expr, next, rule) { + ret = nft_expr_dump(skb, NFTA_LIST_ELEM, expr); + if (ret) + return ret; + } + nla_nest_end(skb, list); + nlmsg_end(skb, nlh); + return 0; +} struct nft_ebpf_expression { struct nft_expr e; struct nft_ebpf priv; }; +static int nft_jit_rule(struct nft_trans *trans, struct sk_buff *skb) +{ + const struct nft_rule *r = nft_trans_rule(trans); + const struct nft_expr *e, *last; + struct nft_ebpf_expression ebpf = { 0 }; + struct nft_rule *rule; + struct nft_expr *new; + unsigned int size = sizeof(ebpf); + int err, expr_count; + + err = nft_jit_dump_ruleinfo(skb, &trans->ctx, nft_trans_rule(trans)); + if (err < 0) + return err; + + err = nf_tables_jit_work(skb, &ebpf.priv); + if (err < 0) + return err; + + if (!ebpf.priv.prog) + return 0; + + ebpf.priv.original = r; + + if (r->udata) { + struct nft_userdata *udata = nft_userdata(r); + + size += udata->len + 1; + } + + rule = kmalloc(sizeof(*rule) + r->dlen + size, GFP_KERNEL); + if (!rule) { + bpf_prog_put(ebpf.priv.prog); + return -ENOMEM; + } + + memcpy(rule, r, sizeof(*r)); + rule->dlen = r->dlen + sizeof(ebpf); + + new = nft_expr_first(rule); + memcpy(new, &ebpf, sizeof(ebpf)); + new->ops = &nft_ebpf_fast_ops; + size = sizeof(ebpf); + + expr_count = 0; + nft_rule_for_each_expr(e, last, r) { + ++expr_count; + if (expr_count <= ebpf.priv.expressions) + continue; /* expression was jitted */ + + new = 
nft_expr_next(new); + memcpy(new, e, e->ops->size); + size += e->ops->size; + } + + rule->dlen = size; + if (r->udata) { + const struct nft_userdata *udata = nft_userdata(r); + + memcpy(nft_userdata(rule), udata, udata->len + 1); + } + + list_replace_rcu(&nft_trans_rule(trans)->list, &rule->list); + nft_trans_rule(trans) = rule; + + return 0; +} + +int nft_jit_commit(struct net *net) +{ + struct nft_trans *trans; + struct sk_buff *skb; + int ret; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + list_for_each_entry(trans, &net->nft.commit_list, list) { + if (trans->msg_type != NFT_MSG_NEWRULE) + continue; + + ret = nft_jit_rule(trans, skb); + if (ret < 0) + break; + skb->head = skb->data; + skb_reset_tail_pointer(skb); + } + + kfree_skb(skb); + return ret; +} + static const struct nla_policy nft_ebpf_policy[NFTA_EBPF_MAX + 1] = { [NFTA_EBPF_FD] = { .type = NLA_S32 }, [NFTA_EBPF_ID] = { .type = NLA_U32 }, diff --git a/net/netfilter/nf_tables_jit/Makefile b/net/netfilter/nf_tables_jit/Makefile new file mode 100644 index 000000000000..aa7509e49589 --- /dev/null +++ b/net/netfilter/nf_tables_jit/Makefile @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: GPL-2.0 +# + +hostprogs-y := nf_tables_jit_umh +nf_tables_jit_umh-objs := main.o +HOSTCFLAGS += -I. 
-Itools/include/ + +quiet_cmd_copy_umh = GEN $@ + cmd_copy_umh = echo ':' > $(obj)/.nf_tables_jit_umh.o.cmd; \ + $(OBJCOPY) -I binary -O $(CONFIG_OUTPUT_FORMAT) \ + -B `$(OBJDUMP) -f $<|grep architecture|cut -d, -f1|cut -d' ' -f2` \ + --rename-section .data=.rodata $< $@ + +$(obj)/nf_tables_jit_umh.o: $(obj)/nf_tables_jit_umh + $(call cmd,copy_umh) + +obj-$(CONFIG_NF_TABLES_JIT) += nf_tables_jit.o +nf_tables_jit-objs += nf_tables_jit_kern.o nf_tables_jit_umh.o diff --git a/net/netfilter/nf_tables_jit/main.c b/net/netfilter/nf_tables_jit/main.c new file mode 100644 index 000000000000..6f6a4423c2e4 --- /dev/null +++ b/net/netfilter/nf_tables_jit/main.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <unistd.h> + +int main(void) +{ + static struct { + int fd, count; + } response; + + response.fd = -1; + for (;;) { + char buf[8192]; + + if (read(0, buf, sizeof(buf)) < 0) + return 1; + if (write(1, &response, sizeof(response)) != sizeof(response)) + return 2; + } + + return 0; +} diff --git a/net/netfilter/nf_tables_jit/nf_tables_jit_kern.c b/net/netfilter/nf_tables_jit/nf_tables_jit_kern.c new file mode 100644 index 000000000000..4778f53b2683 --- /dev/null +++ b/net/netfilter/nf_tables_jit/nf_tables_jit_kern.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/umh.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> + +#define UMH_start _binary_net_netfilter_nf_tables_jit_nf_tables_jit_umh_start +#define UMH_end _binary_net_netfilter_nf_tables_jit_nf_tables_jit_umh_end + +extern char UMH_start; +extern char UMH_end; + +static struct umh_info info; + +static int nft_jit_load_umh(void) +{ + return fork_usermode_blob(&UMH_start, &UMH_end - &UMH_start, &info); +} + +int nf_tables_jit_work(const struct sk_buff *nlskb, struct nft_ebpf *e) +{ + if (!info.pipe_to_umh) { + int ret = nft_jit_load_umh(); + if (ret) + return ret; 
+ + if (WARN_ON(!info.pipe_to_umh)) + return -EINVAL; + } + + return 0; +} -- 2.16.4 -- To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html