Add new instruction for the nf_tables VM that allows us to specify what
flows are offloaded. This has an explicit dependency on the conntrack
subsystem.

Signed-off-by: Pablo Neira Ayuso <pablo@xxxxxxxxxxxxx>
---
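Usage sketch (not part of this patch: nft userspace support is not
included here, so the "flow offload" rule syntax below is only an
assumption about how the frontend could expose this expression). The
expression is meant to be used from a forward chain, which is what
->validate() enforces, and it only considers TCP/UDP conntracks that
have moved past the NEW/RELATED states:

  # hypothetical ruleset, assumed "flow offload" keyword
  table inet filter {
          chain forward {
                  type filter hook forward priority 0; policy accept;
                  ip protocol tcp flow offload
          }
  }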

 include/uapi/linux/netfilter/nf_tables.h |   9 +
 net/netfilter/Kconfig                    |   7 +
 net/netfilter/Makefile                   |   1 +
 net/netfilter/nft_flow_offload.c         | 331 +++++++++++++++++++++++++++++++
 4 files changed, 348 insertions(+)
 create mode 100644 net/netfilter/nft_flow_offload.c

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 871afa4871bf..2edde548de68 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -948,6 +948,15 @@ enum nft_ct_attributes {
 };
 #define NFTA_CT_MAX (__NFTA_CT_MAX - 1)
 
+/**
+ * enum nft_offload_attributes - ct offload expression attributes
+ */
+enum nft_offload_attributes {
+	NFTA_CT_OFFLOAD_UNSPEC,
+	__NFTA_CT_OFFLOAD_MAX,
+};
+#define NFTA_CT_OFFLOAD_MAX (__NFTA_CT_OFFLOAD_MAX - 1)
+
 enum nft_limit_type {
 	NFT_LIMIT_PKTS,
 	NFT_LIMIT_PKT_BYTES
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index f022ca91f49d..0a5c33cfaeb8 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -504,6 +504,13 @@ config NFT_CT
 	  This option adds the "ct" expression that you can use to match
 	  connection tracking information such as the flow state.
 
+config NFT_FLOW_OFFLOAD
+	depends on NF_CONNTRACK
+	tristate "Netfilter nf_tables hardware flow offload module"
+	help
+	  This option adds the "flow_offload" expression that you can use to
+	  choose what flows are placed into the hardware.
+
 config NFT_SET_RBTREE
 	tristate "Netfilter nf_tables rbtree set module"
 	help
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 518f54113e06..801ce5c25e5d 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -86,6 +86,7 @@ obj-$(CONFIG_NFT_META)		+= nft_meta.o
 obj-$(CONFIG_NFT_RT)		+= nft_rt.o
 obj-$(CONFIG_NFT_NUMGEN)	+= nft_numgen.o
 obj-$(CONFIG_NFT_CT)		+= nft_ct.o
+obj-$(CONFIG_NFT_FLOW_OFFLOAD)	+= nft_flow_offload.o
 obj-$(CONFIG_NFT_LIMIT)		+= nft_limit.o
 obj-$(CONFIG_NFT_NAT)		+= nft_nat.o
 obj-$(CONFIG_NFT_OBJREF)	+= nft_objref.o
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
new file mode 100644
index 000000000000..d38d185a19a5
--- /dev/null
+++ b/net/netfilter/nft_flow_offload.c
@@ -0,0 +1,331 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/flow_offload.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <linux/netfilter/nf_conntrack_common.h>
+
+union flow_gateway {
+	__be32			ip;
+	struct in6_addr		ip6;
+};
+
+static int flow_offload_iterate_cleanup(struct nf_conn *ct, void *data)
+{
+	struct flow_offload_tuple_rhash *tuplehash;
+	struct flow_offload_tuple tuple = {};
+	struct net_device *indev = data;
+	struct flow_offload *flow;
+
+	if (!test_and_clear_bit(IPS_OFFLOAD_BIT, &ct->status))
+		return 0;
+
+	tuple.src_v4 = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in;
+	tuple.dst_v4 = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in;
+	tuple.src_port = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port;
+	tuple.dst_port = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port;
+	tuple.l3proto =
+		ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
+	tuple.l4proto = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
+
+	tuplehash = flow_offload_lookup(&tuple);
+	BUG_ON(!tuplehash);
+
+	if (indev && tuplehash->tuple.iifidx != indev->ifindex)
+		return 0;
+
+	flow = container_of(tuplehash, struct flow_offload,
+			    tuplehash[tuplehash->tuple.dir]);
+
+	flow_offload_del(flow);
+
+	/* Do not remove this conntrack from table. */
+	return 0;
+}
+
+static void flow_offload_cleanup(struct net *net,
+				 const struct net_device *dev)
+{
+	nf_ct_iterate_cleanup_net(net, flow_offload_iterate_cleanup,
+				  (void *)dev, 0, 0);
+}
+
+static int flow_offload_netdev_event(struct notifier_block *this,
+				     unsigned long event, void *ptr)
+{
+	const struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+	if (event != NETDEV_DOWN)
+		return NOTIFY_DONE;
+
+	flow_offload_cleanup(dev_net(dev), dev);
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block flow_offload_netdev_notifier = {
+	.notifier_call	= flow_offload_netdev_event,
+};
+
+static struct flow_offload *
+flow_offload_alloc(const struct nf_conn *ct, int iifindex, int oifindex,
+		   union flow_gateway *orig_gateway,
+		   union flow_gateway *reply_gateway)
+{
+	struct flow_offload *flow;
+
+	flow = kzalloc(sizeof(*flow), GFP_ATOMIC);
+	if (!flow)
+		return NULL;
+
+	switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num) {
+	case NFPROTO_IPV4:
+		flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4 =
+			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in;
+		flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4 =
+			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in;
+		flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4 =
+			ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.in;
+		flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4 =
+			ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.in;
+		flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l3proto =
+			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
+		flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto =
+			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
+		flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l3proto =
+			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
+		flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l4proto =
+			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
+		flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.gateway =
+			orig_gateway->ip;
+		flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.gateway =
+			reply_gateway->ip;
+		break;
+	case NFPROTO_IPV6:
+		flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6 =
+			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in6;
+		flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6 =
+			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6;
+		flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6 =
+			ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.in6;
+		flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6 =
+			ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.in6;
+		flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l3proto =
+			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
+		flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto =
+			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
+		flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l3proto =
+			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
+		flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l4proto =
+			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
+		flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.gateway6 =
+			orig_gateway->ip6;
+		flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.gateway6 =
+			reply_gateway->ip6;
+		break;
+	}
+
+	flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port =
+		ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port;
+	flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port =
+		ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port;
+	flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port =
+		ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u.tcp.port;
+	flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port =
+		ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.tcp.port;
+
+	flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dir = FLOW_OFFLOAD_DIR_ORIGINAL;
+	flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dir = FLOW_OFFLOAD_DIR_REPLY;
+
+	flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.iifidx = oifindex;
+	flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.oifidx = iifindex;
+	flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.iifidx = iifindex;
+	flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.oifidx = oifindex;
+
+	if (ct->status & IPS_SRC_NAT)
+		flow->flags |= FLOW_OFFLOAD_SNAT;
+	else if (ct->status & IPS_DST_NAT)
+		flow->flags |= FLOW_OFFLOAD_DNAT;
+
+	return flow;
+}
+
+static int nft_flow_route(const struct nft_pktinfo *pkt,
+			  const struct nf_conn *ct,
+			  union flow_gateway *orig_gw,
+			  union flow_gateway *reply_gw)
+{
+	const struct dst_entry *reply_dst = skb_dst(pkt->skb);
+	struct dst_entry *orig_dst;
+	const struct nf_afinfo *ai;
+	struct flowi fl;
+
+	memset(&fl, 0, sizeof(fl));
+	switch (nft_pf(pkt)) {
+	case NFPROTO_IPV4:
+		fl.u.ip4.daddr = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip;
+		break;
+	case NFPROTO_IPV6:
+		fl.u.ip6.daddr = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6;
+		break;
+	}
+
+	ai = nf_get_afinfo(nft_pf(pkt));
+	if (!ai)
+		return -ENOENT;
+	ai->route(nft_net(pkt), &orig_dst, &fl, false);
+	if (!orig_dst)
+		return -ENOENT;
+
+	switch (nft_pf(pkt)) {
+	case NFPROTO_IPV4: {
+		const struct rtable *orig_rt = (const struct rtable *)orig_dst;
+		const struct rtable *reply_rt =
+			(const struct rtable *)reply_dst;
+
+		orig_gw->ip = orig_rt->rt_gateway;
+		reply_gw->ip = reply_rt->rt_gateway;
+		break;
+	}
+	case NFPROTO_IPV6:
+		break;
+	default:
+		break;
+	}
+
+	dst_release(orig_dst);
+
+	return 0;
+}
+
+static void nft_flow_offload_eval(const struct nft_expr *expr,
+				  struct nft_regs *regs,
+				  const struct nft_pktinfo *pkt)
+{
+	union flow_gateway orig_gateway, reply_gateway;
+	struct net_device *outdev = pkt->xt.state->out;
+	struct net_device *indev = pkt->xt.state->in;
+	enum ip_conntrack_info ctinfo;
+	struct flow_offload *flow;
+	struct nf_conn *ct;
+	int ret;
+
+	ct = nf_ct_get(pkt->skb, &ctinfo);
+	if (!ct)
+		goto out;
+
+	switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) {
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+		break;
+	default:
+		goto out;
+	}
+
+	if (test_bit(IPS_HELPER_BIT, &ct->status))
+		goto out;
+
+	if (ctinfo == IP_CT_NEW ||
+	    ctinfo == IP_CT_RELATED)
+		goto out;
+
+	if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status))
+		goto out;
+
+	if (nft_flow_route(pkt, ct, &orig_gateway, &reply_gateway) < 0)
+		goto err1;
+
+	flow = flow_offload_alloc(ct, indev->ifindex, outdev->ifindex,
+				  &orig_gateway, &reply_gateway);
+	if (!flow)
+		goto err1;
+
+	ret = flow_offload_add(flow);
+	if (ret < 0)
+		goto err2;
+
+	return;
+err2:
+	kfree(flow);
+err1:
+	clear_bit(IPS_OFFLOAD_BIT, &ct->status);
+out:
+	regs->verdict.code = NFT_BREAK;
+}
+
+static int nft_flow_offload_validate(const struct nft_ctx *ctx,
+				     const struct nft_expr *expr,
+				     const struct nft_data **data)
+{
+	unsigned int hook_mask = (1 << NF_INET_FORWARD);
+
+	return nft_chain_validate_hooks(ctx->chain, hook_mask);
+}
+
+static int nft_flow_offload_init(const struct nft_ctx *ctx,
+				 const struct nft_expr *expr,
+				 const struct nlattr * const tb[])
+{
+	return nf_ct_netns_get(ctx->net, ctx->afi->family);
+}
+
+static void nft_flow_offload_destroy(const struct nft_ctx *ctx,
+				     const struct nft_expr *expr)
+{
+	nf_ct_netns_put(ctx->net, ctx->afi->family);
+}
+
+static int nft_flow_offload_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	return 0;
+}
+
+struct nft_expr_type nft_flow_offload_type;
+static const struct nft_expr_ops nft_flow_offload_ops = {
+	.type		= &nft_flow_offload_type,
+	.size		= NFT_EXPR_SIZE(0),
+	.eval		= nft_flow_offload_eval,
+	.init		= nft_flow_offload_init,
+	.destroy	= nft_flow_offload_destroy,
+	.validate	= nft_flow_offload_validate,
+	.dump		= nft_flow_offload_dump,
+};
+
+struct nft_expr_type nft_flow_offload_type __read_mostly = {
+	.name		= "flow_offload",
+	.ops		= &nft_flow_offload_ops,
+	.maxattr	= NFTA_CT_OFFLOAD_MAX,
+	.owner		= THIS_MODULE,
+};
+
+static int __init nft_flow_offload_module_init(void)
+{
+	register_netdevice_notifier(&flow_offload_netdev_notifier);
+
+	return nft_register_expr(&nft_flow_offload_type);
+}
+
+static void __exit nft_flow_offload_module_exit(void)
+{
+	struct net *net;
+
+	nft_unregister_expr(&nft_flow_offload_type);
+	unregister_netdevice_notifier(&flow_offload_netdev_notifier);
+	rtnl_lock();
+	for_each_net(net)
+		flow_offload_cleanup(net, NULL);
+	rtnl_unlock();
+}
+
+module_init(nft_flow_offload_module_init);
+module_exit(nft_flow_offload_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@xxxxxxxxxxxxx>");
+MODULE_ALIAS_NFT_EXPR("flow_offload");
-- 
2.11.0