This patch adds the generic software flow offload infrastructure. It
allows users to configure a fast path for established flows that no
longer need to follow the classic forwarding path.

A new hook is registered at netfilter ingress for each existing
interface. For each packet that hits the hook, we look up the flow
table; on a hit, the packet is forwarded using the gateway and
interfaces cached in the flow table entry.

A garbage collection job runs periodically from a delayed work and
releases flow table entries for which no packets have been seen within
the flow timeout.

Signed-off-by: Pablo Neira Ayuso <pablo@xxxxxxxxxxxxx>
---
 include/net/flow_offload.h      |  67 +++++++
 net/netfilter/Kconfig           |   7 +
 net/netfilter/Makefile          |   3 +
 net/netfilter/nf_flow_offload.c | 386 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 463 insertions(+)
 create mode 100644 include/net/flow_offload.h
 create mode 100644 net/netfilter/nf_flow_offload.c
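Not part of the patch, for illustration only: a minimal sketch of how a
caller (for instance a conntrack-based frontend) might populate and
insert a flow for plain forwarding without NAT, once both directions of
the connection have been routed. The lan_dev, wan_dev, wan_gw and
ct_orig_* identifiers below are placeholders for data the caller
already holds, not symbols introduced by this patch:

#include <linux/slab.h>
#include <net/flow_offload.h>

static int example_offload_flow(struct net_device *lan_dev,
                                struct net_device *wan_dev,
                                __be32 wan_gw,
                                __be32 ct_orig_src_ip, __be32 ct_orig_dst_ip,
                                __be16 ct_orig_sport, __be16 ct_orig_dport)
{
        struct flow_offload_tuple *orig, *reply;
        struct flow_offload *flow;

        flow = kzalloc(sizeof(*flow), GFP_ATOMIC);
        if (!flow)
                return -ENOMEM;

        /* Original direction: LAN host towards the remote server. */
        orig = &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple;
        orig->src_v4.s_addr = ct_orig_src_ip;
        orig->dst_v4.s_addr = ct_orig_dst_ip;
        orig->src_port = ct_orig_sport;
        orig->dst_port = ct_orig_dport;
        orig->l3proto  = AF_INET;
        orig->l4proto  = IPPROTO_TCP;
        orig->dir      = FLOW_OFFLOAD_DIR_ORIGINAL;
        orig->iifidx   = lan_dev->ifindex;
        orig->oifidx   = wan_dev->ifindex;
        orig->gateway  = wan_gw;        /* next hop; 0 if destination is on-link */

        /* Reply direction: mirror of the original tuple (no NAT here). */
        reply = &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple;
        reply->src_v4.s_addr = ct_orig_dst_ip;
        reply->dst_v4.s_addr = ct_orig_src_ip;
        reply->src_port = ct_orig_dport;
        reply->dst_port = ct_orig_sport;
        reply->l3proto  = AF_INET;
        reply->l4proto  = IPPROTO_TCP;
        reply->dir      = FLOW_OFFLOAD_DIR_REPLY;
        reply->iifidx   = wan_dev->ifindex;
        reply->oifidx   = lan_dev->ifindex;
        reply->gateway  = 0;            /* e.g. reply destination is on-link */

        return flow_offload_add(flow);
}

Since flow_offload_del() releases entries via kfree_rcu(), entries are
expected to be allocated with kmalloc()/kzalloc() as above. On the
packet path, nf_flow_offload_hook() below builds a flow_offload_tuple
from the IP and transport headers and looks it up with
flow_offload_lookup().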
diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
new file mode 100644
index 000000000000..30bfca7ed3f1
--- /dev/null
+++ b/include/net/flow_offload.h
@@ -0,0 +1,67 @@
+#ifndef _FLOW_OFFLOAD_H
+#define _FLOW_OFFLOAD_H
+
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/netdevice.h>
+#include <linux/rhashtable.h>
+#include <linux/rcupdate.h>
+
+enum flow_offload_tuple_dir {
+        FLOW_OFFLOAD_DIR_ORIGINAL,
+        FLOW_OFFLOAD_DIR_REPLY,
+        __FLOW_OFFLOAD_DIR_MAX = FLOW_OFFLOAD_DIR_REPLY,
+};
+#define FLOW_OFFLOAD_DIR_MAX (__FLOW_OFFLOAD_DIR_MAX + 1)
+
+struct flow_offload_tuple {
+        union {
+                struct in_addr          src_v4;
+                struct in6_addr         src_v6;
+        };
+        union {
+                struct in_addr          dst_v4;
+                struct in6_addr         dst_v6;
+        };
+        struct {
+                __be16                  src_port;
+                __be16                  dst_port;
+        };
+
+        u8                              l3proto;
+        u8                              l4proto;
+        u8                              dir;
+
+        int                             iifidx;
+        int                             oifidx;
+
+        union {
+                __be32                  gateway;
+                struct in6_addr         gateway6;
+        };
+};
+
+struct flow_offload_tuple_rhash {
+        struct rhash_head               node;
+        struct flow_offload_tuple       tuple;
+};
+
+#define FLOW_OFFLOAD_SNAT       0x1
+#define FLOW_OFFLOAD_DNAT       0x2
+#define FLOW_OFFLOAD_HW         0x4
+
+struct flow_offload {
+        struct flow_offload_tuple_rhash tuplehash[FLOW_OFFLOAD_DIR_MAX];
+        u32                             flags;
+        union {
+                /* Your private driver data here. */
+                u32                     timeout;
+        };
+        struct rcu_head                 rcu_head;
+};
+
+int flow_offload_add(struct flow_offload *flow);
+void flow_offload_del(struct flow_offload *flow);
+struct flow_offload_tuple_rhash *flow_offload_lookup(struct flow_offload_tuple *tuple);
+
+#endif /* _FLOW_OFFLOAD_H */
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index e4a13cc8a2e7..f022ca91f49d 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -436,6 +436,13 @@ config NETFILTER_SYNPROXY
 
 endif # NF_CONNTRACK
 
+config NF_FLOW_OFFLOAD
+        tristate "Netfilter Generic Flow Offload (GFO) module"
+        help
+          This option adds the flow table core infrastructure.
+
+          To compile it as a module, choose M here.
+
 config NF_TABLES
         select NETFILTER_NETLINK
         tristate "Netfilter nf_tables support"
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index d3891c93edd6..518f54113e06 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -69,6 +69,9 @@ obj-$(CONFIG_NETFILTER_SYNPROXY) += nf_synproxy_core.o
 # generic packet duplication from netdev family
 obj-$(CONFIG_NF_DUP_NETDEV) += nf_dup_netdev.o
 
+# generic flow table
+obj-$(CONFIG_NF_FLOW_OFFLOAD) += nf_flow_offload.o
+
 # nf_tables
 nf_tables-objs := nf_tables_core.o nf_tables_api.o nf_tables_trace.o \
                   nft_immediate.o nft_cmp.o nft_range.o nft_bitwise.o \
diff --git a/net/netfilter/nf_flow_offload.c b/net/netfilter/nf_flow_offload.c
new file mode 100644
index 000000000000..c967b29d11a6
--- /dev/null
+++ b/net/netfilter/nf_flow_offload.c
@@ -0,0 +1,386 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/rhashtable.h>
+#include <linux/ip.h>
+#include <linux/netdevice.h>
+#include <net/ip.h>
+#include <net/neighbour.h>
+#include <net/flow_offload.h>
+/* For layer 4 checksum field offset. */
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmpv6.h>
+
+static struct rhashtable flow_table;
+
+static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
+{
+        const struct flow_offload_tuple *tuple = data;
+
+        return jhash(tuple, offsetof(struct flow_offload_tuple, l4proto), seed);
+}
+
+static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
+{
+        const struct flow_offload_tuple_rhash *tuplehash = data;
+
+        return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, l4proto), seed);
+}
+
+static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
+                                 const void *ptr)
+{
+        const struct flow_offload_tuple_rhash *x = ptr;
+        const struct flow_offload_tuple *tuple = arg->key;
+
+        if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, l4proto)))
+                return 1;
+
+        return 0;
+}
+
+static const struct rhashtable_params flow_offload_rhash_params = {
+        .head_offset            = offsetof(struct flow_offload_tuple_rhash, node),
+        .hashfn                 = flow_offload_hash,
+        .obj_hashfn             = flow_offload_hash_obj,
+        .obj_cmpfn              = flow_offload_hash_cmp,
+        .automatic_shrinking    = true,
+};
+
+#define NF_FLOW_LIFETIME 15
+
+int flow_offload_add(struct flow_offload *flow)
+{
+        flow->timeout = (u32)jiffies;
+
+        rhashtable_insert_fast(&flow_table, &flow->tuplehash[0].node,
+                               flow_offload_rhash_params);
+        rhashtable_insert_fast(&flow_table, &flow->tuplehash[1].node,
+                               flow_offload_rhash_params);
+        return 0;
+}
+EXPORT_SYMBOL_GPL(flow_offload_add);
+
+void flow_offload_del(struct flow_offload *flow)
+{
+        rhashtable_remove_fast(&flow_table, &flow->tuplehash[0].node,
+                               flow_offload_rhash_params);
+        rhashtable_remove_fast(&flow_table, &flow->tuplehash[1].node,
+                               flow_offload_rhash_params);
+        kfree_rcu(flow, rcu_head);
+}
+EXPORT_SYMBOL_GPL(flow_offload_del);
+
+struct flow_offload_tuple_rhash *
+flow_offload_lookup(struct flow_offload_tuple *tuple)
+{
+        return rhashtable_lookup_fast(&flow_table, tuple,
+                                      flow_offload_rhash_params);
+}
+EXPORT_SYMBOL_GPL(flow_offload_lookup);
+
+static void nf_flow_offload_work_gc(struct work_struct *work);
+
+static DECLARE_DEFERRABLE_WORK(nf_flow_offload_gc, nf_flow_offload_work_gc);
+
+static inline bool nf_flow_has_expired(const struct flow_offload *flow)
+{
+        return (__s32)(flow->timeout - (u32)jiffies) <= 0;
+}
+
+static void nf_flow_offload_work_gc(struct work_struct *work)
+{
+        struct flow_offload_tuple_rhash *tuplehash;
+        struct rhashtable_iter hti;
+        struct flow_offload *flow;
+        int err, counter = 0;
+
+        rhashtable_walk_init(&flow_table, &hti, GFP_KERNEL);
+        err = rhashtable_walk_start(&hti);
+        if (err && err != -EAGAIN)
+                goto out;
+
+        while ((tuplehash = rhashtable_walk_next(&hti))) {
+                if (IS_ERR(tuplehash)) {
+                        err = PTR_ERR(tuplehash);
+                        if (err != -EAGAIN)
+                                goto out;
+
+                        continue;
+                }
+                if (tuplehash->tuple.dir)
+                        continue;
+
+                flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
+
+                if (nf_flow_has_expired(flow))
+                        flow_offload_del(flow);
+
+                counter++;
+        }
+
+        rhashtable_walk_stop(&hti);
+        rhashtable_walk_exit(&hti);
+
+out:
+        queue_delayed_work(system_power_efficient_wq, &nf_flow_offload_gc,
+                           msecs_to_jiffies(1000));
+}
+
+static int nf_flow_snat_tcp(struct iphdr *iph,
+                            const struct flow_offload *flow,
+                            struct sk_buff *skb,
+                            unsigned int thoff,
+                            __be32 addr, __be32 new_addr)
+{
+        struct tcphdr *tcph;
+
+        if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
+            skb_try_make_writable(skb, thoff + sizeof(*tcph)))
+                return -1;
+
+        tcph = (void *)(skb_network_header(skb) + thoff);
+        inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, true);
+
+        return 0;
+}
+
+static int nf_flow_snat_udp(struct iphdr *iph,
+                            const struct flow_offload *flow,
+                            struct sk_buff *skb,
+                            unsigned int thoff,
+                            __be32 addr, __be32 new_addr)
+{
+        struct udphdr *udph;
+
+        if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
+            skb_try_make_writable(skb, thoff + sizeof(*udph)))
+                return -1;
+
+        udph = (void *)(skb_network_header(skb) + thoff);
+        if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
+                inet_proto_csum_replace4(&udph->check, skb, addr,
+                                         new_addr, true);
+                if (!udph->check)
+                        udph->check = CSUM_MANGLED_0;
+        }
+
+        return 0;
+}
+
+static int nf_flow_snat(struct iphdr *iph,
+                        const struct flow_offload *flow,
+                        enum flow_offload_tuple_dir dir, struct sk_buff *skb)
+{
+        __be32 new_addr, addr;
+        unsigned int thoff;
+
+        /* Callers check for < 0, so return -1 on error, not a hook verdict. */
+        if (skb_try_make_writable(skb, sizeof(*iph)))
+                return -1;
+
+        switch (dir) {
+        case FLOW_OFFLOAD_DIR_ORIGINAL:
+                addr = iph->saddr;
+                new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr;
+                iph->saddr = new_addr;
+                break;
+        case FLOW_OFFLOAD_DIR_REPLY:
+                addr = iph->daddr;
+                new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr;
+                iph->daddr = new_addr;
+                break;
+        default:
+                return -1;
+        }
+        csum_replace4(&iph->check, addr, new_addr);
+
+        ip_decrease_ttl(iph);
+
+        thoff = iph->ihl * 4;
+
+        switch (iph->protocol) {
+        case IPPROTO_TCP:
+                if (nf_flow_snat_tcp(iph, flow, skb, thoff, addr, new_addr) < 0)
+                        return -1;
+                break;
+        case IPPROTO_UDP:
+                if (nf_flow_snat_udp(iph, flow, skb, thoff, addr, new_addr) < 0)
+                        return -1;
+                break;
+        }
+
+        return 0;
+}
+
+/* Similar to rt_nexthop(). */
+static inline __be32 nf_flow_nexthop(__be32 nexthop, __be32 daddr)
+{
+        if (nexthop)
+                return nexthop;
+
+        return daddr;
+}
+
+struct flow_ports {
+        __be16 src, dst;
+};
+
+static int nf_flow_tuple_ip(struct iphdr *iph, struct sk_buff *skb,
+                            struct flow_offload_tuple *tuple)
+{
+        struct flow_ports *ports;
+        unsigned int thoff;
+
+        if (iph->protocol != IPPROTO_TCP &&
+            iph->protocol != IPPROTO_UDP)
+                return -1;
+
+        thoff = iph->ihl * 4;
+        if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
+                return -1;
+
+        ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+
+        tuple->src_v4.s_addr    = iph->saddr;
+        tuple->dst_v4.s_addr    = iph->daddr;
+        tuple->src_port         = ports->src;
+        tuple->dst_port         = ports->dst;
+        tuple->l3proto          = AF_INET;
+        tuple->l4proto          = iph->protocol;
+
+        return 0;
+}
+
+#define NF_FLOW_TIMEOUT (30 * HZ)
+
+static unsigned int
+nf_flow_offload_hook(void *priv, struct sk_buff *skb,
+                     const struct nf_hook_state *state)
+{
+        struct flow_offload_tuple_rhash *tuplehash;
+        struct flow_offload_tuple tuple = {};
+        struct flow_offload *flow;
+        struct net_device *outdev;
+        struct iphdr *iph;
+        __be32 nexthop;
+        int err;
+
+        switch (skb->protocol) {
+        case cpu_to_be16(ETH_P_IP):
+                if (!pskb_may_pull(skb, sizeof(*iph)))
+                        return NF_ACCEPT;
+
+                iph = ip_hdr(skb);
+                if (ip_is_fragment(iph))
+                        return NF_ACCEPT;
+
+                err = nf_flow_tuple_ip(iph, skb, &tuple);
+                if (err < 0)
+                        return NF_ACCEPT;
+                break;
+        default:
+                return NF_ACCEPT;
+        }
+
+        tuplehash = flow_offload_lookup(&tuple);
+        if (tuplehash == NULL)
+                return NF_ACCEPT;
+
+        outdev = dev_get_by_index_rcu(&init_net, tuplehash->tuple.oifidx);
+        if (!outdev)
+                return NF_ACCEPT;
+
+        flow = container_of(tuplehash, struct flow_offload,
+                            tuplehash[tuplehash->tuple.dir]);
+
+        flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
+
+        if (flow->flags & FLOW_OFFLOAD_SNAT &&
+            nf_flow_snat(iph, flow, tuplehash->tuple.dir, skb) < 0)
+                return NF_DROP;
+
+        skb->dev = outdev;
+        nexthop = nf_flow_nexthop(tuplehash->tuple.gateway, iph->daddr);
+
+        neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb);
+
+        return NF_STOLEN;
+}
+
+static LIST_HEAD(nf_flow_hook_list);
+
+struct nf_flow_hook_entry {
+        struct list_head        head;
+        struct nf_hook_ops      ops;
+};
+
+static int __init nf_flow_offload_module_init(void)
+{
+        struct rhashtable_params params = flow_offload_rhash_params;
+        struct nf_hook_ops flow_offload_hook = {
+                .hook           = nf_flow_offload_hook,
+                .pf             = NFPROTO_NETDEV,
+                .hooknum        = NF_NETDEV_INGRESS,
+                .priority       = -100,
+        };
+        struct nf_flow_hook_entry *entry;
+        struct net_device *dev;
+        int err;
+
+        params.key_len = offsetof(struct flow_offload_tuple, dir);
+        err = rhashtable_init(&flow_table, &params);
+        if (err < 0)
+                return err;
+
+        rtnl_lock();
+        for_each_netdev(&init_net, dev) {
+                entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+                if (!entry) {
+                        rtnl_unlock();
+                        return -ENOMEM;
+                }
+                entry->ops = flow_offload_hook;
+                entry->ops.dev = dev;
+                list_add_tail(&entry->head, &nf_flow_hook_list);
+
+                err = nf_register_net_hook(&init_net, &entry->ops);
+                if (err < 0) {
+                        rtnl_unlock();
+                        return err;
+                }
+
+                pr_info("register flow table for device %s\n", dev->name);
+        }
+        rtnl_unlock();
+
+        queue_delayed_work(system_power_efficient_wq, &nf_flow_offload_gc,
+                           msecs_to_jiffies(1000));
+        return err;
+}
+
+static void flow_offload_destroy(void *ptr, void *arg)
+{
+        kfree(ptr);
+}
+
+static void __exit nf_flow_offload_module_exit(void)
+{
+        struct nf_flow_hook_entry *entry, *next;
+
+        cancel_delayed_work_sync(&nf_flow_offload_gc);
+        list_for_each_entry_safe(entry, next, &nf_flow_hook_list, head) {
pr_info("unregister flow table for device %s\n", + entry->ops.dev->name); + nf_unregister_net_hook(&init_net, &entry->ops); + list_del(&entry->head); + kfree(entry); + } + rhashtable_free_and_destroy(&flow_table, flow_offload_destroy, NULL); +} + +module_init(nf_flow_offload_module_init); +module_exit(nf_flow_offload_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@xxxxxxxxxxxxx>"); -- 2.11.0 -- To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html