This patch defines the API to interact with flow tables, this allows to add, delete and lookup for entries in the flow table. This also adds the generic garbage code that removes entries that have expired, ie. no traffic has been seen for a while. Users of the flow table infrastructure can delete entries via flow_offload_dead(), which sets the dying bit, this signals the garbage collector to release an entry from user context. Signed-off-by: Pablo Neira Ayuso <pablo@xxxxxxxxxxxxx> --- include/net/netfilter/nf_flow_table.h | 94 ++++++++ net/netfilter/Kconfig | 7 + net/netfilter/Makefile | 3 + net/netfilter/nf_flow_table.c | 434 ++++++++++++++++++++++++++++++++++ 4 files changed, 538 insertions(+) create mode 100644 net/netfilter/nf_flow_table.c diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 3a0779589281..161f71ca78a0 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -1,7 +1,12 @@ #ifndef _NF_FLOW_TABLE_H #define _NF_FLOW_TABLE_H +#include <linux/in.h> +#include <linux/in6.h> +#include <linux/netdevice.h> #include <linux/rhashtable.h> +#include <linux/rcupdate.h> +#include <net/dst.h> struct nf_flowtable; @@ -20,4 +25,93 @@ struct nf_flowtable { struct delayed_work gc_work; }; +enum flow_offload_tuple_dir { + FLOW_OFFLOAD_DIR_ORIGINAL, + FLOW_OFFLOAD_DIR_REPLY, + __FLOW_OFFLOAD_DIR_MAX = FLOW_OFFLOAD_DIR_REPLY, +}; +#define FLOW_OFFLOAD_DIR_MAX (__FLOW_OFFLOAD_DIR_MAX + 1) + +struct flow_offload_tuple { + union { + struct in_addr src_v4; + struct in6_addr src_v6; + }; + union { + struct in_addr dst_v4; + struct in6_addr dst_v6; + }; + struct { + __be16 src_port; + __be16 dst_port; + }; + + int iifidx; + + u8 l3proto; + u8 l4proto; + u8 dir; + + int oifidx; + + struct dst_entry *dst_cache; +}; + +struct flow_offload_tuple_rhash { + struct rhash_head node; + struct flow_offload_tuple tuple; +}; + +#define FLOW_OFFLOAD_SNAT 0x1 +#define FLOW_OFFLOAD_DNAT 0x2 +#define FLOW_OFFLOAD_DYING 0x4 + +struct flow_offload { + struct flow_offload_tuple_rhash tuplehash[FLOW_OFFLOAD_DIR_MAX]; + u32 flags; + union { + /* Your private driver data here. */ + u32 timeout; + }; +}; + +#define NF_FLOW_TIMEOUT (30 * HZ) + +struct nf_flow_route { + struct { + struct dst_entry *dst; + int ifindex; + } tuple[FLOW_OFFLOAD_DIR_MAX]; +}; + +struct flow_offload *flow_offload_alloc(struct nf_conn *ct, + struct nf_flow_route *route); +void flow_offload_free(struct flow_offload *flow); + +int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow); +void flow_offload_del(struct nf_flowtable *flow_table, struct flow_offload *flow); +struct flow_offload_tuple_rhash *flow_offload_lookup(struct nf_flowtable *flow_table, + struct flow_offload_tuple *tuple); +int nf_flow_table_iterate(struct nf_flowtable *flow_table, + void (*iter)(struct flow_offload *flow, void *data), + void *data); +void nf_flow_offload_work_gc(struct work_struct *work); +extern const struct rhashtable_params nf_flow_offload_rhash_params; + +void flow_offload_dead(struct flow_offload *flow); + +int nf_flow_snat_port(const struct flow_offload *flow, + struct sk_buff *skb, unsigned int thoff, + u8 protocol, enum flow_offload_tuple_dir dir); +int nf_flow_dnat_port(const struct flow_offload *flow, + struct sk_buff *skb, unsigned int thoff, + u8 protocol, enum flow_offload_tuple_dir dir); + +struct flow_ports { + __be16 source, dest; +}; + +#define MODULE_ALIAS_NF_FLOWTABLE(family) \ + MODULE_ALIAS("nf-flowtable-" __stringify(family)) + #endif /* _FLOW_OFFLOAD_H */ diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index e4a13cc8a2e7..af0f58322515 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -649,6 +649,13 @@ endif # NF_TABLES_NETDEV endif # NF_TABLES +config NF_FLOW_TABLE + tristate "Netfilter flow table module" + help + This option adds the flow table core infrastructure. + + To compile it as a module, choose M here. + config NETFILTER_XTABLES tristate "Netfilter Xtables support (required for ip_tables)" default m if NETFILTER_ADVANCED=n diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index d3891c93edd6..1f7d92bd571a 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -106,6 +106,9 @@ obj-$(CONFIG_NFT_FIB_NETDEV) += nft_fib_netdev.o obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_netdev.o +# flow table infrastructure +obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o + # generic X tables obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o diff --git a/net/netfilter/nf_flow_table.c b/net/netfilter/nf_flow_table.c new file mode 100644 index 000000000000..e1024b17b910 --- /dev/null +++ b/net/netfilter/nf_flow_table.c @@ -0,0 +1,434 @@ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/rhashtable.h> +#include <linux/netdevice.h> +#include <net/netfilter/nf_flow_table.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_tuple.h> + +struct flow_offload_entry { + struct flow_offload flow; + struct nf_conn *ct; + struct rcu_head rcu_head; +}; + +struct flow_offload * +flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route) +{ + struct flow_offload_entry *entry; + struct flow_offload *flow; + + if (unlikely(nf_ct_is_dying(ct) || + !atomic_inc_not_zero(&ct->ct_general.use))) + return NULL; + + entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + if (!entry) + goto err_ct_refcnt; + + flow = &entry->flow; + + if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst)) + goto err_dst_cache_original; + + if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_REPLY].dst)) + goto err_dst_cache_reply; + + entry->ct = ct; + + switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num) { + case NFPROTO_IPV4: + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4 = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in; + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4 = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4 = + ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.in; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4 = + ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.in; + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l3proto = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l3proto = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l4proto = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum; + break; + case NFPROTO_IPV6: + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6 = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in6; + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6 = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6 = + ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.in6; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6 = + ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.in6; + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l3proto = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l3proto = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l4proto = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum; + break; + } + + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache = + route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache = + route->tuple[FLOW_OFFLOAD_DIR_REPLY].dst; + + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port; + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port = + ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u.tcp.port; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port = + ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.tcp.port; + + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dir = + FLOW_OFFLOAD_DIR_ORIGINAL; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dir = + FLOW_OFFLOAD_DIR_REPLY; + + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.iifidx = + route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].ifindex; + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.oifidx = + route->tuple[FLOW_OFFLOAD_DIR_REPLY].ifindex; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.iifidx = + route->tuple[FLOW_OFFLOAD_DIR_REPLY].ifindex; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.oifidx = + route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].ifindex; + + if (ct->status & IPS_SRC_NAT) + flow->flags |= FLOW_OFFLOAD_SNAT; + else if (ct->status & IPS_DST_NAT) + flow->flags |= FLOW_OFFLOAD_DNAT; + + return flow; + +err_dst_cache_reply: + dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst); +err_dst_cache_original: + kfree(entry); +err_ct_refcnt: + nf_ct_put(ct); + + return NULL; +} +EXPORT_SYMBOL_GPL(flow_offload_alloc); + +void flow_offload_free(struct flow_offload *flow) +{ + struct flow_offload_entry *e; + + dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache); + dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache); + e = container_of(flow, struct flow_offload_entry, flow); + kfree(e); +} +EXPORT_SYMBOL_GPL(flow_offload_free); + +void flow_offload_dead(struct flow_offload *flow) +{ + flow->flags |= FLOW_OFFLOAD_DYING; +} +EXPORT_SYMBOL_GPL(flow_offload_dead); + +int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) +{ + flow->timeout = (u32)jiffies; + + rhashtable_insert_fast(&flow_table->rhashtable, + &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node, + *flow_table->type->params); + rhashtable_insert_fast(&flow_table->rhashtable, + &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node, + *flow_table->type->params); + return 0; +} +EXPORT_SYMBOL_GPL(flow_offload_add); + +void flow_offload_del(struct nf_flowtable *flow_table, + struct flow_offload *flow) +{ + struct flow_offload_entry *e; + + rhashtable_remove_fast(&flow_table->rhashtable, + &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node, + *flow_table->type->params); + rhashtable_remove_fast(&flow_table->rhashtable, + &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node, + *flow_table->type->params); + + e = container_of(flow, struct flow_offload_entry, flow); + kfree_rcu(e, rcu_head); +} +EXPORT_SYMBOL_GPL(flow_offload_del); + +struct flow_offload_tuple_rhash * +flow_offload_lookup(struct nf_flowtable *flow_table, + struct flow_offload_tuple *tuple) +{ + return rhashtable_lookup_fast(&flow_table->rhashtable, tuple, + *flow_table->type->params); +} +EXPORT_SYMBOL_GPL(flow_offload_lookup); + +static void nf_flow_release_ct(const struct flow_offload *flow) +{ + struct flow_offload_entry *e; + + e = container_of(flow, struct flow_offload_entry, flow); + nf_ct_delete(e->ct, 0, 0); + nf_ct_put(e->ct); +} + +int nf_flow_table_iterate(struct nf_flowtable *flow_table, + void (*iter)(struct flow_offload *flow, void *data), + void *data) +{ + struct flow_offload_tuple_rhash *tuplehash; + struct rhashtable_iter hti; + struct flow_offload *flow; + int err; + + rhashtable_walk_init(&flow_table->rhashtable, &hti, GFP_KERNEL); + err = rhashtable_walk_start(&hti); + if (err && err != -EAGAIN) + goto out; + + while ((tuplehash = rhashtable_walk_next(&hti))) { + if (IS_ERR(tuplehash)) { + err = PTR_ERR(tuplehash); + if (err != -EAGAIN) + goto out; + + continue; + } + if (tuplehash->tuple.dir) + continue; + + flow = container_of(tuplehash, struct flow_offload, tuplehash[0]); + + iter(flow, data); + } +out: + rhashtable_walk_stop(&hti); + rhashtable_walk_exit(&hti); + + return err; +} +EXPORT_SYMBOL_GPL(nf_flow_table_iterate); + +static inline bool nf_flow_has_expired(const struct flow_offload *flow) +{ + return (__s32)(flow->timeout - (u32)jiffies) <= 0; +} + +static inline bool nf_flow_is_dying(const struct flow_offload *flow) +{ + return flow->flags & FLOW_OFFLOAD_DYING; +} + +void nf_flow_offload_work_gc(struct work_struct *work) +{ + struct flow_offload_tuple_rhash *tuplehash; + struct nf_flowtable *flow_table; + struct rhashtable_iter hti; + struct flow_offload *flow; + int err; + + flow_table = container_of(work, struct nf_flowtable, gc_work.work); + + rhashtable_walk_init(&flow_table->rhashtable, &hti, GFP_KERNEL); + err = rhashtable_walk_start(&hti); + if (err && err != -EAGAIN) + goto out; + + while ((tuplehash = rhashtable_walk_next(&hti))) { + if (IS_ERR(tuplehash)) { + err = PTR_ERR(tuplehash); + if (err != -EAGAIN) + goto out; + + continue; + } + if (tuplehash->tuple.dir) + continue; + + flow = container_of(tuplehash, struct flow_offload, tuplehash[0]); + + if (nf_flow_has_expired(flow) || + nf_flow_is_dying(flow)) { + flow_offload_del(flow_table, flow); + nf_flow_release_ct(flow); + } + } + + rhashtable_walk_stop(&hti); + rhashtable_walk_exit(&hti); +out: + queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ); +} +EXPORT_SYMBOL_GPL(nf_flow_offload_work_gc); + +static u32 flow_offload_hash(const void *data, u32 len, u32 seed) +{ + const struct flow_offload_tuple *tuple = data; + + return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed); +} + +static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed) +{ + const struct flow_offload_tuple_rhash *tuplehash = data; + + return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed); +} + +static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + const struct flow_offload_tuple *tuple = arg->key; + const struct flow_offload_tuple_rhash *x = ptr; + + if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir))) + return 1; + + return 0; +} + +const struct rhashtable_params nf_flow_offload_rhash_params = { + .head_offset = offsetof(struct flow_offload_tuple_rhash, node), + .hashfn = flow_offload_hash, + .obj_hashfn = flow_offload_hash_obj, + .obj_cmpfn = flow_offload_hash_cmp, + .automatic_shrinking = true, +}; +EXPORT_SYMBOL_GPL(nf_flow_offload_rhash_params); + +static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff, + __be16 port, __be16 new_port) +{ + struct tcphdr *tcph; + + if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) || + skb_try_make_writable(skb, thoff + sizeof(*tcph))) + return -1; + + tcph = (void *)(skb_network_header(skb) + thoff); + inet_proto_csum_replace2(&tcph->check, skb, port, new_port, true); + + return 0; +} + +static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff, + __be16 port, __be16 new_port) +{ + struct udphdr *udph; + + if (!pskb_may_pull(skb, thoff + sizeof(*udph)) || + skb_try_make_writable(skb, thoff + sizeof(*udph))) + return -1; + + udph = (void *)(skb_network_header(skb) + thoff); + if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { + inet_proto_csum_replace2(&udph->check, skb, port, + new_port, true); + if (!udph->check) + udph->check = CSUM_MANGLED_0; + } + + return 0; +} + +static int nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff, + u8 protocol, __be16 port, __be16 new_port) +{ + switch (protocol) { + case IPPROTO_TCP: + if (nf_flow_nat_port_tcp(skb, thoff, port, new_port) < 0) + return NF_DROP; + break; + case IPPROTO_UDP: + if (nf_flow_nat_port_udp(skb, thoff, port, new_port) < 0) + return NF_DROP; + break; + } + + return 0; +} + +int nf_flow_snat_port(const struct flow_offload *flow, + struct sk_buff *skb, unsigned int thoff, + u8 protocol, enum flow_offload_tuple_dir dir) +{ + struct flow_ports *hdr; + __be16 port, new_port; + + if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) || + skb_try_make_writable(skb, thoff + sizeof(*hdr))) + return -1; + + hdr = (void *)(skb_network_header(skb) + thoff); + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + port = hdr->source; + new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port; + hdr->source = new_port; + break; + case FLOW_OFFLOAD_DIR_REPLY: + port = hdr->dest; + new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port; + hdr->dest = new_port; + break; + default: + return -1; + } + + return nf_flow_nat_port(skb, thoff, protocol, port, new_port); +} +EXPORT_SYMBOL_GPL(nf_flow_snat_port); + +int nf_flow_dnat_port(const struct flow_offload *flow, + struct sk_buff *skb, unsigned int thoff, + u8 protocol, enum flow_offload_tuple_dir dir) +{ + struct flow_ports *hdr; + __be16 port, new_port; + + if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) || + skb_try_make_writable(skb, thoff + sizeof(*hdr))) + return -1; + + hdr = (void *)(skb_network_header(skb) + thoff); + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + port = hdr->dest; + new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port; + hdr->dest = new_port; + break; + case FLOW_OFFLOAD_DIR_REPLY: + port = hdr->source; + new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port; + hdr->source = new_port; + break; + default: + return -1; + } + + return nf_flow_nat_port(skb, thoff, protocol, port, new_port); +} +EXPORT_SYMBOL_GPL(nf_flow_dnat_port); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@xxxxxxxxxxxxx>"); -- 2.11.0 -- To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html