This patch defines the API to interact with flow tables, this allows to add, delete and lookup for entries in the flow table. This also adds the generic garbage code that removes entries that have expired, ie. no traffic has been seen for a while. Users of the flow table infrastructure can delete entries via flow_offload_dead(), which sets the dying bit, this signals the garbage collector to release an entry from user context. Signed-off-by: Pablo Neira Ayuso <pablo@xxxxxxxxxxxxx> --- include/net/netfilter/nf_flow_table.h | 71 +++++++++ net/netfilter/Kconfig | 7 + net/netfilter/Makefile | 3 + net/netfilter/nf_flow_table.c | 269 ++++++++++++++++++++++++++++++++++ 4 files changed, 350 insertions(+) create mode 100644 net/netfilter/nf_flow_table.c diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 429833dba2d9..1a2598b4a58f 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -1,7 +1,11 @@ #ifndef _NF_FLOW_TABLE_H #define _NF_FLOW_TABLE_H +#include <linux/in.h> +#include <linux/in6.h> +#include <linux/netdevice.h> #include <linux/rhashtable.h> +#include <linux/rcupdate.h> struct nf_flowtable; @@ -21,4 +25,71 @@ struct nf_flowtable { struct delayed_work gc_work; }; +enum flow_offload_tuple_dir { + FLOW_OFFLOAD_DIR_ORIGINAL, + FLOW_OFFLOAD_DIR_REPLY, + __FLOW_OFFLOAD_DIR_MAX = FLOW_OFFLOAD_DIR_REPLY, +}; +#define FLOW_OFFLOAD_DIR_MAX (__FLOW_OFFLOAD_DIR_MAX + 1) + +struct flow_offload_tuple { + union { + struct in_addr src_v4; + struct in6_addr src_v6; + }; + union { + struct in_addr dst_v4; + struct in6_addr dst_v6; + }; + struct { + __be16 src_port; + __be16 dst_port; + }; + + u8 l3proto; + u8 l4proto; + u8 dir; + + int iifidx; + int oifidx; + + union { + __be32 gateway; + struct in6_addr gateway6; + }; +}; + +struct flow_offload_tuple_rhash { + struct rhash_head node; + struct flow_offload_tuple tuple; +}; + +#define FLOW_OFFLOAD_SNAT 0x1 +#define FLOW_OFFLOAD_DNAT 0x2 +#define FLOW_OFFLOAD_DYING 0x4 + +struct flow_offload { + struct flow_offload_tuple_rhash tuplehash[FLOW_OFFLOAD_DIR_MAX]; + u32 flags; + union { + /* Your private driver data here. */ + u32 timeout; + }; +}; + +struct flow_offload *flow_offload_alloc(struct nf_conn *ct, int *ifindex, + union nf_inet_addr *gw); +void flow_offload_free(const struct flow_offload *flow); + +int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow); +void flow_offload_del(struct nf_flowtable *flow_table, struct flow_offload *flow); +struct flow_offload_tuple_rhash *flow_offload_lookup(struct nf_flowtable *flow_table, + struct flow_offload_tuple *tuple); +int nf_flow_table_iterate(struct nf_flowtable *flow_table, + void (*iter)(struct flow_offload *flow, void *data), + void *data); +void nf_flow_offload_work_gc(struct work_struct *work); + +void flow_offload_dead(struct flow_offload *flow); + #endif /* _FLOW_OFFLOAD_H */ diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index e4a13cc8a2e7..af0f58322515 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -649,6 +649,13 @@ endif # NF_TABLES_NETDEV endif # NF_TABLES +config NF_FLOW_TABLE + tristate "Netfilter flow table module" + help + This option adds the flow table core infrastructure. + + To compile it as a module, choose M here. + config NETFILTER_XTABLES tristate "Netfilter Xtables support (required for ip_tables)" default m if NETFILTER_ADVANCED=n diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index d3891c93edd6..1f7d92bd571a 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -106,6 +106,9 @@ obj-$(CONFIG_NFT_FIB_NETDEV) += nft_fib_netdev.o obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_netdev.o +# flow table infrastructure +obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o + # generic X tables obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o diff --git a/net/netfilter/nf_flow_table.c b/net/netfilter/nf_flow_table.c new file mode 100644 index 000000000000..ff27dad268c3 --- /dev/null +++ b/net/netfilter/nf_flow_table.c @@ -0,0 +1,269 @@ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/rhashtable.h> +#include <linux/netdevice.h> +#include <net/netfilter/nf_flow_table.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_tuple.h> + +struct flow_offload_entry { + struct flow_offload flow; + struct nf_conn *ct; + struct rcu_head rcu_head; +}; + +struct flow_offload * +flow_offload_alloc(struct nf_conn *ct, int *iifindex, union nf_inet_addr *gw) +{ + struct flow_offload_entry *entry; + struct flow_offload *flow; + + entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + if (!entry) + return NULL; + + if (unlikely(nf_ct_is_dying(ct) || + !atomic_inc_not_zero(&ct->ct_general.use))) { + kfree(entry); + return NULL; + } + entry->ct = ct; + + flow = &entry->flow; + switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num) { + case NFPROTO_IPV4: + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4 = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in; + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4 = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4 = + ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.in; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4 = + ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.in; + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l3proto = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l3proto = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l4proto = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum; + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.gateway = + gw[FLOW_OFFLOAD_DIR_ORIGINAL].ip; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.gateway = + gw[FLOW_OFFLOAD_DIR_REPLY].ip; + break; + case NFPROTO_IPV6: + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6 = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in6; + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6 = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6 = + ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.in6; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6 = + ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.in6; + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l3proto = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l3proto = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l4proto = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum; + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.gateway6 = + gw[FLOW_OFFLOAD_DIR_ORIGINAL].in6; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.gateway6 = + gw[FLOW_OFFLOAD_DIR_REPLY].in6; + break; + } + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port; + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port = + ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u.tcp.port; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port = + ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.tcp.port; + + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dir = + FLOW_OFFLOAD_DIR_ORIGINAL; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dir = + FLOW_OFFLOAD_DIR_REPLY; + + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.iifidx = + iifindex[FLOW_OFFLOAD_DIR_ORIGINAL]; + flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.oifidx = + iifindex[FLOW_OFFLOAD_DIR_REPLY]; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.iifidx = + iifindex[FLOW_OFFLOAD_DIR_REPLY]; + flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.oifidx = + iifindex[FLOW_OFFLOAD_DIR_ORIGINAL]; + + if (ct->status & IPS_SRC_NAT) + flow->flags |= FLOW_OFFLOAD_SNAT; + else if (ct->status & IPS_DST_NAT) + flow->flags |= FLOW_OFFLOAD_DNAT; + + return flow; +} +EXPORT_SYMBOL_GPL(flow_offload_alloc); + +void flow_offload_free(const struct flow_offload *flow) +{ + struct flow_offload_entry *e; + + e = container_of(flow, struct flow_offload_entry, flow); + kfree(e); +} +EXPORT_SYMBOL_GPL(flow_offload_free); + +void flow_offload_dead(struct flow_offload *flow) +{ + flow->flags |= FLOW_OFFLOAD_DYING; +} +EXPORT_SYMBOL_GPL(flow_offload_dead); + +int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) +{ + flow->timeout = (u32)jiffies; + + rhashtable_insert_fast(&flow_table->rhashtable, + &flow->tuplehash[0].node, + *flow_table->type->params); + rhashtable_insert_fast(&flow_table->rhashtable, + &flow->tuplehash[1].node, + *flow_table->type->params); + return 0; +} +EXPORT_SYMBOL_GPL(flow_offload_add); + +void flow_offload_del(struct nf_flowtable *flow_table, + struct flow_offload *flow) +{ + struct flow_offload_entry *e; + + rhashtable_remove_fast(&flow_table->rhashtable, + &flow->tuplehash[0].node, + *flow_table->type->params); + rhashtable_remove_fast(&flow_table->rhashtable, + &flow->tuplehash[1].node, + *flow_table->type->params); + + e = container_of(flow, struct flow_offload_entry, flow); + kfree_rcu(e, rcu_head); +} +EXPORT_SYMBOL_GPL(flow_offload_del); + +struct flow_offload_tuple_rhash * +flow_offload_lookup(struct nf_flowtable *flow_table, + struct flow_offload_tuple *tuple) +{ + return rhashtable_lookup_fast(&flow_table->rhashtable, tuple, + *flow_table->type->params); +} +EXPORT_SYMBOL_GPL(flow_offload_lookup); + +static void nf_flow_release_ct(const struct flow_offload *flow) +{ + struct flow_offload_entry *e; + + e = container_of(flow, struct flow_offload_entry, flow); + nf_ct_delete(e->ct, 0, 0); + nf_ct_put(e->ct); +} + +int nf_flow_table_iterate(struct nf_flowtable *flow_table, + void (*iter)(struct flow_offload *flow, void *data), + void *data) +{ + struct flow_offload_tuple_rhash *tuplehash; + struct rhashtable_iter hti; + struct flow_offload *flow; + int err; + + rhashtable_walk_init(&flow_table->rhashtable, &hti, GFP_KERNEL); + err = rhashtable_walk_start(&hti); + if (err && err != -EAGAIN) + goto out; + + while ((tuplehash = rhashtable_walk_next(&hti))) { + if (IS_ERR(tuplehash)) { + err = PTR_ERR(tuplehash); + if (err != -EAGAIN) + goto out; + + continue; + } + if (tuplehash->tuple.dir) + continue; + + flow = container_of(tuplehash, struct flow_offload, tuplehash[0]); + + iter(flow, data); + } +out: + rhashtable_walk_stop(&hti); + rhashtable_walk_exit(&hti); + + return err; +} +EXPORT_SYMBOL_GPL(nf_flow_table_iterate); + +static inline bool nf_flow_has_expired(const struct flow_offload *flow) +{ + return (__s32)(flow->timeout - (u32)jiffies) <= 0; +} + +static inline bool nf_flow_is_dying(const struct flow_offload *flow) +{ + return flow->flags & FLOW_OFFLOAD_DYING; +} + +void nf_flow_offload_work_gc(struct work_struct *work) +{ + struct flow_offload_tuple_rhash *tuplehash; + struct nf_flowtable *flow_table; + struct rhashtable_iter hti; + struct flow_offload *flow; + int err, counter = 0; + + flow_table = container_of(work, struct nf_flowtable, gc_work.work); + + rhashtable_walk_init(&flow_table->rhashtable, &hti, GFP_KERNEL); + err = rhashtable_walk_start(&hti); + if (err && err != -EAGAIN) + goto out; + + while ((tuplehash = rhashtable_walk_next(&hti))) { + if (IS_ERR(tuplehash)) { + err = PTR_ERR(tuplehash); + if (err != -EAGAIN) + goto out; + + continue; + } + if (tuplehash->tuple.dir) + continue; + + flow = container_of(tuplehash, struct flow_offload, tuplehash[0]); + + if (nf_flow_has_expired(flow) || + nf_flow_is_dying(flow)) { + flow_offload_del(flow_table, flow); + nf_flow_release_ct(flow); + } + counter++; + } + + rhashtable_walk_stop(&hti); + rhashtable_walk_exit(&hti); +out: + queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ); +} +EXPORT_SYMBOL_GPL(nf_flow_offload_work_gc); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@xxxxxxxxxxxxx>"); -- 2.11.0 -- To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html