The nf flowtable teardown forces a tcp state into established state with the corresponding timeout and is in a race condition with the conntrack code. This might happen even though the state is already in a CLOSE or FIN WAIT state and about to be closed. In order to process the correct state, a TCP connection needs to be set to established in the flowtable software and hardware case. Also this is a bit optimistic as we actually do not check for the 3 way handshake ACK at this point, we do not really have a choice. This is also fixing a race condition between the ct gc code and the flowtable teardown where the ct might already be removed when the flowtable teardown code runs. Signed-off-by: Sven Auhagen <sven.auhagen@xxxxxxxxxxxx> diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 87a7388b6c89..898ea2fc833e 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -5,6 +5,7 @@ #include <linux/netfilter.h> #include <linux/rhashtable.h> #include <linux/netdevice.h> +#include <linux/spinlock.h> #include <net/ip.h> #include <net/ip6_route.h> #include <net/netfilter/nf_tables.h> @@ -171,30 +172,32 @@ int flow_offload_route_init(struct flow_offload *flow, } EXPORT_SYMBOL_GPL(flow_offload_route_init); -static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp) -{ - tcp->state = TCP_CONNTRACK_ESTABLISHED; - tcp->seen[0].td_maxwin = 0; - tcp->seen[1].td_maxwin = 0; -} -static void flow_offload_fixup_ct_timeout(struct nf_conn *ct) +static void flow_offload_fixup_ct(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); int l4num = nf_ct_protonum(ct); s32 timeout; + spin_lock_bh(&ct->lock); + if (l4num == IPPROTO_TCP) { - struct nf_tcp_net *tn = nf_tcp_pernet(net); + ct->proto.tcp.seen[0].td_maxwin = 0; + ct->proto.tcp.seen[1].td_maxwin = 0; - timeout = tn->timeouts[TCP_CONNTRACK_ESTABLISHED]; - timeout -= tn->offload_timeout; + if (nf_conntrack_tcp_established(ct)) { + struct nf_tcp_net *tn = nf_tcp_pernet(net); + + timeout = tn->timeouts[TCP_CONNTRACK_ESTABLISHED]; + timeout -= tn->offload_timeout; + } } else if (l4num == IPPROTO_UDP) { struct nf_udp_net *tn = nf_udp_pernet(net); timeout = tn->timeouts[UDP_CT_REPLIED]; timeout -= tn->offload_timeout; } else { + spin_unlock_bh(&ct->lock); return; } @@ -203,18 +206,8 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct) if (nf_flow_timeout_delta(ct->timeout) > (__s32)timeout) ct->timeout = nfct_time_stamp + timeout; -} -static void flow_offload_fixup_ct_state(struct nf_conn *ct) -{ - if (nf_ct_protonum(ct) == IPPROTO_TCP) - flow_offload_fixup_tcp(&ct->proto.tcp); -} - -static void flow_offload_fixup_ct(struct nf_conn *ct) -{ - flow_offload_fixup_ct_state(ct); - flow_offload_fixup_ct_timeout(ct); + spin_unlock_bh(&ct->lock); } static void flow_offload_route_release(struct flow_offload *flow) @@ -354,12 +347,9 @@ static void flow_offload_del(struct nf_flowtable *flow_table, &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node, nf_flow_offload_rhash_params); - clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status); + flow_offload_fixup_ct(flow->ct); - if (nf_flow_has_expired(flow)) - flow_offload_fixup_ct(flow->ct); - else - flow_offload_fixup_ct_timeout(flow->ct); + clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status); flow_offload_free(flow); } @@ -367,8 +357,6 @@ static void flow_offload_del(struct nf_flowtable *flow_table, void flow_offload_teardown(struct flow_offload *flow) { set_bit(NF_FLOW_TEARDOWN, &flow->flags); - - flow_offload_fixup_ct_state(flow->ct); } EXPORT_SYMBOL_GPL(flow_offload_teardown); diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 889cf88d3dba..990128cb7a61 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -10,6 +10,7 @@ #include <linux/if_ether.h> #include <linux/if_pppox.h> #include <linux/ppp_defs.h> +#include <linux/spinlock.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ip6_route.h> @@ -34,6 +35,13 @@ static int nf_flow_state_check(struct flow_offload *flow, int proto, return -1; } + if (unlikely(!test_bit(IPS_ASSURED_BIT, &flow->ct->status))) { + spin_lock_bh(&flow->ct->lock); + flow->ct->proto.tcp.state = TCP_CONNTRACK_ESTABLISHED; + spin_unlock_bh(&flow->ct->lock); + set_bit(IPS_ASSURED_BIT, &flow->ct->status); + } + return 0; } diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index b561e0a44a45..63bf1579e75f 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -5,6 +5,7 @@ #include <linux/rhashtable.h> #include <linux/netdevice.h> #include <linux/tc_act/tc_csum.h> +#include <linux/spinlock.h> #include <net/flow_offload.h> #include <net/netfilter/nf_flow_table.h> #include <net/netfilter/nf_tables.h> @@ -953,11 +954,22 @@ static void flow_offload_work_stats(struct flow_offload_work *offload) static void flow_offload_work_handler(struct work_struct *work) { struct flow_offload_work *offload; + struct flow_offload_tuple *tuple; + struct flow_offload *flow; offload = container_of(work, struct flow_offload_work, work); switch (offload->cmd) { case FLOW_CLS_REPLACE: flow_offload_work_add(offload); + /* Set the TCP connection to established or teardown does not work */ + flow = offload->flow; + tuple = &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple; + if (tuple->l4proto == IPPROTO_TCP && !test_bit(IPS_ASSURED_BIT, &flow->ct->status)) { + spin_lock_bh(&flow->ct->lock); + flow->ct->proto.tcp.state = TCP_CONNTRACK_ESTABLISHED; + spin_unlock_bh(&flow->ct->lock); + set_bit(IPS_ASSURED_BIT, &flow->ct->status); + } break; case FLOW_CLS_DESTROY: flow_offload_work_del(offload); -- 2.33.1