The nf flowtable teardown forces a tcp state into established state
with the corresponding timeout and is in a race condition with
the conntrack code.
This might happen even though the state is already in a CLOSE or
FIN WAIT state and about to be closed.
In order to process the correct state, a TCP connection needs to be
set to established in the flowtable software and hardware case.
Also this is a bit optimistic as we actually do not check for the
3 way handshake ACK at this point, we do not really have a choice.
This is also fixing a race condition between the ct gc code
and the flowtable teardown where the ct might already be removed
when the flowtable teardown code runs >
Signed-off-by: Sven Auhagen <sven.auhagen@xxxxxxxxxxxx>
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 87a7388b6c89..898ea2fc833e 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -5,6 +5,7 @@
#include <linux/netfilter.h>
#include <linux/rhashtable.h>
#include <linux/netdevice.h>
+#include <linux/spinlock.h>
#include <net/ip.h>
#include <net/ip6_route.h>
#include <net/netfilter/nf_tables.h>
@@ -171,30 +172,32 @@ int flow_offload_route_init(struct flow_offload *flow,
}
EXPORT_SYMBOL_GPL(flow_offload_route_init);
-static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
-{
- tcp->state = TCP_CONNTRACK_ESTABLISHED;
- tcp->seen[0].td_maxwin = 0;
- tcp->seen[1].td_maxwin = 0;
-}
-static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
+static void flow_offload_fixup_ct(struct nf_conn *ct)
{
struct net *net = nf_ct_net(ct);
int l4num = nf_ct_protonum(ct);
s32 timeout;
+ spin_lock_bh(&ct->lock);
+
if (l4num == IPPROTO_TCP) {
- struct nf_tcp_net *tn = nf_tcp_pernet(net);
+ ct->proto.tcp.seen[0].td_maxwin = 0;
+ ct->proto.tcp.seen[1].td_maxwin = 0;
- timeout = tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
- timeout -= tn->offload_timeout;
+ if (nf_conntrack_tcp_established(ct)) {
+ struct nf_tcp_net *tn = nf_tcp_pernet(net);
+
+ timeout = tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
+ timeout -= tn->offload_timeout;
+ }
} else if (l4num == IPPROTO_UDP) {
struct nf_udp_net *tn = nf_udp_pernet(net);
timeout = tn->timeouts[UDP_CT_REPLIED];
timeout -= tn->offload_timeout;
} else {
+ spin_unlock_bh(&ct->lock);
return;
}
@@ -203,18 +206,8 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
if (nf_flow_timeout_delta(ct->timeout) > (__s32)timeout)
ct->timeout = nfct_time_stamp + timeout;
-}
-static void flow_offload_fixup_ct_state(struct nf_conn *ct)
-{
- if (nf_ct_protonum(ct) == IPPROTO_TCP)
- flow_offload_fixup_tcp(&ct->proto.tcp);
-}
-
-static void flow_offload_fixup_ct(struct nf_conn *ct)
-{
- flow_offload_fixup_ct_state(ct);
- flow_offload_fixup_ct_timeout(ct);
+ spin_unlock_bh(&ct->lock);
}
static void flow_offload_route_release(struct flow_offload *flow)
@@ -354,12 +347,9 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
&flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
nf_flow_offload_rhash_params);
- clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
+ flow_offload_fixup_ct(flow->ct);
- if (nf_flow_has_expired(flow))
- flow_offload_fixup_ct(flow->ct);
- else
- flow_offload_fixup_ct_timeout(flow->ct);
+ clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
flow_offload_free(flow);
}
@@ -367,8 +357,6 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
void flow_offload_teardown(struct flow_offload *flow)
{
set_bit(NF_FLOW_TEARDOWN, &flow->flags);
-
- flow_offload_fixup_ct_state(flow->ct);
}
EXPORT_SYMBOL_GPL(flow_offload_teardown);
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 889cf88d3dba..990128cb7a61 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -10,6 +10,7 @@
#include <linux/if_ether.h>
#include <linux/if_pppox.h>
#include <linux/ppp_defs.h>
+#include <linux/spinlock.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/ip6_route.h>
@@ -34,6 +35,13 @@ static int nf_flow_state_check(struct flow_offload *flow, int proto,
return -1;
}
+ if (unlikely(!test_bit(IPS_ASSURED_BIT, &flow->ct->status))) {
+ spin_lock_bh(&flow->ct->lock);
+ flow->ct->proto.tcp.state = TCP_CONNTRACK_ESTABLISHED;
+ spin_unlock_bh(&flow->ct->lock);
+ set_bit(IPS_ASSURED_BIT, &flow->ct->status);
+ }
+
return 0;
}
diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
index b561e0a44a45..63bf1579e75f 100644
--- a/net/netfilter/nf_flow_table_offload.c
+++ b/net/netfilter/nf_flow_table_offload.c
@@ -5,6 +5,7 @@
#include <linux/rhashtable.h>
#include <linux/netdevice.h>
#include <linux/tc_act/tc_csum.h>
+#include <linux/spinlock.h>
#include <net/flow_offload.h>
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_tables.h>
@@ -953,11 +954,22 @@ static void flow_offload_work_stats(struct flow_offload_work *offload)
static void flow_offload_work_handler(struct work_struct *work)
{
struct flow_offload_work *offload;
+ struct flow_offload_tuple *tuple;
+ struct flow_offload *flow;
offload = container_of(work, struct flow_offload_work, work);
switch (offload->cmd) {
case FLOW_CLS_REPLACE:
flow_offload_work_add(offload);
+ /* Set the TCP connection to established or teardown does not work */
+ flow = offload->flow;
+ tuple = &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple;
+ if (tuple->l4proto == IPPROTO_TCP && !test_bit(IPS_ASSURED_BIT, &flow->ct->status)) {
+ spin_lock_bh(&flow->ct->lock);
+ flow->ct->proto.tcp.state = TCP_CONNTRACK_ESTABLISHED;
+ spin_unlock_bh(&flow->ct->lock);
+ set_bit(IPS_ASSURED_BIT, &flow->ct->status);
+ }