Re: [PATCH] nf_flowtable: teardown fix race condition

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi Sven,

It seems to me like the issue should be resolved from nft side rather than the flowtable side.

On 5/9/2022 12:31 PM, Sven Auhagen wrote:
The nf flowtable teardown forces a tcp state into established state
with the corresponding timeout and is in a race condition with
the conntrack code.
This might happen even though the state is already in a CLOSE or
FIN WAIT state and about to be closed.
In order to process the correct state, a TCP connection needs to be
set to established in the flowtable software and hardware case.
Also this is a bit optimistic as we actually do not check for the
3 way handshake ACK at this point, we do not really have a choice.

This is also fixing a race condition between the ct gc code
and the flowtable teardown where the ct might already be removed
when the flowtable teardown code runs >
Signed-off-by: Sven Auhagen <sven.auhagen@xxxxxxxxxxxx>

diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 87a7388b6c89..898ea2fc833e 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -5,6 +5,7 @@
  #include <linux/netfilter.h>
  #include <linux/rhashtable.h>
  #include <linux/netdevice.h>
+#include <linux/spinlock.h>
  #include <net/ip.h>
  #include <net/ip6_route.h>
  #include <net/netfilter/nf_tables.h>
@@ -171,30 +172,32 @@ int flow_offload_route_init(struct flow_offload *flow,
  }
  EXPORT_SYMBOL_GPL(flow_offload_route_init);
-static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
-{
-	tcp->state = TCP_CONNTRACK_ESTABLISHED;
-	tcp->seen[0].td_maxwin = 0;
-	tcp->seen[1].td_maxwin = 0;
-}
-static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
+static void flow_offload_fixup_ct(struct nf_conn *ct)
  {
  	struct net *net = nf_ct_net(ct);
  	int l4num = nf_ct_protonum(ct);
  	s32 timeout;
+ spin_lock_bh(&ct->lock);
+
  	if (l4num == IPPROTO_TCP) {
-		struct nf_tcp_net *tn = nf_tcp_pernet(net);
+		ct->proto.tcp.seen[0].td_maxwin = 0;
+		ct->proto.tcp.seen[1].td_maxwin = 0;
- timeout = tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
-		timeout -= tn->offload_timeout;
+		if (nf_conntrack_tcp_established(ct)) {
+			struct nf_tcp_net *tn = nf_tcp_pernet(net);
+
+			timeout = tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
+			timeout -= tn->offload_timeout;
+		}
  	} else if (l4num == IPPROTO_UDP) {
  		struct nf_udp_net *tn = nf_udp_pernet(net);
timeout = tn->timeouts[UDP_CT_REPLIED];
  		timeout -= tn->offload_timeout;
  	} else {
+		spin_unlock_bh(&ct->lock);
  		return;
  	}
@@ -203,18 +206,8 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct) if (nf_flow_timeout_delta(ct->timeout) > (__s32)timeout)
  		ct->timeout = nfct_time_stamp + timeout;
-}
-static void flow_offload_fixup_ct_state(struct nf_conn *ct)
-{
-	if (nf_ct_protonum(ct) == IPPROTO_TCP)
-		flow_offload_fixup_tcp(&ct->proto.tcp);
-}
-
-static void flow_offload_fixup_ct(struct nf_conn *ct)
-{
-	flow_offload_fixup_ct_state(ct);
-	flow_offload_fixup_ct_timeout(ct);
+	spin_unlock_bh(&ct->lock);
  }
static void flow_offload_route_release(struct flow_offload *flow)
@@ -354,12 +347,9 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
  			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
  			       nf_flow_offload_rhash_params);
- clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
+	flow_offload_fixup_ct(flow->ct);
- if (nf_flow_has_expired(flow))
-		flow_offload_fixup_ct(flow->ct);
-	else
-		flow_offload_fixup_ct_timeout(flow->ct);
+	clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
flow_offload_free(flow);
  }
@@ -367,8 +357,6 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
  void flow_offload_teardown(struct flow_offload *flow)
  {
  	set_bit(NF_FLOW_TEARDOWN, &flow->flags);
-
-	flow_offload_fixup_ct_state(flow->ct);
  }
  EXPORT_SYMBOL_GPL(flow_offload_teardown);
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 889cf88d3dba..990128cb7a61 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -10,6 +10,7 @@
  #include <linux/if_ether.h>
  #include <linux/if_pppox.h>
  #include <linux/ppp_defs.h>
+#include <linux/spinlock.h>
  #include <net/ip.h>
  #include <net/ipv6.h>
  #include <net/ip6_route.h>
@@ -34,6 +35,13 @@ static int nf_flow_state_check(struct flow_offload *flow, int proto,
  		return -1;
  	}
+ if (unlikely(!test_bit(IPS_ASSURED_BIT, &flow->ct->status))) {
+		spin_lock_bh(&flow->ct->lock);
+		flow->ct->proto.tcp.state = TCP_CONNTRACK_ESTABLISHED;
+		spin_unlock_bh(&flow->ct->lock);
+		set_bit(IPS_ASSURED_BIT, &flow->ct->status);
+	}
+
  	return 0;
  }
diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
index b561e0a44a45..63bf1579e75f 100644
--- a/net/netfilter/nf_flow_table_offload.c
+++ b/net/netfilter/nf_flow_table_offload.c
@@ -5,6 +5,7 @@
  #include <linux/rhashtable.h>
  #include <linux/netdevice.h>
  #include <linux/tc_act/tc_csum.h>
+#include <linux/spinlock.h>
  #include <net/flow_offload.h>
  #include <net/netfilter/nf_flow_table.h>
  #include <net/netfilter/nf_tables.h>
@@ -953,11 +954,22 @@ static void flow_offload_work_stats(struct flow_offload_work *offload)
  static void flow_offload_work_handler(struct work_struct *work)
  {
  	struct flow_offload_work *offload;
+	struct flow_offload_tuple *tuple;
+	struct flow_offload *flow;
offload = container_of(work, struct flow_offload_work, work);
  	switch (offload->cmd) {
  		case FLOW_CLS_REPLACE:
  			flow_offload_work_add(offload);
+			/* Set the TCP connection to established or teardown does not work */
+			flow = offload->flow;
+			tuple = &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple;
+			if (tuple->l4proto == IPPROTO_TCP && !test_bit(IPS_ASSURED_BIT, &flow->ct->status)) {
+				spin_lock_bh(&flow->ct->lock);
+				flow->ct->proto.tcp.state = TCP_CONNTRACK_ESTABLISHED;
+				spin_unlock_bh(&flow->ct->lock);
+				set_bit(IPS_ASSURED_BIT, &flow->ct->status);
+			}

Hmm, this looks like a workaround.
Also note that this code is called only when the flowtable NF_FLOWTABLE_HW_OFFLOAD bit is set.

  			break;
  		case FLOW_CLS_DESTROY:
  			flow_offload_work_del(offload);



[Index of Archives]     [Netfitler Users]     [Berkeley Packet Filter]     [LARTC]     [Bugtraq]     [Yosemite Forum]

  Powered by Linux