Re: [PATCH] nf_flowtable: teardown fix race condition

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi Oz,

On Mon, May 09, 2022 at 04:13:19PM +0300, Oz Shlomo wrote:
> Hi Sven,
> 
> It seems to me like the issue should be resolved from nft side rather than
> the flowtable side.

this is not possible to be resolved on the nftables side only.

> 
> On 5/9/2022 12:31 PM, Sven Auhagen wrote:
> > The nf flowtable teardown forces a tcp state into established state
> > with the corresponding timeout and is in a race condition with
> > the conntrack code.
> > This might happen even though the state is already in a CLOSE or
> > FIN WAIT state and about to be closed.
> > In order to process the correct state, a TCP connection needs to be
> > set to established in the flowtable software and hardware case.
> > Also this is a bit optimistic as we actually do not check for the
> > 3 way handshake ACK at this point, we do not really have a choice.
> > 
> > This is also fixing a race condition between the ct gc code
> > and the flowtable teardown where the ct might already be removed
> > when the flowtable teardown code runs >
> > Signed-off-by: Sven Auhagen <sven.auhagen@xxxxxxxxxxxx>
> > 
> > diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
> > index 87a7388b6c89..898ea2fc833e 100644
> > --- a/net/netfilter/nf_flow_table_core.c
> > +++ b/net/netfilter/nf_flow_table_core.c
> > @@ -5,6 +5,7 @@
> >   #include <linux/netfilter.h>
> >   #include <linux/rhashtable.h>
> >   #include <linux/netdevice.h>
> > +#include <linux/spinlock.h>
> >   #include <net/ip.h>
> >   #include <net/ip6_route.h>
> >   #include <net/netfilter/nf_tables.h>
> > @@ -171,30 +172,32 @@ int flow_offload_route_init(struct flow_offload *flow,
> >   }
> >   EXPORT_SYMBOL_GPL(flow_offload_route_init);
> > -static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
> > -{
> > -	tcp->state = TCP_CONNTRACK_ESTABLISHED;
> > -	tcp->seen[0].td_maxwin = 0;
> > -	tcp->seen[1].td_maxwin = 0;
> > -}
> > -static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
> > +static void flow_offload_fixup_ct(struct nf_conn *ct)
> >   {
> >   	struct net *net = nf_ct_net(ct);
> >   	int l4num = nf_ct_protonum(ct);
> >   	s32 timeout;
> > +	spin_lock_bh(&ct->lock);
> > +
> >   	if (l4num == IPPROTO_TCP) {
> > -		struct nf_tcp_net *tn = nf_tcp_pernet(net);
> > +		ct->proto.tcp.seen[0].td_maxwin = 0;
> > +		ct->proto.tcp.seen[1].td_maxwin = 0;
> > -		timeout = tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
> > -		timeout -= tn->offload_timeout;
> > +		if (nf_conntrack_tcp_established(ct)) {
> > +			struct nf_tcp_net *tn = nf_tcp_pernet(net);
> > +
> > +			timeout = tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
> > +			timeout -= tn->offload_timeout;
> > +		}
> >   	} else if (l4num == IPPROTO_UDP) {
> >   		struct nf_udp_net *tn = nf_udp_pernet(net);
> >   		timeout = tn->timeouts[UDP_CT_REPLIED];
> >   		timeout -= tn->offload_timeout;
> >   	} else {
> > +		spin_unlock_bh(&ct->lock);
> >   		return;
> >   	}
> > @@ -203,18 +206,8 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
> >   	if (nf_flow_timeout_delta(ct->timeout) > (__s32)timeout)
> >   		ct->timeout = nfct_time_stamp + timeout;
> > -}
> > -static void flow_offload_fixup_ct_state(struct nf_conn *ct)
> > -{
> > -	if (nf_ct_protonum(ct) == IPPROTO_TCP)
> > -		flow_offload_fixup_tcp(&ct->proto.tcp);
> > -}
> > -
> > -static void flow_offload_fixup_ct(struct nf_conn *ct)
> > -{
> > -	flow_offload_fixup_ct_state(ct);
> > -	flow_offload_fixup_ct_timeout(ct);
> > +	spin_unlock_bh(&ct->lock);
> >   }
> >   static void flow_offload_route_release(struct flow_offload *flow)
> > @@ -354,12 +347,9 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
> >   			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
> >   			       nf_flow_offload_rhash_params);
> > -	clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
> > +	flow_offload_fixup_ct(flow->ct);
> > -	if (nf_flow_has_expired(flow))
> > -		flow_offload_fixup_ct(flow->ct);
> > -	else
> > -		flow_offload_fixup_ct_timeout(flow->ct);
> > +	clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
> >   	flow_offload_free(flow);
> >   }
> > @@ -367,8 +357,6 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
> >   void flow_offload_teardown(struct flow_offload *flow)
> >   {
> >   	set_bit(NF_FLOW_TEARDOWN, &flow->flags);
> > -
> > -	flow_offload_fixup_ct_state(flow->ct);
> >   }
> >   EXPORT_SYMBOL_GPL(flow_offload_teardown);
> > diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
> > index 889cf88d3dba..990128cb7a61 100644
> > --- a/net/netfilter/nf_flow_table_ip.c
> > +++ b/net/netfilter/nf_flow_table_ip.c
> > @@ -10,6 +10,7 @@
> >   #include <linux/if_ether.h>
> >   #include <linux/if_pppox.h>
> >   #include <linux/ppp_defs.h>
> > +#include <linux/spinlock.h>
> >   #include <net/ip.h>
> >   #include <net/ipv6.h>
> >   #include <net/ip6_route.h>
> > @@ -34,6 +35,13 @@ static int nf_flow_state_check(struct flow_offload *flow, int proto,
> >   		return -1;
> >   	}
> > +	if (unlikely(!test_bit(IPS_ASSURED_BIT, &flow->ct->status))) {
> > +		spin_lock_bh(&flow->ct->lock);
> > +		flow->ct->proto.tcp.state = TCP_CONNTRACK_ESTABLISHED;
> > +		spin_unlock_bh(&flow->ct->lock);
> > +		set_bit(IPS_ASSURED_BIT, &flow->ct->status);
> > +	}
> > +
> >   	return 0;
> >   }
> > diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
> > index b561e0a44a45..63bf1579e75f 100644
> > --- a/net/netfilter/nf_flow_table_offload.c
> > +++ b/net/netfilter/nf_flow_table_offload.c
> > @@ -5,6 +5,7 @@
> >   #include <linux/rhashtable.h>
> >   #include <linux/netdevice.h>
> >   #include <linux/tc_act/tc_csum.h>
> > +#include <linux/spinlock.h>
> >   #include <net/flow_offload.h>
> >   #include <net/netfilter/nf_flow_table.h>
> >   #include <net/netfilter/nf_tables.h>
> > @@ -953,11 +954,22 @@ static void flow_offload_work_stats(struct flow_offload_work *offload)
> >   static void flow_offload_work_handler(struct work_struct *work)
> >   {
> >   	struct flow_offload_work *offload;
> > +	struct flow_offload_tuple *tuple;
> > +	struct flow_offload *flow;
> >   	offload = container_of(work, struct flow_offload_work, work);
> >   	switch (offload->cmd) {
> >   		case FLOW_CLS_REPLACE:
> >   			flow_offload_work_add(offload);
> > +			/* Set the TCP connection to established or teardown does not work */
> > +			flow = offload->flow;
> > +			tuple = &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple;
> > +			if (tuple->l4proto == IPPROTO_TCP && !test_bit(IPS_ASSURED_BIT, &flow->ct->status)) {
> > +				spin_lock_bh(&flow->ct->lock);
> > +				flow->ct->proto.tcp.state = TCP_CONNTRACK_ESTABLISHED;
> > +				spin_unlock_bh(&flow->ct->lock);
> > +				set_bit(IPS_ASSURED_BIT, &flow->ct->status);
> > +			}
> 
> Hmm, this looks like a workaround.
> Also note that this code is called only when the flowtable
> NF_FLOWTABLE_HW_OFFLOAD bit is set.
> 

Yes, but as explained in my previous email, we need to set the TCP state to
established otherwise we have no chance of fixing up the TCP state in
flow_offload_del or we just do not know what has happened to the TCP state
between the time it was offloaded and it is know beeing processed by nftables
before the flowtable gc runs.



> >   			break;
> >   		case FLOW_CLS_DESTROY:
> >   			flow_offload_work_del(offload);



[Index of Archives]     [Netfitler Users]     [Berkeley Packet Filter]     [LARTC]     [Bugtraq]     [Yosemite Forum]

  Powered by Linux