I tested the patch in production yesterday and everything looks good.

Signed-off-by: Sven Auhagen <sven.auhagen@xxxxxxxxxxxx>

On Tue, May 17, 2022 at 01:04:53PM +0300, Oz Shlomo wrote:
> Signed-off-by: Oz Shlomo <ozsh@xxxxxxxxxx>
>
> On 5/17/2022 12:42 PM, Pablo Neira Ayuso wrote:
> > This patch addresses three possible problems:
> >
> > 1. ct gc may race to undo the timeout adjustment of the packet path, leaving
> > the conntrack entry in place with the internal offload timeout (one day).
> >
> > 2. ct gc removes the ct because the IPS_OFFLOAD_BIT is not set and the CLOSE
> > timeout is reached before the flow offload del.
> >
> > 3. tcp ct is always set to ESTABLISHED with a very long timeout
> > in flow offload teardown/delete even though the state might already be
> > CLOSED. Also, as a remark, we cannot assume that the FIN or RST packet
> > is hitting flow table teardown, as the packet might get bumped to the
> > slow path in nftables.
> >
> > This patch resets IPS_OFFLOAD_BIT from flow_offload_teardown(), so
> > conntrack handles the tcp rst/fin packet, which triggers the CLOSE/FIN
> > state transition.
> >
> > Moreover, return the connection's ownership to conntrack upon teardown
> > by clearing the offload flag and fixing up the timeout value.
> > The flow table GC thread will asynchronously free the flow table and
> > hardware offload entries.
> >
> > Before this patch, the IPS_OFFLOAD_BIT remained set for expired flows,
> > which is also misleading since the flow is back on the classic conntrack
> > path.
> >
> > If nf_ct_delete() removes the entry from the conntrack table, then it
> > calls nf_ct_put(), which decrements the refcnt. This is not a problem
> > because the flowtable holds a reference to the conntrack object from
> > the flow_offload_alloc() path which is released via flow_offload_free().
> >
> > This patch also updates nft_flow_offload to skip packets in SYN_RECV
> > state. Since we might miss or bump packets to the slow path, we do not
> > know what will happen there while we are still in SYN_RECV; this patch
> > postpones offload to the next packet, which also aligns with the
> > existing behaviour in tc-ct.
> >
> > flow_offload_teardown() does not reset the existing tcp state from
> > flow_offload_fixup_tcp() to ESTABLISHED anymore; packets bumped to the
> > slow path might have already updated the state to CLOSE/FIN.
> >
> > Joint work with Oz and Sven.
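
My condensed reading of the resulting teardown path, for anyone following
the thread (this just mirrors the hunks below with comments added; the diff
is the authoritative version):

void flow_offload_teardown(struct flow_offload *flow)
{
	/* Give the connection back to conntrack immediately ... */
	clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
	/* ... mark the flow so the flowtable GC thread frees the software
	 * and hardware offload entries asynchronously ...
	 */
	set_bit(NF_FLOW_TEARDOWN, &flow->flags);
	/* ... and recompute ct->timeout from the tcp state conntrack
	 * currently sees, i.e. tn->timeouts[ct->proto.tcp.state] minus
	 * tn->offload_timeout, instead of forcing ESTABLISHED.
	 */
	flow_offload_fixup_ct(flow->ct);
}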
> >
> > Fixes: 1e5b2471bcc4 ("netfilter: nf_flow_table: teardown flow timeout race")
> > Signed-off-by: Pablo Neira Ayuso <pablo@xxxxxxxxxxxxx>
> > ---
> > v2: fix nf_conntrack_tcp_established() call, reported by Oz
> >
> >  net/netfilter/nf_flow_table_core.c | 33 +++++++-----------------------
> >  net/netfilter/nft_flow_offload.c   |  3 ++-
> >  2 files changed, 9 insertions(+), 27 deletions(-)
> >
> > diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
> > index 20b4a14e5d4e..ebdf5332e838 100644
> > --- a/net/netfilter/nf_flow_table_core.c
> > +++ b/net/netfilter/nf_flow_table_core.c
> > @@ -179,12 +179,11 @@ EXPORT_SYMBOL_GPL(flow_offload_route_init);
> >
> >  static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
> >  {
> > -	tcp->state = TCP_CONNTRACK_ESTABLISHED;
> >  	tcp->seen[0].td_maxwin = 0;
> >  	tcp->seen[1].td_maxwin = 0;
> >  }
> >
> > -static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
> > +static void flow_offload_fixup_ct(struct nf_conn *ct)
> >  {
> >  	struct net *net = nf_ct_net(ct);
> >  	int l4num = nf_ct_protonum(ct);
> > @@ -193,7 +192,9 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
> >  	if (l4num == IPPROTO_TCP) {
> >  		struct nf_tcp_net *tn = nf_tcp_pernet(net);
> >
> > -		timeout = tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
> > +		flow_offload_fixup_tcp(&ct->proto.tcp);
> > +
> > +		timeout = tn->timeouts[ct->proto.tcp.state];
> >  		timeout -= tn->offload_timeout;
> >  	} else if (l4num == IPPROTO_UDP) {
> >  		struct nf_udp_net *tn = nf_udp_pernet(net);
> > @@ -211,18 +212,6 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
> >  	WRITE_ONCE(ct->timeout, nfct_time_stamp + timeout);
> >  }
> >
> > -static void flow_offload_fixup_ct_state(struct nf_conn *ct)
> > -{
> > -	if (nf_ct_protonum(ct) == IPPROTO_TCP)
> > -		flow_offload_fixup_tcp(&ct->proto.tcp);
> > -}
> > -
> > -static void flow_offload_fixup_ct(struct nf_conn *ct)
> > -{
> > -	flow_offload_fixup_ct_state(ct);
> > -	flow_offload_fixup_ct_timeout(ct);
> > -}
> > -
> >  static void flow_offload_route_release(struct flow_offload *flow)
> >  {
> >  	nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
> > @@ -361,22 +350,14 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
> >  	rhashtable_remove_fast(&flow_table->rhashtable,
> >  			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
> >  			       nf_flow_offload_rhash_params);
> > -
> > -	clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
> > -
> > -	if (nf_flow_has_expired(flow))
> > -		flow_offload_fixup_ct(flow->ct);
> > -	else
> > -		flow_offload_fixup_ct_timeout(flow->ct);
> > -
> >  	flow_offload_free(flow);
> >  }
> >
> >  void flow_offload_teardown(struct flow_offload *flow)
> >  {
> > +	clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
> >  	set_bit(NF_FLOW_TEARDOWN, &flow->flags);
> > -
> > -	flow_offload_fixup_ct_state(flow->ct);
> > +	flow_offload_fixup_ct(flow->ct);
> >  }
> >  EXPORT_SYMBOL_GPL(flow_offload_teardown);
> >
> > @@ -466,7 +447,7 @@ static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table,
> >  	if (nf_flow_has_expired(flow) ||
> >  	    nf_ct_is_dying(flow->ct) ||
> >  	    nf_flow_has_stale_dst(flow))
> > -		set_bit(NF_FLOW_TEARDOWN, &flow->flags);
> > +		flow_offload_teardown(flow);
> >
> >  	if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) {
> >  		if (test_bit(NF_FLOW_HW, &flow->flags)) {
> > diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
> > index 187b8cb9a510..6f0b07fe648d 100644
> > --- a/net/netfilter/nft_flow_offload.c
> > +++ b/net/netfilter/nft_flow_offload.c
> > @@ -298,7 +298,8 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
> >  	case IPPROTO_TCP:
> >  		tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt),
> >  					  sizeof(_tcph), &_tcph);
> > -		if (unlikely(!tcph || tcph->fin || tcph->rst))
> > +		if (unlikely(!tcph || tcph->fin || tcph->rst ||
> > +			     !nf_conntrack_tcp_established(ct)))
> >  			goto out;
> >  		break;
> >  	case IPPROTO_UDP:
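
One remark on the last hunk for anyone not familiar with the helper: if I
read include/net/netfilter/nf_conntrack.h correctly, nf_conntrack_tcp_established()
is roughly

static inline bool nf_conntrack_tcp_established(const struct nf_conn *ct)
{
	/* only ESTABLISHED and assured connections qualify for offload */
	return ct->proto.tcp.state == TCP_CONNTRACK_ESTABLISHED &&
	       test_bit(IPS_ASSURED_BIT, &ct->status);
}

so a connection still in SYN_RECV (or not yet assured) now stays on the slow
path and offload is retried on a later packet, matching the tc-ct behaviour
mentioned in the commit message.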