Connections leaving the established state (due to RST / FIN TCP packets)
set the flow table teardown flag. The packet path continues to set a
lower timeout value as per the new TCP state, but the offload flag
remains set. Hence, the conntrack garbage collector may race to undo the
timeout adjustment of the packet path, leaving the conntrack entry in
place with the internal offload timeout (one day).

Avoid the ct gc timeout overwrite by flagging torn-down flowtable
connections.

On the nftables side we only need to allow established TCP connections
to create a flow offload entry. Since we cannot guarantee that
flow_offload_teardown is called by a TCP FIN packet, we also need to
make sure that flow_offload_fixup_ct is called in flow_offload_del and
only fixes up established TCP connections.

Fixes: 1e5b2471bcc4 ("netfilter: nf_flow_table: teardown flow timeout race")
Signed-off-by: Oz Shlomo <ozsh@xxxxxxxxxx>
Signed-off-by: Sven Auhagen <sven.auhagen@xxxxxxxxxxxx>

------------
v1 -> v2 changes
 - Add flow_teardown flag
 - Add nftables handling
 - Fixup timeout according to the current ct state
---
 include/uapi/linux/netfilter/nf_conntrack_common.h |  6 +++-
 net/netfilter/nf_conntrack_core.c                  |  3 +-
 net/netfilter/nf_flow_table_core.c                 | 42 +++++++++-------------
 net/netfilter/nft_flow_offload.c                   |  2 ++
 4 files changed, 25 insertions(+), 28 deletions(-)
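Not part of the patch: a stand-alone userspace sketch of the race described
above, in case it helps review. The names (toy_ct, packet_path_fin, gc_pass)
and the 120s post-FIN timeout are made up for illustration; only the one-day
offload timeout and the teardown-bit check mirror the kernel logic.

/* Toy model of the gc/packet-path race; build with "cc -o toy toy.c". */
#include <stdbool.h>
#include <stdio.h>

#define OFFLOAD_TIMEOUT	(24 * 60 * 60)	/* one day, used while IPS_OFFLOAD is set */
#define FIN_TIMEOUT	120		/* hypothetical short timeout after FIN/RST */

struct toy_ct {
	bool offload;			/* stands in for IPS_OFFLOAD */
	bool offload_teardown;		/* stands in for IPS_OFFLOAD_TEARDOWN */
	unsigned int timeout;
};

/* Packet path: connection leaves the established state, the timeout is
 * lowered and (with this patch) the teardown flag is set. */
static void packet_path_fin(struct toy_ct *ct)
{
	ct->offload_teardown = true;
	ct->timeout = FIN_TIMEOUT;
}

/* Conntrack gc: refreshes offloaded entries, but with this patch it
 * skips entries that are being torn down. */
static void gc_pass(struct toy_ct *ct)
{
	if (ct->offload && !ct->offload_teardown)
		ct->timeout = OFFLOAD_TIMEOUT;
}

int main(void)
{
	struct toy_ct ct = { .offload = true, .timeout = OFFLOAD_TIMEOUT };

	packet_path_fin(&ct);	/* FIN seen, timeout lowered */
	gc_pass(&ct);		/* without the teardown flag this would bump it back to one day */

	printf("timeout after gc: %u seconds\n", ct.timeout);
	return 0;
}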
diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h
index 26071021e986..bb06202a4965 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_common.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_common.h
@@ -118,6 +118,10 @@ enum ip_conntrack_status {
 	IPS_HW_OFFLOAD_BIT = 15,
 	IPS_HW_OFFLOAD = (1 << IPS_HW_OFFLOAD_BIT),
 
+	/* offloaded conntrack entry is marked for deletion. */
+	IPS_OFFLOAD_TEARDOWN_BIT = 16,
+	IPS_OFFLOAD_TEARDOWN = (1 << IPS_OFFLOAD_TEARDOWN_BIT),
+
 	/* Be careful here, modifying these bits can make things messy,
 	 * so don't let users modify them directly.
 	 */
@@ -126,7 +130,7 @@ enum ip_conntrack_status {
 				 IPS_SEQ_ADJUST | IPS_TEMPLATE | IPS_UNTRACKED |
 				 IPS_OFFLOAD | IPS_HW_OFFLOAD),
 
-	__IPS_MAX_BIT = 16,
+	__IPS_MAX_BIT = 17,
 };
 
 /* Connection tracking event types */
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 0164e5f522e8..324fdb62c08b 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1477,7 +1477,8 @@ static void gc_worker(struct work_struct *work)
 		tmp = nf_ct_tuplehash_to_ctrack(h);
 
 		if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
-			nf_ct_offload_timeout(tmp);
+			if (!test_bit(IPS_OFFLOAD_TEARDOWN_BIT, &tmp->status))
+				nf_ct_offload_timeout(tmp);
 			continue;
 		}
 
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 3db256da919b..aaed1a244013 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -177,14 +177,8 @@ int flow_offload_route_init(struct flow_offload *flow,
 }
 EXPORT_SYMBOL_GPL(flow_offload_route_init);
 
-static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
-{
-	tcp->state = TCP_CONNTRACK_ESTABLISHED;
-	tcp->seen[0].td_maxwin = 0;
-	tcp->seen[1].td_maxwin = 0;
-}
 
-static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
+static void flow_offload_fixup_ct(struct nf_conn *ct)
 {
 	struct net *net = nf_ct_net(ct);
 	int l4num = nf_ct_protonum(ct);
@@ -192,8 +186,12 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
 
 	if (l4num == IPPROTO_TCP) {
 		struct nf_tcp_net *tn = nf_tcp_pernet(net);
+		struct ip_ct_tcp *tcp = &ct->proto.tcp;
+
+		tcp->seen[0].td_maxwin = 0;
+		tcp->seen[1].td_maxwin = 0;
 
-		timeout = tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
+		timeout = tn->timeouts[ct->proto.tcp.state];
 		timeout -= tn->offload_timeout;
 	} else if (l4num == IPPROTO_UDP) {
 		struct nf_udp_net *tn = nf_udp_pernet(net);
@@ -211,18 +209,6 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
 	WRITE_ONCE(ct->timeout, nfct_time_stamp + timeout);
 }
 
-static void flow_offload_fixup_ct_state(struct nf_conn *ct)
-{
-	if (nf_ct_protonum(ct) == IPPROTO_TCP)
-		flow_offload_fixup_tcp(&ct->proto.tcp);
-}
-
-static void flow_offload_fixup_ct(struct nf_conn *ct)
-{
-	flow_offload_fixup_ct_state(ct);
-	flow_offload_fixup_ct_timeout(ct);
-}
-
 static void flow_offload_route_release(struct flow_offload *flow)
 {
 	nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
@@ -353,6 +339,10 @@ static inline bool nf_flow_has_expired(const struct flow_offload *flow)
 static void flow_offload_del(struct nf_flowtable *flow_table,
 			     struct flow_offload *flow)
 {
+	struct nf_conn *ct = flow->ct;
+
+	set_bit(IPS_OFFLOAD_TEARDOWN_BIT, &flow->ct->status);
+
 	rhashtable_remove_fast(&flow_table->rhashtable,
 			       &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
 			       nf_flow_offload_rhash_params);
@@ -360,12 +350,11 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
 			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
 			       nf_flow_offload_rhash_params);
 
-	clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
 
-	if (nf_flow_has_expired(flow))
-		flow_offload_fixup_ct(flow->ct);
-	else
-		flow_offload_fixup_ct_timeout(flow->ct);
+	flow_offload_fixup_ct(ct);
+
+	clear_bit(IPS_OFFLOAD_BIT, &ct->status);
+	clear_bit(IPS_OFFLOAD_TEARDOWN_BIT, &ct->status);
 
 	flow_offload_free(flow);
 }
@@ -373,8 +362,9 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
 void flow_offload_teardown(struct flow_offload *flow)
 {
 	set_bit(NF_FLOW_TEARDOWN, &flow->flags);
+	set_bit(IPS_OFFLOAD_TEARDOWN_BIT, &flow->ct->status);
 
-	flow_offload_fixup_ct_state(flow->ct);
+	flow_offload_fixup_ct(flow->ct);
 }
 EXPORT_SYMBOL_GPL(flow_offload_teardown);
 
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index 900d48c810a1..9cc3ea08eb3a 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -295,6 +295,8 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
 					  sizeof(_tcph), &_tcph);
 		if (unlikely(!tcph || tcph->fin || tcph->rst))
 			goto out;
+		if (unlikely(!nf_conntrack_tcp_established(ct)))
+			goto out;
 		break;
 	case IPPROTO_UDP:
 		break;
-- 
1.8.3.1