It is possible that two concurrent packets originating from the same socket of a connection-less protocol (e.g. UDP) can end up having different IP_CT_DIR_REPLY tuples which results in one of the packets being dropped. To illustrate this, consider the following simplified scenario: 1. No DNAT/SNAT/MASQUEARADE rules are installed, but the nf_nat module is loaded. 2. Packet A and B are sent at the same time from two different threads via the same UDP socket which hasn't been used before (=no CT has been created before). Both packets have the same IP_CT_DIR_ORIGINAL tuple. 3. CT of A has been created and confirmed, afterwards get_unique_tuple is called for B. Because IP_CT_DIR_REPLY tuple (the inverse of the IP_CT_DIR_ORIGINAL tuple) is already taken by the A's confirmed CT (nf_nat_used_tuple finds it), get_unique_tuple calls UDP's unique_tuple which returns a different IP_CT_DIR_REPLY tuple (usually with src port = 1024) 4. B's CT cannot get confirmed in __nf_conntrack_confirm due to the found IP_CT_DIR_ORIGINAL tuple of A and the different IP_CT_DIR_REPLY tuples, thus the packet B gets dropped. This patch modifies get_unique_tuple in a way that the function might return the already used by a confirmed CT reply tuple if a L4 protocol allows the clash resolution and IP_CT_DIR_ORIGINAL tuples are equal. Signed-off-by: Martynas Pumputis <martynas@weave.works> --- I've tested the patch with https://github.com/brb/conntrack-race in three different scenarios (no NAT, SNAT, DNAT) by checking "insert_failed" and "drop" counters, PCAP traces and dynamic debug logs. --- include/net/netfilter/nf_conntrack.h | 5 ++-- include/net/netfilter/nf_nat.h | 3 ++- net/ipv4/netfilter/nf_nat_proto_gre.c | 2 +- net/ipv4/netfilter/nf_nat_proto_icmp.c | 2 +- net/ipv6/netfilter/nf_nat_proto_icmpv6.c | 2 +- net/netfilter/nf_conntrack_core.c | 12 ++++++--- net/netfilter/nf_nat_core.c | 34 +++++++++++++++++++----- net/netfilter/nf_nat_proto_common.c | 2 +- 8 files changed, 46 insertions(+), 16 deletions(-) diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 062dc19b5840..498d5d8159f5 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -135,8 +135,9 @@ void nf_conntrack_alter_reply(struct nf_conn *ct, /* Is this tuple taken? (ignoring any belonging to the given conntrack). */ -int nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, - const struct nf_conn *ignored_conntrack); +int nf_conntrack_reply_tuple_taken(const struct nf_conntrack_tuple *tuple, + const struct nf_conn *ignored_conntrack, + bool ignore_same_orig); #define NFCT_INFOMASK 7UL #define NFCT_PTRMASK ~(NFCT_INFOMASK) diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h index a17eb2f8d40e..fee9737a65a7 100644 --- a/include/net/netfilter/nf_nat.h +++ b/include/net/netfilter/nf_nat.h @@ -49,7 +49,8 @@ struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct); /* Is this tuple already taken? (not by us)*/ int nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, - const struct nf_conn *ignored_conntrack); + const struct nf_conn *ignored_conntrack, + bool ignore_same_orig); static inline struct nf_conn_nat *nfct_nat(const struct nf_conn *ct) { diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c index 00fda6331ce5..c3083b68d3c2 100644 --- a/net/ipv4/netfilter/nf_nat_proto_gre.c +++ b/net/ipv4/netfilter/nf_nat_proto_gre.c @@ -72,7 +72,7 @@ gre_unique_tuple(const struct nf_nat_l3proto *l3proto, for (i = 0; ; ++key) { *keyptr = htons(min + key % range_size); - if (++i == range_size || !nf_nat_used_tuple(tuple, ct)) + if (++i == range_size || !nf_nat_used_tuple(tuple, ct, false)) return; } diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c index 6d7cf1d79baf..589e9a9b5509 100644 --- a/net/ipv4/netfilter/nf_nat_proto_icmp.c +++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c @@ -47,7 +47,7 @@ icmp_unique_tuple(const struct nf_nat_l3proto *l3proto, for (i = 0; ; ++id) { tuple->src.u.icmp.id = htons(ntohs(range->min_proto.icmp.id) + (id % range_size)); - if (++i == range_size || !nf_nat_used_tuple(tuple, ct)) + if (++i == range_size || !nf_nat_used_tuple(tuple, ct, false)) return; } return; diff --git a/net/ipv6/netfilter/nf_nat_proto_icmpv6.c b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c index d9bf42ba44fa..cf47f5f549ee 100644 --- a/net/ipv6/netfilter/nf_nat_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c @@ -49,7 +49,7 @@ icmpv6_unique_tuple(const struct nf_nat_l3proto *l3proto, for (i = 0; ; ++id) { tuple->src.u.icmp.id = htons(ntohs(range->min_proto.icmp.id) + (id % range_size)); - if (++i == range_size || !nf_nat_used_tuple(tuple, ct)) + if (++i == range_size || !nf_nat_used_tuple(tuple, ct, false)) return; } } diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index cc5ef8c9d0da..43a53eaff82c 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -848,8 +848,9 @@ EXPORT_SYMBOL_GPL(__nf_conntrack_confirm); /* Returns true if a connection correspondings to the tuple (required for NAT). */ int -nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, - const struct nf_conn *ignored_conntrack) +nf_conntrack_reply_tuple_taken(const struct nf_conntrack_tuple *tuple, + const struct nf_conn *ignored_conntrack, + bool ignore_same_orig) { struct net *net = nf_ct_net(ignored_conntrack); const struct nf_conntrack_zone *zone; @@ -878,6 +879,11 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, } if (nf_ct_key_equal(h, tuple, zone, net)) { + if (ignore_same_orig && + nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)) { + continue; + } NF_CT_STAT_INC_ATOMIC(net, found); rcu_read_unlock(); return 1; @@ -893,7 +899,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, return 0; } -EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken); +EXPORT_SYMBOL_GPL(nf_conntrack_reply_tuple_taken); #define NF_CT_EVICTION_RANGE 8 diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 46f9df99d276..12fb39d953e0 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -154,7 +154,8 @@ hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple) /* Is this tuple already taken? (not by us) */ int nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, - const struct nf_conn *ignored_conntrack) + const struct nf_conn *ignored_conntrack, + bool ignore_same_orig) { /* Conntrack tracking doesn't keep track of outgoing tuples; only * incoming ones. NAT means they don't have a fixed mapping, @@ -165,7 +166,15 @@ nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple reply; nf_ct_invert_tuplepr(&reply, tuple); - return nf_conntrack_tuple_taken(&reply, ignored_conntrack); + /* If ignore_same_orig is enabled, the following function will ignore + * any matching CT with the same IP_CT_DIR_ORIGINAL tuple. + * + * Used when calling the function for a CT of a connection-less protocol + * such as UDP to ignore a clashing CT which originated from the same + * socket. + */ + return nf_conntrack_reply_tuple_taken(&reply, ignored_conntrack, + ignore_same_orig); } EXPORT_SYMBOL(nf_nat_used_tuple); @@ -323,7 +332,9 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone; const struct nf_nat_l3proto *l3proto; const struct nf_nat_l4proto *l4proto; + const struct nf_conntrack_l4proto *ct_l4proto; struct net *net = nf_ct_net(ct); + bool ignore_same_orig = false; zone = nf_ct_zone(ct); @@ -331,6 +342,16 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, l3proto = __nf_nat_l3proto_find(orig_tuple->src.l3num); l4proto = __nf_nat_l4proto_find(orig_tuple->src.l3num, orig_tuple->dst.protonum); + ct_l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + + /* If the protocol allows the clash resolution, then when searching + * for clashing CTs ignore the ones with the same IP_CT_DIR_ORIGINAL + * tuple as they originate from the same socket. This prevents from + * generating different reply tuples for two racing packets from + * the same connection-less (e.g. UDP) socket which results in dropping + * one of the packets in __nf_conntrack_confirm. + */ + ignore_same_orig = ct_l4proto->allow_clash; /* 1) If this srcip/proto/src-proto-part is currently mapped, * and that same mapping gives a unique tuple within the given @@ -344,14 +365,15 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { /* try the original tuple first */ if (in_range(l3proto, l4proto, orig_tuple, range)) { - if (!nf_nat_used_tuple(orig_tuple, ct)) { + if (!nf_nat_used_tuple(orig_tuple, ct, + ignore_same_orig)) { *tuple = *orig_tuple; goto out; } } else if (find_appropriate_src(net, zone, l3proto, l4proto, orig_tuple, tuple, range)) { pr_debug("get_unique_tuple: Found current src map\n"); - if (!nf_nat_used_tuple(tuple, ct)) + if (!nf_nat_used_tuple(tuple, ct, ignore_same_orig)) goto out; } } @@ -372,9 +394,9 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, &range->min_proto, &range->max_proto) && (range->min_proto.all == range->max_proto.all || - !nf_nat_used_tuple(tuple, ct))) + !nf_nat_used_tuple(tuple, ct, ignore_same_orig))) goto out; - } else if (!nf_nat_used_tuple(tuple, ct)) { + } else if (!nf_nat_used_tuple(tuple, ct, ignore_same_orig)) { goto out; } } diff --git a/net/netfilter/nf_nat_proto_common.c b/net/netfilter/nf_nat_proto_common.c index 5d849d835561..851517cdfbd7 100644 --- a/net/netfilter/nf_nat_proto_common.c +++ b/net/netfilter/nf_nat_proto_common.c @@ -91,7 +91,7 @@ void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto, for (i = 0; ; ++off) { *portptr = htons(min + off % range_size); - if (++i != range_size && nf_nat_used_tuple(tuple, ct)) + if (++i != range_size && nf_nat_used_tuple(tuple, ct, false)) continue; if (!(range->flags & (NF_NAT_RANGE_PROTO_RANDOM_ALL| NF_NAT_RANGE_PROTO_OFFSET))) -- 2.18.0 -- To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html