Eric Dumazet a écrit : > Eric Dumazet a écrit : >> But even if sysctl_tw_reuse is cleared, we might trigger the bug if >> local port is bound to a value. > > Oh well, that's more subtle than that. > > __inet_check_established() is called not only with bh disabled, > but also with a lock on bind list if twp != NULL. > > However, if twp is NULL, lock is not held by caller. > > [ Thats the final > ret = check_established(death_row, sk, snum, NULL); > in __inet_hash_connect()] > > So triggering this bug with tw_reuse clear is tricky : > > You need several threads, using sockets with REUSEADDR set, > and bind() to same address/port before connect() to same target. > > We need another patch to correct this. > Here is a separate patch for this issue, cooked on top of net-next-2.6 for testing purposes, and public discussion. Thanks [PATCH net-next-2.6] tcp: connect() race with timewait reuse Its currently possible that several threads issuing a connect() find the same timewait socket and try to reuse it, leading to list corruptions. Condition for bug is that these threads bound their socket on same address/port of to be found timewait socket, and connected to same target. (SO_REUSEADDR needed) To fix this problem, we could unhash timewait socket while holding ehash lock, to make sure lookups/changes will be serialized. Only first one find the timewait socket, other ones find the established socket and return an EADDRNOTAVAIL error. Signed-off-by: Eric Dumazet <eric.dumazet@xxxxxxxxx> --- include/net/inet_timewait_sock.h | 2 + net/ipv4/inet_hashtables.c | 7 +++-- net/ipv4/inet_timewait_sock.c | 36 ++++++++++++++++++++--------- net/ipv6/inet6_hashtables.c | 12 +++++---- 4 files changed, 39 insertions(+), 18 deletions(-) diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 773b10f..59c80a0 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -199,6 +199,8 @@ static inline __be32 inet_rcv_saddr(const struct sock *sk) extern void inet_twsk_put(struct inet_timewait_sock *tw); +extern void inet_twsk_unhash(struct inet_timewait_sock *tw); + extern struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 94ef51a..143ddb4 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -318,20 +318,21 @@ unique: sk->sk_hash = hash; WARN_ON(!sk_unhashed(sk)); __sk_nulls_add_node_rcu(sk, &head->chain); + if (tw) { + inet_twsk_unhash(tw); + NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED); + } spin_unlock(lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); if (twp) { *twp = tw; - NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED); } else if (tw) { /* Silly. Should hash-dance instead... */ inet_twsk_deschedule(tw, death_row); - NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED); inet_twsk_put(tw); } - return 0; not_unique: diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 1f5d508..680d09b 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -14,6 +14,21 @@ #include <net/inet_timewait_sock.h> #include <net/ip.h> + +/* + * unhash a timewait socket from established hash + * lock must be hold by caller + */ +void inet_twsk_unhash(struct inet_timewait_sock *tw) +{ + if (hlist_nulls_unhashed(&tw->tw_node)) + return; + + hlist_nulls_del_rcu(&tw->tw_node); + sk_nulls_node_init(&tw->tw_node); + inet_twsk_put(tw); +} + /* Must be called with locally disabled BHs. */ static void __inet_twsk_kill(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo) @@ -24,12 +39,9 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw, spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash); spin_lock(lock); - if (hlist_nulls_unhashed(&tw->tw_node)) { - spin_unlock(lock); - return; - } - hlist_nulls_del_rcu(&tw->tw_node); - sk_nulls_node_init(&tw->tw_node); + + inet_twsk_unhash(tw); + spin_unlock(lock); /* Disassociate with bind bucket. */ @@ -37,9 +49,11 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw, hashinfo->bhash_size)]; spin_lock(&bhead->lock); tb = tw->tw_tb; - __hlist_del(&tw->tw_bind_node); - tw->tw_tb = NULL; - inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); + if (tb) { + __hlist_del(&tw->tw_bind_node); + tw->tw_tb = NULL; + inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); + } spin_unlock(&bhead->lock); #ifdef SOCK_REFCNT_DEBUG if (atomic_read(&tw->tw_refcnt) != 1) { @@ -47,7 +61,8 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw, tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt)); } #endif - inet_twsk_put(tw); + if (tb) + inet_twsk_put(tw); } static noinline void inet_twsk_free(struct inet_timewait_sock *tw) @@ -92,6 +107,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, tw->tw_tb = icsk->icsk_bind_hash; WARN_ON(!icsk->icsk_bind_hash); inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); + atomic_inc(&tw->tw_refcnt); spin_unlock(&bhead->lock); spin_lock(lock); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 00c6a3e..3681c00 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -250,19 +250,21 @@ unique: * in hash table socket with a funny identity. */ inet->inet_num = lport; inet->inet_sport = htons(lport); + sk->sk_hash = hash; WARN_ON(!sk_unhashed(sk)); __sk_nulls_add_node_rcu(sk, &head->chain); - sk->sk_hash = hash; + if (tw) { + inet_twsk_unhash(tw); + NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED); + } spin_unlock(lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); - if (twp != NULL) { + if (twp) { *twp = tw; - NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED); - } else if (tw != NULL) { + } else if (tw) { /* Silly. Should hash-dance instead... */ inet_twsk_deschedule(tw, death_row); - NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED); inet_twsk_put(tw); } -- To unsubscribe from this list: send the line "unsubscribe netfilter" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html