On Wed, 2024-02-21 at 17:45 +0100, Valentin Schneider wrote: > On 20/02/24 18:42, Eric Dumazet wrote: > > On Tue, Feb 20, 2024 at 6:38 PM Valentin Schneider <vschneid@xxxxxxxxxx> wrote: > > > Hm so that would indeed prevent a concurrent inet_twsk_schedule() from > > > re-arming the timer, but in case the calls are interleaved like so: > > > > > > tcp_time_wait() > > > inet_twsk_hashdance() > > > inet_twsk_deschedule_put() > > > timer_shutdown_sync() > > > inet_twsk_schedule() > > > > > > inet_twsk_hashdance() will have left the refcounts including a count for > > > the timer, and we won't arm the timer to clear it via the timer callback > > > (via inet_twsk_kill()) - the patch in its current form relies on the timer > > > being re-armed for that. > > > > > > I don't know if there's a cleaner way to do this, but we could catch that > > > in inet_twsk_schedule() and issue the inet_twsk_kill() directly if we can > > > tell the timer has been shutdown: > > > --- > > > diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c > > > index 61a053fbd329c..c272da5046bb4 100644 > > > --- a/net/ipv4/inet_timewait_sock.c > > > +++ b/net/ipv4/inet_timewait_sock.c > > > @@ -227,7 +227,7 @@ void inet_twsk_deschedule_put(struct inet_timewait_sock *tw) > > > * have already gone through {tcp,dcpp}_time_wait(), and we can safely > > > * call inet_twsk_kill(). > > > */ > > > - if (del_timer_sync(&tw->tw_timer)) > > > + if (timer_shutdown_sync(&tw->tw_timer)) > > > inet_twsk_kill(tw); > > > inet_twsk_put(tw); > > > } > > > @@ -267,6 +267,10 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) > > > LINUX_MIB_TIMEWAITED); > > > BUG_ON(mod_timer(&tw->tw_timer, jiffies + timeo)); > > > > Would not a shutdown timer return a wrong mod_timer() value here ? > > > > Instead of BUG_ON(), simply release the refcount ? > > > > Unfortunately a shutdown timer would return the same as a non-shutdown one: > > * Return: > * * %0 - The timer was inactive and started or was in shutdown > * state and the operation was discarded > > and now that you've pointed this out, I realize it's racy to check the > state of the timer after the mod_timer(): > > BUG_ON(mod_timer(&tw->tw_timer, jiffies + timeo)); > inet_twsk_deschedule_put() > timer_shutdown_sync() > inet_twsk_kill() > if (!tw->tw_timer.function) > inet_twsk_kill() > > > I've looked into messing about with the return values of mod_timer() to get > the info that the timer was shutdown, but the only justification for this > is that here we rely on the timer_base lock to serialize > inet_twsk_schedule() vs inet_twsk_deschedule_put(). > > AFAICT the alternative is adding local serialization like so, which I'm not > the biggest fan of but couldn't think of a neater approach: > --- > diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h > index f28da08a37b4e..39bb0c148d4ee 100644 > --- a/include/net/inet_timewait_sock.h > +++ b/include/net/inet_timewait_sock.h > @@ -75,6 +75,7 @@ struct inet_timewait_sock { > struct timer_list tw_timer; > struct inet_bind_bucket *tw_tb; > struct inet_bind2_bucket *tw_tb2; > + struct spinlock tw_timer_lock; > }; > #define tw_tclass tw_tos > > diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c > index 61a053fbd329c..2471516f9c61d 100644 > --- a/net/ipv4/inet_timewait_sock.c > +++ b/net/ipv4/inet_timewait_sock.c > @@ -193,6 +193,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, > atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie)); > twsk_net_set(tw, sock_net(sk)); > timer_setup(&tw->tw_timer, tw_timer_handler, 0); > + spin_lock_init(&tw->tw_timer_lock); > /* > * Because we use RCU lookups, we should not set tw_refcnt > * to a non null value before everything is setup for this > @@ -227,8 +228,11 @@ void inet_twsk_deschedule_put(struct inet_timewait_sock *tw) > * have already gone through {tcp,dcpp}_time_wait(), and we can safely > * call inet_twsk_kill(). > */ > - if (del_timer_sync(&tw->tw_timer)) > + spin_lock(&tw->tw_timer_lock); > + if (timer_shutdown_sync(&tw->tw_timer)) > inet_twsk_kill(tw); > + spin_unlock(&tw->tw_timer_lock); > + > inet_twsk_put(tw); > } > EXPORT_SYMBOL(inet_twsk_deschedule_put); > @@ -262,11 +266,25 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) > > if (!rearm) { > bool kill = timeo <= 4*HZ; > + bool pending; > > __NET_INC_STATS(twsk_net(tw), kill ? LINUX_MIB_TIMEWAITKILLED : > LINUX_MIB_TIMEWAITED); > + spin_lock(&tw->tw_timer_lock); > BUG_ON(mod_timer(&tw->tw_timer, jiffies + timeo)); > + pending = timer_pending(&tw->tw_timer); > refcount_inc(&tw->tw_dr->tw_refcount); > + > + /* > + * If the timer didn't become pending under tw_timer_lock, then > + * it means it has been shutdown by inet_twsk_deschedule_put() > + * prior to this invocation. All that remains is to clean up the > + * timewait. > + */ > + if (!pending) > + inet_twsk_kill(tw); > + > + spin_unlock(&tw->tw_timer_lock); > } else { > mod_timer_pending(&tw->tw_timer, jiffies + timeo); > } I understand that adding another lock here is a no-go. I'm wondering if we could move the inet_twsk_schedule() under the ehash lock, to avoid the need for acquiring the additional tw reference and thus avoid the ref leak when inet_twsk_deschedule_put() clashes with tcp_time_wait(). Something alike the following (completely untested!!!): WDYT? --- diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index f28da08a37b4..d696d10dc8ae 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -93,8 +93,10 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, struct inet_timewait_death_row *dr, const int state); -void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, - struct inet_hashinfo *hashinfo); +void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw, + struct sock *sk, + struct inet_hashinfo *hashinfo, + int timeo); void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm); diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 64d805b27add..8e108a89d8e4 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -58,11 +58,10 @@ void dccp_time_wait(struct sock *sk, int state, int timeo) * we complete the initialization. */ local_bh_disable(); - inet_twsk_schedule(tw, timeo); /* Linkage updates. * Note that access to tw after this point is illegal. */ - inet_twsk_hashdance(tw, sk, &dccp_hashinfo); + inet_twsk_hashdance_schedule(tw, sk, &dccp_hashinfo, timeo); local_bh_enable(); } else { /* Sorry, if we're out of memory, just CLOSE this diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index e8de45d34d56..dd314b06c0cd 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -97,8 +97,10 @@ static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw, * Essentially we whip up a timewait bucket, copy the relevant info into it * from the SK, and mess with hash chains and list linkage. */ -void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, - struct inet_hashinfo *hashinfo) +void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw, + struct sock *sk, + struct inet_hashinfo *hashinfo, + int timeo) { const struct inet_sock *inet = inet_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); @@ -135,6 +137,8 @@ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, if (__sk_nulls_del_node_init_rcu(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + inet_twsk_schedule(tw, timeo); + spin_unlock(lock); /* tw_refcnt is set to 3 because we have : @@ -148,7 +152,7 @@ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, */ refcount_set(&tw->tw_refcnt, 3); } -EXPORT_SYMBOL_GPL(inet_twsk_hashdance); +EXPORT_SYMBOL_GPL(inet_twsk_hashdance_schedule); static void tw_timer_handler(struct timer_list *t) { @@ -217,7 +221,7 @@ EXPORT_SYMBOL_GPL(inet_twsk_alloc); */ void inet_twsk_deschedule_put(struct inet_timewait_sock *tw) { - if (del_timer_sync(&tw->tw_timer)) + if (timer_shutdown_sync(&tw->tw_timer)) inet_twsk_kill(tw); inet_twsk_put(tw); } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f0761f060a83..8d5ec48a1837 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -343,11 +343,10 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) * we complete the initialization. */ local_bh_disable(); - inet_twsk_schedule(tw, timeo); /* Linkage updates. * Note that access to tw after this point is illegal. */ - inet_twsk_hashdance(tw, sk, net->ipv4.tcp_death_row.hashinfo); + inet_twsk_hashdance_schedule(tw, sk, net->ipv4.tcp_death_row.hashinfo, timeo); local_bh_enable(); } else { /* Sorry, if we're out of memory, just CLOSE this