Replace the offset-based approach for tracking progress through a
bucket in the TCP table with one based on unique, monotonically
increasing index numbers associated with each socket in a bucket.
A plain offset can go stale if sockets are added to or removed from
the bucket between reads, causing the iterator to repeat or skip
sockets; resuming at the first socket whose index is greater than
the last one visited keeps iteration consistent.

Signed-off-by: Jordan Rife <jrife@xxxxxxxxxx>
---
 include/net/inet_hashtables.h |  2 ++
 include/net/tcp.h             |  3 ++-
 net/ipv4/inet_hashtables.c    | 18 +++++++++++++++---
 net/ipv4/tcp.c                |  1 +
 net/ipv4/tcp_ipv4.c           | 29 ++++++++++++++++-------------
 5 files changed, 36 insertions(+), 17 deletions(-)

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 5eea47f135a4..c95d3b1da199 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -172,6 +172,8 @@ struct inet_hashinfo {
 	struct inet_listen_hashbucket	*lhash2;
 
 	bool				pernet;
+
+	atomic64_t			ver;
 } ____cacheline_aligned_in_smp;
 
 static inline struct inet_hashinfo *tcp_or_dccp_get_hashinfo(const struct sock *sk)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 2d08473a6dc0..499acd6da35f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2202,7 +2202,8 @@ struct tcp_iter_state {
 	struct seq_net_private	p;
 	enum tcp_seq_states	state;
 	struct sock		*syn_wait_sk;
-	int			bucket, offset, sbucket, num;
+	int			bucket, sbucket, num;
+	__s64			prev_idx;
 	loff_t			last_pos;
 };
 
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 9bfcfd016e18..bc9f58172790 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -534,6 +534,12 @@ struct sock *__inet_lookup_established(const struct net *net,
 }
 EXPORT_SYMBOL_GPL(__inet_lookup_established);
 
+static inline __s64 inet_hashinfo_next_idx(struct inet_hashinfo *hinfo,
+					   bool pos)
+{
+	return (pos ? 1 : -1) * atomic64_inc_return(&hinfo->ver);
+}
+
 /* called with local bh disabled */
 static int __inet_check_established(struct inet_timewait_death_row *death_row,
 				    struct sock *sk, __u16 lport,
@@ -581,6 +587,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
 	sk->sk_hash = hash;
 	WARN_ON(!sk_unhashed(sk));
 	__sk_nulls_add_node_rcu(sk, &head->chain);
+	sk->sk_idx = inet_hashinfo_next_idx(hinfo, false);
 	if (tw) {
 		sk_nulls_del_node_init_rcu((struct sock *)tw);
 		__NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED);
@@ -678,8 +685,10 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk)
 			ret = false;
 	}
 
-	if (ret)
+	if (ret) {
 		__sk_nulls_add_node_rcu(sk, list);
+		sk->sk_idx = inet_hashinfo_next_idx(hashinfo, false);
+	}
 
 	spin_unlock(lock);
 
@@ -729,6 +738,7 @@ int __inet_hash(struct sock *sk, struct sock *osk)
 {
 	struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
 	struct inet_listen_hashbucket *ilb2;
+	bool add_tail;
 	int err = 0;
 
 	if (sk->sk_state != TCP_LISTEN) {
@@ -747,11 +757,13 @@ int __inet_hash(struct sock *sk, struct sock *osk)
 		goto unlock;
 	}
 	sock_set_flag(sk, SOCK_RCU_FREE);
-	if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
-	    sk->sk_family == AF_INET6)
+	add_tail = IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
+		   sk->sk_family == AF_INET6;
+	if (add_tail)
 		__sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
 	else
 		__sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
+	sk->sk_idx = inet_hashinfo_next_idx(hashinfo, add_tail);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 unlock:
 	spin_unlock(&ilb2->lock);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 285678d8ce07..63693af0c05c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -5147,6 +5147,7 @@ void __init tcp_init(void)
 
 	cnt = tcp_hashinfo.ehash_mask + 1;
 	sysctl_tcp_max_orphans = cnt / 2;
+	atomic64_set(&tcp_hashinfo.ver, 0);
 
 	tcp_init_mem();
 	/* Set per-socket limits to no more than 1/128 the pressure threshold */
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2632844d2c35..d0ddb307e2a1 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2602,7 +2602,7 @@ static void *listening_get_first(struct seq_file *seq)
 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
 	struct tcp_iter_state *st = seq->private;
 
-	st->offset = 0;
+	st->prev_idx = 0;
 	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
 		struct inet_listen_hashbucket *ilb2;
 		struct hlist_nulls_node *node;
@@ -2637,7 +2637,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
 	struct sock *sk = cur;
 
 	++st->num;
-	++st->offset;
+	st->prev_idx = sk->sk_idx;
 
 	sk = sk_nulls_next(sk);
 	sk_nulls_for_each_from(sk, node) {
@@ -2658,7 +2658,6 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
 	void *rc;
 
 	st->bucket = 0;
-	st->offset = 0;
 	rc = listening_get_first(seq);
 
 	while (rc && *pos) {
@@ -2683,7 +2682,7 @@ static void *established_get_first(struct seq_file *seq)
 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
 	struct tcp_iter_state *st = seq->private;
 
-	st->offset = 0;
+	st->prev_idx = 0;
 	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
 		struct sock *sk;
 		struct hlist_nulls_node *node;
@@ -2714,7 +2713,7 @@ static void *established_get_next(struct seq_file *seq, void *cur)
 	struct sock *sk = cur;
 
 	++st->num;
-	++st->offset;
+	st->prev_idx = sk->sk_idx;
 
 	sk = sk_nulls_next(sk);
 
@@ -2763,8 +2762,8 @@ static void *tcp_seek_last_pos(struct seq_file *seq)
 {
 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
 	struct tcp_iter_state *st = seq->private;
+	__s64 prev_idx = st->prev_idx;
 	int bucket = st->bucket;
-	int offset = st->offset;
 	int orig_num = st->num;
 	void *rc = NULL;
 
@@ -2773,18 +2772,21 @@ static void *tcp_seek_last_pos(struct seq_file *seq)
 		if (st->bucket > hinfo->lhash2_mask)
 			break;
 		rc = listening_get_first(seq);
-		while (offset-- && rc && bucket == st->bucket)
+		while (rc && bucket == st->bucket && prev_idx &&
+		       ((struct sock *)rc)->sk_idx <= prev_idx)
 			rc = listening_get_next(seq, rc);
 		if (rc)
 			break;
 		st->bucket = 0;
+		prev_idx = 0;
 		st->state = TCP_SEQ_STATE_ESTABLISHED;
 		fallthrough;
 	case TCP_SEQ_STATE_ESTABLISHED:
 		if (st->bucket > hinfo->ehash_mask)
 			break;
 		rc = established_get_first(seq);
-		while (offset-- && rc && bucket == st->bucket)
+		while (rc && bucket == st->bucket && prev_idx &&
+		       ((struct sock *)rc)->sk_idx <= prev_idx)
 			rc = established_get_next(seq, rc);
 	}
 
@@ -2807,7 +2809,7 @@ void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
 	st->state = TCP_SEQ_STATE_LISTENING;
 	st->num = 0;
 	st->bucket = 0;
-	st->offset = 0;
+	st->prev_idx = 0;
 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
 
 out:
@@ -2832,7 +2834,7 @@ void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 		if (!rc) {
 			st->state = TCP_SEQ_STATE_ESTABLISHED;
 			st->bucket = 0;
-			st->offset = 0;
+			st->prev_idx = 0;
 			rc = established_get_first(seq);
 		}
 		break;
@@ -3124,7 +3126,7 @@ static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
 	 * it has to advance to the next bucket.
 	 */
 	if (iter->st_bucket_done) {
-		st->offset = 0;
+		st->prev_idx = 0;
 		st->bucket++;
 		if (st->state == TCP_SEQ_STATE_LISTENING &&
 		    st->bucket > hinfo->lhash2_mask) {
@@ -3192,8 +3194,9 @@ static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	 * the future start() will resume at st->offset in
 	 * st->bucket. See tcp_seek_last_pos().
 	 */
-		st->offset++;
-		sock_gen_put(iter->batch[iter->cur_sk++]);
+		sk = iter->batch[iter->cur_sk++];
+		st->prev_idx = sk->sk_idx;
+		sock_gen_put(sk);
 	}
 
 	if (iter->cur_sk < iter->end_sk)
-- 
2.49.0.rc1.451.g8f38331e32-goog
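For reviewers, a minimal standalone illustration of the resume strategy
the patch adopts (userspace C, not kernel code; the names entry,
bucket_resume and prev_idx below are hypothetical): every element
carries a monotonically increasing index assigned at insertion time,
and iteration resumes at the first element whose index is strictly
greater than the last one reported. Unlike a plain offset, this stays
correct when earlier elements disappear between reads. Because
inet_hashinfo_next_idx() hands out negative values for head insertions
and positive ones for tail insertions, indices within a bucket increase
from head to tail; the sketch simply assumes such an already-ordered
list.

	/* Simplified userspace model; not part of the patch. */
	#include <stdint.h>
	#include <stdio.h>

	struct entry {
		int64_t idx;		/* plays the role of sk->sk_idx */
		struct entry *next;
	};

	/* Return the first entry whose index is greater than prev_idx,
	 * mirroring the "sk_idx <= prev_idx" skip loop in tcp_seek_last_pos().
	 */
	static struct entry *bucket_resume(struct entry *head, int64_t prev_idx)
	{
		struct entry *e;

		for (e = head; e; e = e->next)
			if (e->idx > prev_idx)
				return e;
		return NULL;
	}

	int main(void)
	{
		struct entry c = { .idx = 3, .next = NULL };
		struct entry b = { .idx = 2, .next = &c };
		struct entry a = { .idx = 1, .next = &b };
		int64_t prev_idx = 2;	/* last element reported was b */
		struct entry *next;

		/* Resuming by index lands on c (idx 3); a stale offset of 2
		 * would have skipped c had a been removed in the meantime.
		 */
		next = bucket_resume(&a, prev_idx);
		printf("resume at idx %lld\n", next ? (long long)next->idx : -1LL);
		return 0;
	}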