[RFC PATCH bpf-next 2/3] bpf: tcp: Avoid socket skips during iteration

Replace the offset-based approach for tracking progress through a
bucket in the TCP table with one based on a unique, monotonically
increasing index number associated with each socket in a bucket.

With the offset-based approach, if sockets are removed from a bucket
between reads, the iterator resumes at a fixed position and may skip
past sockets that shifted into lower offsets, so they are never
reported. By remembering the index of the last socket it visited and
resuming from the first socket with a greater index, the iterator
skips only sockets it has already seen. Indices are drawn from a
per-hashtable 64-bit counter: sockets inserted at the head of a chain
take negative, decreasing values and sockets appended at the tail take
positive, increasing values, so indices always increase along a chain.

Signed-off-by: Jordan Rife <jrife@xxxxxxxxxx>
---
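As an illustrative aside for reviewers (not part of the patch): the
resume logic described above boils down to the sketch below, mirroring
the while loops in tcp_seek_last_pos(). It assumes the sk_idx field
that patch 1/3 of this series adds to struct sock; resume_bucket() is
a hypothetical helper, not code from this series.

	/* Resume iteration of a bucket after the last-seen socket.
	 * sk_idx values increase along each chain, so the first
	 * socket whose index is greater than prev_idx is the first
	 * one the iterator has not yet reported. Sockets removed
	 * from the chain in the meantime no longer cause later
	 * sockets to be skipped, as a fixed offset would.
	 */
	static struct sock *resume_bucket(struct sock *first, __s64 prev_idx)
	{
		struct hlist_nulls_node *node;
		struct sock *sk = first;

		sk_nulls_for_each_from(sk, node)
			if (!prev_idx || sk->sk_idx > prev_idx)
				return sk;
		return NULL;
	}

With prev_idx == 0 (a fresh bucket) this degenerates to returning the
first socket, matching listening_get_first()/established_get_first().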
 include/net/inet_hashtables.h |  2 ++
 include/net/tcp.h             |  3 ++-
 net/ipv4/inet_hashtables.c    | 22 +++++++++++++++++++---
 net/ipv4/tcp.c                |  1 +
 net/ipv4/tcp_ipv4.c           | 31 +++++++++++++++++--------------
 5 files changed, 41 insertions(+), 18 deletions(-)

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 5eea47f135a4..c95d3b1da199 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -172,6 +172,8 @@ struct inet_hashinfo {
 	struct inet_listen_hashbucket	*lhash2;
 
 	bool				pernet;
+
+	atomic64_t			ver;
 } ____cacheline_aligned_in_smp;
 
 static inline struct inet_hashinfo *tcp_or_dccp_get_hashinfo(const struct sock *sk)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 2d08473a6dc0..499acd6da35f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2202,7 +2202,8 @@ struct tcp_iter_state {
 	struct seq_net_private	p;
 	enum tcp_seq_states	state;
 	struct sock		*syn_wait_sk;
-	int			bucket, offset, sbucket, num;
+	int			bucket, sbucket, num;
+	__s64			prev_idx;
 	loff_t			last_pos;
 };
 
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 9bfcfd016e18..bc9f58172790 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -534,6 +534,16 @@ struct sock *__inet_lookup_established(const struct net *net,
 }
 EXPORT_SYMBOL_GPL(__inet_lookup_established);
 
+/* Allocate a unique index for a socket. Head inserts take negative,
+ * decreasing values; tail inserts take positive, increasing values.
+ * Indices therefore always increase along each chain.
+ */
+static inline __s64 inet_hashinfo_next_idx(struct inet_hashinfo *hinfo,
+					   bool pos)
+{
+	return (pos ? 1 : -1) * atomic64_inc_return(&hinfo->ver);
+}
+
 /* called with local bh disabled */
 static int __inet_check_established(struct inet_timewait_death_row *death_row,
 				    struct sock *sk, __u16 lport,
@@ -581,6 +587,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
 	sk->sk_hash = hash;
 	WARN_ON(!sk_unhashed(sk));
 	__sk_nulls_add_node_rcu(sk, &head->chain);
+	sk->sk_idx = inet_hashinfo_next_idx(hinfo, false);
 	if (tw) {
 		sk_nulls_del_node_init_rcu((struct sock *)tw);
 		__NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED);
@@ -678,8 +685,10 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk)
 			ret = false;
 	}
 
-	if (ret)
+	if (ret) {
 		__sk_nulls_add_node_rcu(sk, list);
+		sk->sk_idx = inet_hashinfo_next_idx(hashinfo, false);
+	}
 
 	spin_unlock(lock);
 
@@ -729,6 +738,7 @@ int __inet_hash(struct sock *sk, struct sock *osk)
 {
 	struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
 	struct inet_listen_hashbucket *ilb2;
+	bool add_tail;
 	int err = 0;
 
 	if (sk->sk_state != TCP_LISTEN) {
@@ -747,11 +757,13 @@ int __inet_hash(struct sock *sk, struct sock *osk)
 			goto unlock;
 	}
 	sock_set_flag(sk, SOCK_RCU_FREE);
-	if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
-		sk->sk_family == AF_INET6)
+	add_tail = IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
+		   sk->sk_family == AF_INET6;
+	if (add_tail)
 		__sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
 	else
 		__sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
+	sk->sk_idx = inet_hashinfo_next_idx(hashinfo, add_tail);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 unlock:
 	spin_unlock(&ilb2->lock);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 285678d8ce07..63693af0c05c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -5147,6 +5147,7 @@ void __init tcp_init(void)
 
 	cnt = tcp_hashinfo.ehash_mask + 1;
 	sysctl_tcp_max_orphans = cnt / 2;
+	atomic64_set(&tcp_hashinfo.ver, 0);
 
 	tcp_init_mem();
 	/* Set per-socket limits to no more than 1/128 the pressure threshold */
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2632844d2c35..d0ddb307e2a1 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2602,7 +2602,7 @@ static void *listening_get_first(struct seq_file *seq)
 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
 	struct tcp_iter_state *st = seq->private;
 
-	st->offset = 0;
+	st->prev_idx = 0;
 	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
 		struct inet_listen_hashbucket *ilb2;
 		struct hlist_nulls_node *node;
@@ -2637,7 +2637,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
 	struct sock *sk = cur;
 
 	++st->num;
-	++st->offset;
+	st->prev_idx = sk->sk_idx;
 
 	sk = sk_nulls_next(sk);
 	sk_nulls_for_each_from(sk, node) {
@@ -2658,7 +2658,6 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
 	void *rc;
 
 	st->bucket = 0;
-	st->offset = 0;
 	rc = listening_get_first(seq);
 
 	while (rc && *pos) {
@@ -2683,7 +2682,7 @@ static void *established_get_first(struct seq_file *seq)
 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
 	struct tcp_iter_state *st = seq->private;
 
-	st->offset = 0;
+	st->prev_idx = 0;
 	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
 		struct sock *sk;
 		struct hlist_nulls_node *node;
@@ -2714,7 +2713,7 @@ static void *established_get_next(struct seq_file *seq, void *cur)
 	struct sock *sk = cur;
 
 	++st->num;
-	++st->offset;
+	st->prev_idx = sk->sk_idx;
 
 	sk = sk_nulls_next(sk);
 
@@ -2763,8 +2762,8 @@ static void *tcp_seek_last_pos(struct seq_file *seq)
 {
 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
 	struct tcp_iter_state *st = seq->private;
+	__s64 prev_idx = st->prev_idx;
 	int bucket = st->bucket;
-	int offset = st->offset;
 	int orig_num = st->num;
 	void *rc = NULL;
 
@@ -2773,18 +2772,21 @@ static void *tcp_seek_last_pos(struct seq_file *seq)
 		if (st->bucket > hinfo->lhash2_mask)
 			break;
 		rc = listening_get_first(seq);
-		while (offset-- && rc && bucket == st->bucket)
+		while (rc && bucket == st->bucket && prev_idx &&
+		       ((struct sock *)rc)->sk_idx <= prev_idx)
 			rc = listening_get_next(seq, rc);
 		if (rc)
 			break;
 		st->bucket = 0;
+		prev_idx = 0;
 		st->state = TCP_SEQ_STATE_ESTABLISHED;
 		fallthrough;
 	case TCP_SEQ_STATE_ESTABLISHED:
 		if (st->bucket > hinfo->ehash_mask)
 			break;
 		rc = established_get_first(seq);
-		while (offset-- && rc && bucket == st->bucket)
+		while (rc && bucket == st->bucket && prev_idx &&
+		       ((struct sock *)rc)->sk_idx <= prev_idx)
 			rc = established_get_next(seq, rc);
 	}
 
@@ -2807,7 +2809,7 @@ void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
 	st->state = TCP_SEQ_STATE_LISTENING;
 	st->num = 0;
 	st->bucket = 0;
-	st->offset = 0;
+	st->prev_idx = 0;
 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
 
 out:
@@ -2832,7 +2834,7 @@ void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 		if (!rc) {
 			st->state = TCP_SEQ_STATE_ESTABLISHED;
 			st->bucket = 0;
-			st->offset = 0;
+			st->prev_idx = 0;
 			rc	  = established_get_first(seq);
 		}
 		break;
@@ -3124,7 +3126,7 @@ static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
 	 * it has to advance to the next bucket.
 	 */
 	if (iter->st_bucket_done) {
-		st->offset = 0;
+		st->prev_idx = 0;
 		st->bucket++;
 		if (st->state == TCP_SEQ_STATE_LISTENING &&
 		    st->bucket > hinfo->lhash2_mask) {
@@ -3192,8 +3194,9 @@ static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-		 * the future start() will resume at st->offset in
+		 * the future start() will resume after st->prev_idx in
 		 * st->bucket.  See tcp_seek_last_pos().
 		 */
-		st->offset++;
-		sock_gen_put(iter->batch[iter->cur_sk++]);
+		sk = iter->batch[iter->cur_sk++];
+		st->prev_idx = sk->sk_idx;
+		sock_gen_put(sk);
 	}
 
 	if (iter->cur_sk < iter->end_sk)
-- 
2.49.0.rc1.451.g8f38331e32-goog