Re: [PATCH 2.6.21.5-rt17] IPV6: established connections are not shown with "cat /proc/net/tcp6"

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



> On Sat, 2007-06-23 at 08:25 -0400, Steven Rostedt wrote:
> 
>> --- linux-2.6.21.orig/net/ipv4/tcp_ipv4.c       2007-06-17 17:19:02.000000000 +0200
>> +++ linux-2.6.21/net/ipv4/tcp_ipv4.c    2007-06-17 17:20:27.000000000 +0200
>> @@ -2033,7 +2033,12 @@ static void *established_get_first(struc
>>         struct tcp_iter_state* st = seq->private;
>>         void *rc = NULL;
>> 
>> -       for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
>> 
>> The above is a linear search through a very large array, where most
>> of the items are NULL.  I believe it was Lee that noticed this creating
>> a large latency. This was back in 2.6.14. I'll check to see if this
>> still is a source of latency with the latest kernels.
> 
> It looks fairly generic; is it a latency that could be fixed the same way
> upstream?
> 
> Daniel
>
Hi Steven,

I believe that we should push the following patch upstream (netdev).
(This patch extracts a part of the hash code from the real-time patch.)
What do you think?

Masayuki Nakagawa

Index: linux-2.6/include/net/inet_hashtables.h
===================================================================
--- linux-2.6.orig/include/net/inet_hashtables.h
+++ linux-2.6/include/net/inet_hashtables.h
@@ -101,6 +101,7 @@ struct inet_hashinfo {
 	 * TIME_WAIT sockets use a separate chain (twchain).
 	 */
 	struct inet_ehash_bucket	*ehash;
+	unsigned long			*ebitmask;

 	/* Ok, let's try this, I give up, we do need a local binding
 	 * TCP hash as well as the others for fast bind/connect.
@@ -135,6 +136,12 @@ static inline struct inet_ehash_bucket *
 	return &hashinfo->ehash[hash & (hashinfo->ehash_size - 1)];
 }

+static inline unsigned int inet_ehash_index(
+	struct inet_hashinfo *hashinfo, unsigned int hash)
+{
+	return hash & (hashinfo->ehash_size - 1);
+}
+
 extern struct inet_bind_bucket *
 		    inet_bind_bucket_create(struct kmem_cache *cachep,
 					    struct inet_bind_hashbucket *head,
@@ -207,11 +214,27 @@ static inline void inet_listen_unlock(st
 		wake_up(&hashinfo->lhash_wait);
 }

+static inline void __inet_hash_setbit(unsigned long *bitmask,
+	unsigned int index)
+{
+	if (bitmask)
+		set_bit(index, bitmask);
+}
+
+static inline void __inet_hash_clearbit(unsigned long *bitmask,
+	unsigned int index)
+{
+	if (bitmask)
+		clear_bit(index, bitmask);
+}
+
 static inline void __inet_hash(struct inet_hashinfo *hashinfo,
 			       struct sock *sk, const int listen_possible)
 {
 	struct hlist_head *list;
 	rwlock_t *lock;
+	unsigned long *bitmask = NULL;
+	unsigned int index = 0;

 	BUG_TRAP(sk_unhashed(sk));
 	if (listen_possible && sk->sk_state == TCP_LISTEN) {
@@ -221,12 +244,15 @@ static inline void __inet_hash(struct in
 	} else {
 		struct inet_ehash_bucket *head;
 		sk->sk_hash = inet_sk_ehashfn(sk);
+		index = inet_ehash_index(hashinfo, sk->sk_hash);
 		head = inet_ehash_bucket(hashinfo, sk->sk_hash);
 		list = &head->chain;
 		lock = &head->lock;
+		bitmask = hashinfo->ebitmask;
 		write_lock(lock);
 	}
 	__sk_add_node(sk, list);
+	__inet_hash_setbit(bitmask, index);
 	sock_prot_inc_use(sk->sk_prot);
 	write_unlock(lock);
 	if (listen_possible && sk->sk_state == TCP_LISTEN)
@@ -245,6 +271,8 @@ static inline void inet_hash(struct inet
 static inline void inet_unhash(struct inet_hashinfo *hashinfo, struct sock *sk)
 {
 	rwlock_t *lock;
+	unsigned long *bitmask = NULL;
+	unsigned int index = 0;

 	if (sk_unhashed(sk))
 		goto out;
@@ -254,12 +282,16 @@ static inline void inet_unhash(struct in
 		inet_listen_wlock(hashinfo);
 		lock = &hashinfo->lhash_lock;
 	} else {
+		index = inet_ehash_index(hashinfo, sk->sk_hash);
 		lock = &inet_ehash_bucket(hashinfo, sk->sk_hash)->lock;
+		bitmask = hashinfo->ebitmask;
 		write_lock_bh(lock);
 	}

-	if (__sk_del_node_init(sk))
+	if (__sk_del_node_init(sk)) {
+		__inet_hash_clearbit(bitmask, index);
 		sock_prot_dec_use(sk->sk_prot);
+	}
 	write_unlock_bh(lock);
 out:
 	if (sk->sk_state == TCP_LISTEN)
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c
+++ linux-2.6/mm/page_alloc.c
@@ -3919,6 +3919,33 @@ void *__init alloc_large_system_hash(con
 	return table;
 }

+void *__init alloc_large_system_bitmask(char *bitmaskname,
+					unsigned long bits, int flags)
+{
+	unsigned long words = bits / (sizeof(unsigned long)*8);
+	unsigned long size = words * sizeof(unsigned long);
+	unsigned long *bitmask = NULL;
+
+	if (flags & HASH_EARLY)
+		bitmask = alloc_bootmem(size);
+	else if (hashdist)
+		bitmask = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
+	else {
+		bitmask = kmalloc(size, GFP_ATOMIC);
+		if (!bitmask) {
+			unsigned long order;
+			for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
+				;
+			bitmask = (void*) __get_free_pages(GFP_ATOMIC, order);
+		}
+	}
+
+	if (!bitmask)
+		panic("Failed to allocate %s bitmask\n", bitmaskname);
+
+	return bitmask;
+}
+
 #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
 struct page *pfn_to_page(unsigned long pfn)
 {
Index: linux-2.6/net/ipv4/tcp.c
===================================================================
--- linux-2.6.orig/net/ipv4/tcp.c
+++ linux-2.6/net/ipv4/tcp.c
@@ -2418,6 +2418,9 @@ static int __init set_thash_entries(char
 }
 __setup("thash_entries=", set_thash_entries);

+void *__init alloc_large_system_bitmask(char *bitmaskname,
+					unsigned long bits, int flags);
+
 void __init tcp_init(void)
 {
 	struct sk_buff *skb = NULL;
@@ -2449,6 +2452,10 @@ void __init tcp_init(void)
 					NULL,
 					0);
 	tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size;
+	tcp_hashinfo.ebitmask =
+		alloc_large_system_bitmask("TCP established",
+					tcp_hashinfo.ehash_size,
+					0);
 	for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
 		rwlock_init(&tcp_hashinfo.ehash[i].lock);
 		INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
Index: linux-2.6/net/ipv4/tcp_ipv4.c
===================================================================
--- linux-2.6.orig/net/ipv4/tcp_ipv4.c
+++ linux-2.6/net/ipv4/tcp_ipv4.c
@@ -2040,7 +2040,12 @@ static void *established_get_first(struc
 	struct tcp_iter_state* st = seq->private;
 	void *rc = NULL;

-	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
+	for (st->bucket = find_first_bit(tcp_hashinfo.ebitmask,
+					tcp_hashinfo.ehash_size);
+		st->bucket < tcp_hashinfo.ehash_size;
+		st->bucket = find_next_bit(tcp_hashinfo.ebitmask,
+					tcp_hashinfo.ehash_size,
+					st->bucket+1)) {
 		struct sock *sk;
 		struct hlist_node *node;
 		struct inet_timewait_sock *tw;
Index: linux-2.6/net/ipv6/inet6_hashtables.c
===================================================================
--- linux-2.6.orig/net/ipv6/inet6_hashtables.c
+++ linux-2.6/net/ipv6/inet6_hashtables.c
@@ -27,6 +27,8 @@ void __inet6_hash(struct inet_hashinfo *
 {
 	struct hlist_head *list;
 	rwlock_t *lock;
+	unsigned long *bitmask = NULL;
+	unsigned int index = 0;

 	BUG_TRAP(sk_unhashed(sk));

@@ -35,15 +37,16 @@ void __inet6_hash(struct inet_hashinfo *
 		lock = &hashinfo->lhash_lock;
 		inet_listen_wlock(hashinfo);
 	} else {
-		unsigned int hash;
-		sk->sk_hash = hash = inet6_sk_ehashfn(sk);
-		hash &= (hashinfo->ehash_size - 1);
-		list = &hashinfo->ehash[hash].chain;
-		lock = &hashinfo->ehash[hash].lock;
+		sk->sk_hash = inet6_sk_ehashfn(sk);
+		index = inet_ehash_index(hashinfo, sk->sk_hash);
+		list = &hashinfo->ehash[index].chain;
+		lock = &hashinfo->ehash[index].lock;
+		bitmask = hashinfo->ebitmask;
 		write_lock(lock);
 	}

 	__sk_add_node(sk, list);
+	__inet_hash_setbit(bitmask, index);
 	sock_prot_inc_use(sk->sk_prot);
 	write_unlock(lock);
 }
-
To unsubscribe from this list: send the line "unsubscribe linux-rt-users" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [RT Stable]     [Kernel Newbies]     [IDE]     [Security]     [Git]     [Netfilter]     [Bugtraq]     [Yosemite]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux ATA RAID]     [Samba]     [Video 4 Linux]     [Device Mapper]

  Powered by Linux