> On Sat, 2007-06-23 at 08:25 -0400, Steven Rostedt wrote: > >> --- linux-2.6.21.orig/net/ipv4/tcp_ipv4.c 2007-06-17 17:19:02.000000000 +0200 >> +++ linux-2.6.21/net/ipv4/tcp_ipv4.c 2007-06-17 17:20:27.000000000 +0200 >> @@ -2033,7 +2033,12 @@ static void *established_get_first(struc >> struct tcp_iter_state* st = seq->private; >> void *rc = NULL; >> >> - for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) { >> >> The above is a linear search throughout a very large array, where most >> of the items are NULL. I believe it was Lee that noticed this creating >> a large latency. This was back in 2.6.14. I'll check to see if this >> still is a source of latency with the latest kernels. > > It looks fairly generic, is it a latency that could be fixed the same way > upstream? > > Daniel > Hi Steven, I believe that we should push the following patch to upstream (netdev). (This is a patch which extracted a part of the hash code from the real-time patch.) What do you think? Masayuki Nakagawa Index: linux-2.6/include/net/inet_hashtables.h =================================================================== --- linux-2.6.orig/include/net/inet_hashtables.h +++ linux-2.6/include/net/inet_hashtables.h @@ -101,6 +101,7 @@ struct inet_hashinfo { * TIME_WAIT sockets use a separate chain (twchain). */ struct inet_ehash_bucket *ehash; + unsigned long *ebitmask; /* Ok, let's try this, I give up, we do need a local binding * TCP hash as well as the others for fast bind/connect. 
@@ -135,6 +136,12 @@ static inline struct inet_ehash_bucket * return &hashinfo->ehash[hash & (hashinfo->ehash_size - 1)]; } +static inline unsigned int inet_ehash_index( + struct inet_hashinfo *hashinfo, unsigned int hash) +{ + return hash & (hashinfo->ehash_size - 1); +} + extern struct inet_bind_bucket * inet_bind_bucket_create(struct kmem_cache *cachep, struct inet_bind_hashbucket *head, @@ -207,11 +214,27 @@ static inline void inet_listen_unlock(st wake_up(&hashinfo->lhash_wait); } +static inline void __inet_hash_setbit(unsigned long *bitmask, + unsigned int index) +{ + if (bitmask) + set_bit(index, bitmask); +} + +static inline void __inet_hash_clearbit(unsigned long *bitmask, + unsigned int index) +{ + if (bitmask) + clear_bit(index, bitmask); +} + static inline void __inet_hash(struct inet_hashinfo *hashinfo, struct sock *sk, const int listen_possible) { struct hlist_head *list; rwlock_t *lock; + unsigned long *bitmask = NULL; + unsigned int index = 0; BUG_TRAP(sk_unhashed(sk)); if (listen_possible && sk->sk_state == TCP_LISTEN) { @@ -221,12 +244,15 @@ static inline void __inet_hash(struct in } else { struct inet_ehash_bucket *head; sk->sk_hash = inet_sk_ehashfn(sk); + index = inet_ehash_index(hashinfo, sk->sk_hash); head = inet_ehash_bucket(hashinfo, sk->sk_hash); list = &head->chain; lock = &head->lock; + bitmask = hashinfo->ebitmask; write_lock(lock); } __sk_add_node(sk, list); + __inet_hash_setbit(bitmask, index); sock_prot_inc_use(sk->sk_prot); write_unlock(lock); if (listen_possible && sk->sk_state == TCP_LISTEN) @@ -245,6 +271,8 @@ static inline void inet_hash(struct inet static inline void inet_unhash(struct inet_hashinfo *hashinfo, struct sock *sk) { rwlock_t *lock; + unsigned long *bitmask = NULL; + unsigned int index = 0; if (sk_unhashed(sk)) goto out; @@ -254,12 +282,16 @@ static inline void inet_unhash(struct in inet_listen_wlock(hashinfo); lock = &hashinfo->lhash_lock; } else { + index = inet_ehash_index(hashinfo, sk->sk_hash); lock = 
&inet_ehash_bucket(hashinfo, sk->sk_hash)->lock; + bitmask = hashinfo->ebitmask; write_lock_bh(lock); } - if (__sk_del_node_init(sk)) + if (__sk_del_node_init(sk)) { + __inet_hash_clearbit(bitmask, index); sock_prot_dec_use(sk->sk_prot); + } write_unlock_bh(lock); out: if (sk->sk_state == TCP_LISTEN) Index: linux-2.6/mm/page_alloc.c =================================================================== --- linux-2.6.orig/mm/page_alloc.c +++ linux-2.6/mm/page_alloc.c @@ -3919,6 +3919,33 @@ void *__init alloc_large_system_hash(con return table; } +void *__init alloc_large_system_bitmask(char *bitmaskname, + unsigned long bits, int flags) +{ + unsigned long words = bits / (sizeof(unsigned long)*8); + unsigned long size = words * sizeof(unsigned long); + unsigned long *bitmask = NULL; + + if (flags & HASH_EARLY) + bitmask = alloc_bootmem(size); + else if (hashdist) + bitmask = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); + else { + bitmask = kmalloc(size, GFP_ATOMIC); + if (!bitmask) { + unsigned long order; + for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++) + ; + bitmask = (void*) __get_free_pages(GFP_ATOMIC, order); + } + } + + if (!bitmask) + panic("Failed to allocate %s bitmask\n", bitmaskname); + + return bitmask; +} + #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE struct page *pfn_to_page(unsigned long pfn) { Index: linux-2.6/net/ipv4/tcp.c =================================================================== --- linux-2.6.orig/net/ipv4/tcp.c +++ linux-2.6/net/ipv4/tcp.c @@ -2418,6 +2418,9 @@ static int __init set_thash_entries(char } __setup("thash_entries=", set_thash_entries); +void *__init alloc_large_system_bitmask(char *bitmaskname, + unsigned long bits, int flags); + void __init tcp_init(void) { struct sk_buff *skb = NULL; @@ -2449,6 +2452,10 @@ void __init tcp_init(void) NULL, 0); tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size; + tcp_hashinfo.ebitmask = + alloc_large_system_bitmask("TCP established", + tcp_hashinfo.ehash_size, + 0); for (i = 0; i < 
tcp_hashinfo.ehash_size; i++) { rwlock_init(&tcp_hashinfo.ehash[i].lock); INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain); Index: linux-2.6/net/ipv4/tcp_ipv4.c =================================================================== --- linux-2.6.orig/net/ipv4/tcp_ipv4.c +++ linux-2.6/net/ipv4/tcp_ipv4.c @@ -2040,7 +2040,12 @@ static void *established_get_first(struc struct tcp_iter_state* st = seq->private; void *rc = NULL; - for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) { + for (st->bucket = find_first_bit(tcp_hashinfo.ebitmask, + tcp_hashinfo.ehash_size); + st->bucket < tcp_hashinfo.ehash_size; + st->bucket = find_next_bit(tcp_hashinfo.ebitmask, + tcp_hashinfo.ehash_size, + st->bucket+1)) { struct sock *sk; struct hlist_node *node; struct inet_timewait_sock *tw; Index: linux-2.6/net/ipv6/inet6_hashtables.c =================================================================== --- linux-2.6.orig/net/ipv6/inet6_hashtables.c +++ linux-2.6/net/ipv6/inet6_hashtables.c @@ -27,6 +27,8 @@ void __inet6_hash(struct inet_hashinfo * { struct hlist_head *list; rwlock_t *lock; + unsigned long *bitmask = NULL; + unsigned int index = 0; BUG_TRAP(sk_unhashed(sk)); @@ -35,15 +37,16 @@ void __inet6_hash(struct inet_hashinfo * lock = &hashinfo->lhash_lock; inet_listen_wlock(hashinfo); } else { - unsigned int hash; - sk->sk_hash = hash = inet6_sk_ehashfn(sk); - hash &= (hashinfo->ehash_size - 1); - list = &hashinfo->ehash[hash].chain; - lock = &hashinfo->ehash[hash].lock; + sk->sk_hash = inet6_sk_ehashfn(sk); + index = inet_ehash_index(hashinfo, sk->sk_hash); + list = &hashinfo->ehash[index].chain; + lock = &hashinfo->ehash[index].lock; + bitmask = hashinfo->ebitmask; write_lock(lock); } __sk_add_node(sk, list); + __inet_hash_setbit(bitmask, index); sock_prot_inc_use(sk->sk_prot); write_unlock(lock); } - To unsubscribe from this list: send the line "unsubscribe linux-rt-users" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at 
http://vger.kernel.org/majordomo-info.html