David S. Miller wrote:
> Posting a patch without describing why the change is being
> made is always a bad idea. I delete such emails. I'm stupid
> and I need to be told what a patch does and why.

Sorry, I didn't mean to waste your time.
-
The current ip_fragment evictor kills the oldest entry of each hash bucket,
starting with bucket 0, instead of killing the oldest entry across all buckets.
This leads to unfair behaviour if one of the higher hash slots carries a lot of
fragments. This patch keeps the frag heads in an LRU queue so we can kill the
least recently used first. Each arriving fragment counts as usage.
Patrick
diff -urN linux-2.4.20-clean/net/ipv4/ip_fragment.c linux-2.4.20/net/ipv4/ip_fragment.c --- linux-2.4.20-clean/net/ipv4/ip_fragment.c 2002-02-25 20:38:14.000000000 +0100 +++ linux-2.4.20/net/ipv4/ip_fragment.c 2003-02-16 05:28:17.000000000 +0100 @@ -19,6 +19,7 @@ * Bill Hawes : Frag accounting and evictor fixes. * John McDonald : 0 length frag bug. * Alexey Kuznetsov: SMP races, threading, cleanup. + * Patrick McHardy : LRU queue of frag heads for evictor. */ #include <linux/config.h> @@ -26,6 +27,7 @@ #include <linux/mm.h> #include <linux/sched.h> #include <linux/skbuff.h> +#include <linux/list.h> #include <linux/ip.h> #include <linux/icmp.h> #include <linux/netdevice.h> @@ -67,6 +69,7 @@ /* Describe an entry in the "incomplete datagrams" queue. */ struct ipq { struct ipq *next; /* linked list pointers */ + struct list_head lru_list; /* lru list member */ u32 saddr; u32 daddr; u16 id; @@ -94,6 +97,7 @@ /* Per-bucket lock is easy to add now. */ static struct ipq *ipq_hash[IPQ_HASHSZ]; static rwlock_t ipfrag_lock = RW_LOCK_UNLOCKED; +static LIST_HEAD(ipq_lru_list); int ip_frag_nqueues = 0; static __inline__ void __ipq_unlink(struct ipq *qp) @@ -101,6 +105,7 @@ if(qp->next) qp->next->pprev = qp->pprev; *qp->pprev = qp->next; + list_del(&qp->lru_list); ip_frag_nqueues--; } @@ -202,39 +207,30 @@ */ static void ip_evictor(void) { - int i, progress; + struct ipq *qp; + struct list_head *tmp; - do { + for(;;) { if (atomic_read(&ip_frag_mem) <= sysctl_ipfrag_low_thresh) return; - progress = 0; - /* FIXME: Make LRU queue of frag heads. 
-DaveM */ - for (i = 0; i < IPQ_HASHSZ; i++) { - struct ipq *qp; - if (ipq_hash[i] == NULL) - continue; - - read_lock(&ipfrag_lock); - if ((qp = ipq_hash[i]) != NULL) { - /* find the oldest queue for this hash bucket */ - while (qp->next) - qp = qp->next; - atomic_inc(&qp->refcnt); - read_unlock(&ipfrag_lock); - - spin_lock(&qp->lock); - if (!(qp->last_in&COMPLETE)) - ipq_kill(qp); - spin_unlock(&qp->lock); - - ipq_put(qp); - IP_INC_STATS_BH(IpReasmFails); - progress = 1; - continue; - } + read_lock(&ipfrag_lock); + if (list_empty(&ipq_lru_list)) { read_unlock(&ipfrag_lock); + return; } - } while (progress); + tmp = ipq_lru_list.next; + qp = list_entry(tmp, struct ipq, lru_list); + atomic_inc(&qp->refcnt); + read_unlock(&ipfrag_lock); + + spin_lock(&qp->lock); + if (!(qp->last_in&COMPLETE)) + ipq_kill(qp); + spin_unlock(&qp->lock); + + ipq_put(qp); + IP_INC_STATS_BH(IpReasmFails); + } } /* @@ -302,6 +298,8 @@ qp->next->pprev = &qp->next; ipq_hash[hash] = qp; qp->pprev = &ipq_hash[hash]; + INIT_LIST_HEAD(&qp->lru_list); + list_add_tail(&qp->lru_list, &ipq_lru_list); ip_frag_nqueues++; write_unlock(&ipfrag_lock); return qp; @@ -496,6 +494,10 @@ if (offset == 0) qp->last_in |= FIRST_IN; + write_lock(&ipfrag_lock); + list_move_tail(&qp->lru_list, &ipq_lru_list); + write_unlock(&ipfrag_lock); + return; err: diff -urN ../ppp_filter/linux/linux-2.4.20-clean/net/ipv6/reassembly.c linux-2.4.20/net/ipv6/reassembly.c --- ../ppp_filter/linux/linux-2.4.20-clean/net/ipv6/reassembly.c 2002-11-29 00:53:15.000000000 +0100 +++ linux-2.4.20/net/ipv6/reassembly.c 2003-02-16 05:39:50.000000000 +0100 @@ -22,6 +22,7 @@ * * Horst von Brand Add missing #include <linux/string.h> * Alexey Kuznetsov SMP races, threading, cleanup. + * Patrick McHardy LRU queue of frag heads for evictor. 
*/ #include <linux/config.h> #include <linux/errno.h> @@ -30,6 +31,7 @@ #include <linux/socket.h> #include <linux/sockios.h> #include <linux/sched.h> +#include <linux/list.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/in6.h> @@ -67,6 +69,7 @@ struct frag_queue { struct frag_queue *next; + struct list_head lru_list; /* lru list member */ __u32 id; /* fragment id */ struct in6_addr saddr; @@ -95,6 +98,7 @@ static struct frag_queue *ip6_frag_hash[IP6Q_HASHSZ]; static rwlock_t ip6_frag_lock = RW_LOCK_UNLOCKED; +static LIST_HEAD(ip6_frag_lru_list); int ip6_frag_nqueues = 0; static __inline__ void __fq_unlink(struct frag_queue *fq) @@ -102,6 +106,7 @@ if(fq->next) fq->next->pprev = fq->pprev; *fq->pprev = fq->next; + list_del(&fq->lru_list); ip6_frag_nqueues--; } @@ -193,38 +198,30 @@ static void ip6_evictor(void) { - int i, progress; + struct frag_queue *fq; + struct list_head *tmp; - do { + for(;;) { if (atomic_read(&ip6_frag_mem) <= sysctl_ip6frag_low_thresh) return; - progress = 0; - for (i = 0; i < IP6Q_HASHSZ; i++) { - struct frag_queue *fq; - if (ip6_frag_hash[i] == NULL) - continue; - - read_lock(&ip6_frag_lock); - if ((fq = ip6_frag_hash[i]) != NULL) { - /* find the oldest queue for this hash bucket */ - while (fq->next) - fq = fq->next; - atomic_inc(&fq->refcnt); - read_unlock(&ip6_frag_lock); - - spin_lock(&fq->lock); - if (!(fq->last_in&COMPLETE)) - fq_kill(fq); - spin_unlock(&fq->lock); - - fq_put(fq); - IP6_INC_STATS_BH(Ip6ReasmFails); - progress = 1; - continue; - } + read_lock(&ip6_frag_lock); + if (list_empty(&ip6_frag_lru_list)) { read_unlock(&ip6_frag_lock); + return; } - } while (progress); + tmp = ip6_frag_lru_list.next; + fq = list_entry(tmp, struct frag_queue, lru_list); + atomic_inc(&fq->refcnt); + read_unlock(&ip6_frag_lock); + + spin_lock(&fq->lock); + if (!(fq->last_in&COMPLETE)) + fq_kill(fq); + spin_unlock(&fq->lock); + + fq_put(fq); + IP6_INC_STATS_BH(Ip6ReasmFails); + } } static void ip6_frag_expire(unsigned long 
data) @@ -294,6 +291,8 @@ fq->next->pprev = &fq->next; ip6_frag_hash[hash] = fq; fq->pprev = &ip6_frag_hash[hash]; + INIT_LIST_HEAD(&fq->lru_list); + list_add_tail(&fq->lru_list, &ip6_frag_lru_list); ip6_frag_nqueues++; write_unlock(&ip6_frag_lock); return fq; @@ -501,6 +500,9 @@ fq->nhoffset = nhoff; fq->last_in |= FIRST_IN; } + write_lock(&ip6_frag_lock); + list_move_tail(&fq->lru_list, &ip6_frag_lru_list); + write_unlock(&ip6_frag_lock); return; err: