Re: debugging kernel during packet drops

Eric Dumazet <eric.dumazet@xxxxxxxxx> · Fri, 26 Mar 2010 12:21:48 +0100

Le vendredi 26 mars 2010 à 11:41 +0100, Jorrit Kronjee a écrit :
>    
> Eric,
> 

Hi Jorrit

> Awesome work, thanks for the effort! I've tried the patch and got some
> results. The drop rate was reduced dramatically after I activated RPS.
> 
> I did the same test I did before, namely I rebooted and started flooding
> the machine immediately after with 300 kpps. After 5 minutes, perf top
> looked like this:
> 
> -------------------------------------------------------------------------------------------------------------------------
>    PerfTop:    1962 irqs/sec  kernel:99.3% [1000Hz cycles],  (all, 4 CPUs)
> -------------------------------------------------------------------------------------------------------------------------
> 

we can see cpu power is better used now


>              samples  pcnt function                 DSO
>              _______ _____ ________________________
> _____________________________________________________________________
> 
>              4501.00 14.0% __ticket_spin_lock      
> /lib/modules/2.6.34-rc1-net-next/build/vmlinux

strange... it could be goot to try a callgraph report to check where we
spend so much time in spinlock

>              2985.00  9.3% dsthash_find            
> /lib/modules/2.6.34-rc1-net-next/kernel/net/netfilter/xt_hashlimit.ko
>              2346.00  7.3% __ticket_spin_unlock    
> /lib/modules/2.6.34-rc1-net-next/build/vmlinux
>              1354.00  4.2% e1000_xmit_frame        
> /lib/modules/2.6.34-rc1-net-next/kernel/drivers/net/e1000e/e1000e.ko
>              1070.00  3.3% __slab_free             
> /lib/modules/2.6.34-rc1-net-next/build/vmlinux
>               997.00  3.1% memcpy                  
> /lib/modules/2.6.34-rc1-net-next/build/vmlinux
>               809.00  2.5% dev_queue_xmit          
> /lib/modules/2.6.34-rc1-net-next/build/vmlinux
>               791.00  2.5% nf_iterate              
> /lib/modules/2.6.34-rc1-net-next/build/vmlinux
>               705.00  2.2% e1000_clean_tx_irq      
> /lib/modules/2.6.34-rc1-net-next/kernel/drivers/net/e1000e/e1000e.ko
>               634.00  2.0% nf_hook_slow            
> /lib/modules/2.6.34-rc1-net-next/build/vmlinux
>               624.00  1.9% skb_release_head_state  
> /lib/modules/2.6.34-rc1-net-next/build/vmlinux
>               584.00  1.8% e1000_intr              
> /lib/modules/2.6.34-rc1-net-next/kernel/drivers/net/e1000/e1000.ko
>               536.00  1.7% br_nf_pre_routing_finish
> /lib/modules/2.6.34-rc1-net-next/kernel/net/bridge/bridge.ko
>               528.00  1.6% nommu_map_page          
> /lib/modules/2.6.34-rc1-net-next/build/vmlinux
>               499.00  1.6% kfree                   
> /lib/modules/2.6.34-rc1-net-next/build/vmlinux
>               494.00  1.5% __netif_receive_skb     
> /lib/modules/2.6.34-rc1-net-next/build/vmlinux
>               472.00  1.5% __alloc_skb             
> /lib/modules/2.6.34-rc1-net-next/build/vmlinux
>               448.00  1.4% br_fdb_update           
> /lib/modules/2.6.34-rc1-net-next/kernel/net/bridge/bridge.ko
>               437.00  1.4% __slab_alloc            
> /lib/modules/2.6.34-rc1-net-next/build/vmlinux
>               428.00  1.3% ipt_do_table             [ip_tables]
>               403.00  1.3% memset                  
> /lib/modules/2.6.34-rc1-net-next/build/vmlinux
>               402.00  1.3% br_handle_frame         
> /lib/modules/2.6.34-rc1-net-next/kernel/net/bridge/bridge.ko
>               389.00  1.2% e1000_clean_rx_irq      
> /lib/modules/2.6.34-rc1-net-next/kernel/drivers/net/e1000/e1000.ko
>               388.00  1.2% e1000_clean             
> /lib/modules/2.6.34-rc1-net-next/kernel/drivers/net/e1000/e1000.ko
>               381.00  1.2% uhci_irq                
> /lib/modules/2.6.34-rc1-net-next/build/vmlinux
>               366.00  1.1% get_rps_cpu             
> /lib/modules/2.6.34-rc1-net-next/build/vmlinux
>               365.00  1.1% br_nf_pre_routing       
> /lib/modules/2.6.34-rc1-net-next/kernel/net/bridge/bridge.ko
>               349.00  1.1% dst_release             
> /lib/modules/2.6.34-rc1-net-next/build/vmlinux
> 
> And iptables-save -c produced this:
> # Generated by iptables-save v1.4.4 on Fri Mar 26 11:24:59 2010
> *filter
> :INPUT ACCEPT [1043:60514]
> :FORWARD ACCEPT [0:0]
> :OUTPUT ACCEPT [942:282723]
> [99563191:3783420610] -A FORWARD -m hashlimit --hashlimit-upto 10000/sec
> --hashlimit-burst 100 --hashlimit-mode dstip --hashlimit-name hashtable
> --hashlimit-htable-max 131072 --hashlimit-htable-expire 1000 -j ACCEPT

Hmm, maybe you should add --hashlimit-htable-size 32768  (or 65536)

Because the default is probably too small for your case.

> [0:0] -A FORWARD -m limit --limit 5/sec -j LOG --log-prefix "HASHLIMITED
> -- "
> [0:0] -A FORWARD -j DROP
> COMMIT
> # Completed on Fri Mar 26 11:24:59 2010
> 
> And /proc/interrupts looked like this:
>      CPU0       CPU1       CPU2       CPU3
>   0:         47          0          1          0   IO-APIC-edge      timer
>   1:          0          1          0          1   IO-APIC-edge      i8042
>   6:          1          1          0          0   IO-APIC-edge      floppy
>   8:          1          0          0          0   IO-APIC-edge      rtc0
>   9:          0          0          0          0   IO-APIC-fasteoi   acpi
>  12:          0          1          1          2   IO-APIC-edge      i8042
>  14:         21         22         22         21   IO-APIC-edge     
> ata_piix
>  15:          0          0          0          0   IO-APIC-edge     
> ata_piix
>  16:        492        464        463        474   IO-APIC-fasteoi   arcmsr
>  17:          0          0          0          0   IO-APIC-fasteoi  
> ehci_hcd:usb1
>  18:     971171     971391     948171     948663   IO-APIC-fasteoi  
> uhci_hcd:usb3, uhci_hcd:usb7, eth3
>  19:          0          0          0          0   IO-APIC-fasteoi  
> uhci_hcd:usb6
>  21:          0          0          0          0   IO-APIC-fasteoi  
> ata_piix, uhci_hcd:usb4
>  23:          1          0          1          0   IO-APIC-fasteoi  
> ehci_hcd:usb2, uhci_hcd:usb5
>  27:    1003145    1002952    1026174    1025671   PCI-MSI-edge      eth4
> NMI:     202553     185135     134999     185071   Non-maskable interrupts
> LOC:      20270      19227      17387      23282   Local timer interrupts
> SPU:          0          0          0          0   Spurious interrupts
> PMI:     202553     185135     134999     185071   Performance
> monitoring interrupts
> PND:     201464     183939     134067     184098   Performance pending work
> RES:       2216       2449       1212       1432   Rescheduling interrupts
> CAL:    2223380    2226493    2233481    2228957   Function call interrupts
> TLB:        606        584       1274       1216   TLB shootdowns
> TRM:          0          0          0          0   Thermal event interrupts
> THR:          0          0          0          0   Threshold APIC interrupts
> MCE:          0          0          0          0   Machine check exceptions
> MCP:          2          2          2          2   Machine check polls
> ERR:          3
> MIS:          0
> 
> ifconfig reported only 2 drops after these 5 minutes. I'm thinking about
> removing/changing the hashing algorithm to make dsthash_find faster. All
> I need after all is a match against a destination IP address. Also, I'd
> like the limit of 10kpps to be a bit higher. I'll see if I can work on
> that during the weekend.
> 

Nice !

I have another patch version (but not yet tested) that now is able to
make dynamic key, so that its length is shortened. In your case, only
dst is matched, key length is one u32.

So time to build key is short, time to compare is also short (not using
slow memcmp())

Its probably buggy as hell, dont even try it !

diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 9e9c489..09211fa 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -41,6 +41,54 @@ MODULE_DESCRIPTION("Xtables: per hash-bucket rate-limit match");
 MODULE_ALIAS("ipt_hashlimit");
 MODULE_ALIAS("ip6t_hashlimit");
 
+/* The algorithm used is the Simple Token Bucket Filter (TBF)
+ * see net/sched/sch_tbf.c in the linux source tree
+ */
+
+/* Rusty: This is my (non-mathematically-inclined) understanding of
+   this algorithm.  The `average rate' in jiffies becomes your initial
+   amount of credit `credit' and the most credit you can ever have
+   `credit_cap'.  The `peak rate' becomes the cost of passing the
+   test, `cost'.
+
+   `prev' tracks the last packet hit: you gain one credit per jiffy.
+   If you get credit balance more than this, the extra credit is
+   discarded.  Every time the match passes, you lose `cost' credits;
+   if you don't have that many, the test fails.
+
+   See Alexey's formal explanation in net/sched/sch_tbf.c.
+
+   To get the maximum range, we multiply by this factor (ie. you get N
+   credits per jiffy).  We want to allow a rate as low as 1 per day
+   (slowest userspace tool allows), which means
+   CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32 ie.
+*/
+#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24))
+
+/* Repeated shift and or gives us all 1s, final shift and add 1 gives
+ * us the power of 2 below the theoretical max, so GCC simply does a
+ * shift. */
+#define _POW2_BELOW2(x) ((x)|((x)>>1))
+#define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2))
+#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4))
+#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8))
+#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
+#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1)
+
+#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)
+
+/* Precision saver. */
+static inline u_int32_t
+user2credits(u_int32_t user)
+{
+	/* If multiplying would overflow... */
+	if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY))
+		/* Divide first. */
+		return (user / XT_HASHLIMIT_SCALE) * HZ * CREDITS_PER_JIFFY;
+
+	return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE;
+}
+
 struct hashlimit_net {
 	struct hlist_head	htables;
 	struct proc_dir_entry	*ipt_hashlimit;
@@ -56,36 +104,23 @@ static inline struct hashlimit_net *hashlimit_pernet(struct net *net)
 /* need to declare this at the top */
 static const struct file_operations dl_file_ops;
 
-/* hash table crap */
-struct dsthash_dst {
-	union {
-		struct {
-			__be32 src;
-			__be32 dst;
-		} ip;
 #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
-		struct {
-			__be32 src[4];
-			__be32 dst[4];
-		} ip6;
+#define MAXKEYSIZE (4 + 4 + 1)
+#else
+#define MAXKEYSIZE (1 + 1 + 1)
 #endif
-	};
-	__be16 src_port;
-	__be16 dst_port;
-};
 
 struct dsthash_ent {
-	/* static / read-only parts in the beginning */
-	struct hlist_node node;
-	struct dsthash_dst dst;
-
-	/* modified structure members in the end */
+	struct rcu_head rcu;
+	spinlock_t lock;
 	unsigned long expires;		/* precalculated expiry time */
 	struct {
 		unsigned long prev;	/* last modification */
 		u_int32_t credit;
-		u_int32_t credit_cap, cost;
+		u_int32_t cost;
 	} rateinfo;
+	struct hlist_node node;
+	u32    key[MAXKEYSIZE];
 };
 
 struct xt_hashlimit_htable {
@@ -95,10 +130,13 @@ struct xt_hashlimit_htable {
 	bool rnd_initialized;
 
 	struct hashlimit_cfg1 cfg;	/* config */
-
+	u_int32_t credit_cap;
+	__be32    ndstmask;
+	__be32    nsrcmask;
 	/* used internally */
-	spinlock_t lock;		/* lock for list_head */
 	u_int32_t rnd;			/* random seed for hash */
+	unsigned int sz32;		/* number of u32 in key[0] */
+	spinlock_t hlock;		/* lock for writes to hash[]/count/rnd */
 	unsigned int count;		/* number entries in table */
 	struct timer_list timer;	/* timer for gc */
 
@@ -112,18 +150,19 @@ struct xt_hashlimit_htable {
 static DEFINE_MUTEX(hashlimit_mutex);	/* protects htables list */
 static struct kmem_cache *hashlimit_cachep __read_mostly;
 
-static inline bool dst_cmp(const struct dsthash_ent *ent,
-			   const struct dsthash_dst *b)
+static bool key_cmp(const struct dsthash_ent *ent, const u32 *key,
+		    unsigned int sz32)
 {
-	return !memcmp(&ent->dst, b, sizeof(ent->dst));
+	unsigned int i;
+	u32 res = 0;
+	for (i = 0; i < sz32; i++)
+		res |= (ent->key[i] ^ key[i]);
+	return res == 0;
 }
 
-static u_int32_t
-hash_dst(const struct xt_hashlimit_htable *ht, const struct dsthash_dst *dst)
+static u32 hash_func(const struct xt_hashlimit_htable *ht, const u32 *key)
 {
-	u_int32_t hash = jhash2((const u32 *)dst,
-				sizeof(*dst)/sizeof(u32),
-				ht->rnd);
+	u32 hash = jhash2(key, ht->sz32, ht->rnd);
 	/*
 	 * Instead of returning hash % ht->cfg.size (implying a divide)
 	 * we return the high 32 bits of the (hash * ht->cfg.size) that will
@@ -135,62 +174,32 @@ hash_dst(const struct xt_hashlimit_htable *ht, const struct dsthash_dst *dst)
 
 static struct dsthash_ent *
 dsthash_find(const struct xt_hashlimit_htable *ht,
-	     const struct dsthash_dst *dst)
+	     const u32 *key, u32 hash)
 {
 	struct dsthash_ent *ent;
 	struct hlist_node *pos;
-	u_int32_t hash = hash_dst(ht, dst);
 
-	if (!hlist_empty(&ht->hash[hash])) {
-		hlist_for_each_entry(ent, pos, &ht->hash[hash], node)
-			if (dst_cmp(ent, dst))
-				return ent;
-	}
+	hlist_for_each_entry_rcu(ent, pos, &ht->hash[hash], node)
+		if (key_cmp(ent, key, ht->sz32)) {
+			spin_lock(&ent->lock);
+			return ent;
+		}
+
 	return NULL;
 }
 
-/* allocate dsthash_ent, initialize dst, put in htable and lock it */
-static struct dsthash_ent *
-dsthash_alloc_init(struct xt_hashlimit_htable *ht,
-		   const struct dsthash_dst *dst)
-{
-	struct dsthash_ent *ent;
-
-	/* initialize hash with random val at the time we allocate
-	 * the first hashtable entry */
-	if (!ht->rnd_initialized) {
-		get_random_bytes(&ht->rnd, sizeof(ht->rnd));
-		ht->rnd_initialized = true;
-	}
-
-	if (ht->cfg.max && ht->count >= ht->cfg.max) {
-		/* FIXME: do something. question is what.. */
-		if (net_ratelimit())
-			printk(KERN_WARNING
-				"xt_hashlimit: max count of %u reached\n",
-				ht->cfg.max);
-		return NULL;
-	}
 
-	ent = kmem_cache_alloc(hashlimit_cachep, GFP_ATOMIC);
-	if (!ent) {
-		if (net_ratelimit())
-			printk(KERN_ERR
-				"xt_hashlimit: can't allocate dsthash_ent\n");
-		return NULL;
-	}
-	memcpy(&ent->dst, dst, sizeof(ent->dst));
+static void dsthash_free_rcu(struct rcu_head *head)
+{
+	struct dsthash_ent *ent = container_of(head, struct dsthash_ent, rcu);
 
-	hlist_add_head(&ent->node, &ht->hash[hash_dst(ht, dst)]);
-	ht->count++;
-	return ent;
+	kmem_cache_free(hashlimit_cachep, ent);
 }
 
-static inline void
-dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent)
+static void dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent)
 {
-	hlist_del(&ent->node);
-	kmem_cache_free(hashlimit_cachep, ent);
+	hlist_del_rcu(&ent->node);
+	call_rcu_bh(&ent->rcu, dsthash_free_rcu);
 	ht->count--;
 }
 static void htable_gc(unsigned long htlong);
@@ -216,7 +225,7 @@ static int htable_create_v0(struct net *net, struct xt_hashlimit_info *minfo, u_
 	hinfo = vmalloc(sizeof(struct xt_hashlimit_htable) +
 			sizeof(struct list_head) * size);
 	if (!hinfo) {
-		printk(KERN_ERR "xt_hashlimit: unable to create hashtable\n");
+		pr_err("xt_hashlimit: unable to create hashtable\n");
 		return -1;
 	}
 	minfo->hinfo = hinfo;
@@ -229,9 +238,10 @@ static int htable_create_v0(struct net *net, struct xt_hashlimit_info *minfo, u_
 	hinfo->cfg.gc_interval = minfo->cfg.gc_interval;
 	hinfo->cfg.expire      = minfo->cfg.expire;
 
-	if (family == NFPROTO_IPV4)
+	if (family == NFPROTO_IPV4) {
 		hinfo->cfg.srcmask = hinfo->cfg.dstmask = 32;
-	else
+		hinfo->nsrcmask = hinfo->ndstmask = 0xFFFFFFFF;
+	} else
 		hinfo->cfg.srcmask = hinfo->cfg.dstmask = 128;
 
 	hinfo->cfg.size = size;
@@ -243,11 +253,16 @@ static int htable_create_v0(struct net *net, struct xt_hashlimit_info *minfo, u_
 	for (i = 0; i < hinfo->cfg.size; i++)
 		INIT_HLIST_HEAD(&hinfo->hash[i]);
 
+	hinfo->credit_cap = user2credits(hinfo->cfg.avg * hinfo->cfg.burst);
 	hinfo->use = 1;
 	hinfo->count = 0;
 	hinfo->family = family;
 	hinfo->rnd_initialized = false;
-	spin_lock_init(&hinfo->lock);
+
+	hinfo->sz32 = (family == NFPROTO_IPV4) ? (1 + 1) : (4 + 4); 
+	if (hinfo->cfg.mode & (XT_HASHLIMIT_HASH_DPT | XT_HASHLIMIT_HASH_SPT))
+		hinfo->sz32++;
+	spin_lock_init(&hinfo->hlock);
 	hinfo->pde = proc_create_data(minfo->name, 0,
 		(family == NFPROTO_IPV4) ?
 		hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit,
@@ -289,7 +304,7 @@ static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
 	hinfo = vmalloc(sizeof(struct xt_hashlimit_htable) +
 	                sizeof(struct list_head) * size);
 	if (hinfo == NULL) {
-		printk(KERN_ERR "xt_hashlimit: unable to create hashtable\n");
+		pr_err("xt_hashlimit: unable to create hashtable\n");
 		return -1;
 	}
 	minfo->hinfo = hinfo;
@@ -302,14 +317,22 @@ static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
 	else if (hinfo->cfg.max < hinfo->cfg.size)
 		hinfo->cfg.max = hinfo->cfg.size;
 
+	if (family == NFPROTO_IPV4) {
+		hinfo->nsrcmask = htonl(~0 << (32 - hinfo->cfg.srcmask));
+		hinfo->ndstmask = htonl(~0 << (32 - hinfo->cfg.dstmask));
+	}
 	for (i = 0; i < hinfo->cfg.size; i++)
 		INIT_HLIST_HEAD(&hinfo->hash[i]);
 
+	hinfo->credit_cap = user2credits(hinfo->cfg.avg * hinfo->cfg.burst);
 	hinfo->use = 1;
 	hinfo->count = 0;
 	hinfo->family = family;
 	hinfo->rnd_initialized = false;
-	spin_lock_init(&hinfo->lock);
+	hinfo->sz32 = (family == NFPROTO_IPV4) ? (1 + 1) : (4 + 4); 
+	if (hinfo->cfg.mode & (XT_HASHLIMIT_HASH_DPT | XT_HASHLIMIT_HASH_SPT))
+		hinfo->sz32++;
+	spin_lock_init(&hinfo->hlock);
 
 	hinfo->pde = proc_create_data(minfo->name, 0,
 		(family == NFPROTO_IPV4) ?
@@ -349,7 +372,7 @@ static void htable_selective_cleanup(struct xt_hashlimit_htable *ht,
 	unsigned int i;
 
 	/* lock hash table and iterate over it */
-	spin_lock_bh(&ht->lock);
+	spin_lock_bh(&ht->hlock);
 	for (i = 0; i < ht->cfg.size; i++) {
 		struct dsthash_ent *dh;
 		struct hlist_node *pos, *n;
@@ -358,7 +381,7 @@ static void htable_selective_cleanup(struct xt_hashlimit_htable *ht,
 				dsthash_free(ht, dh);
 		}
 	}
-	spin_unlock_bh(&ht->lock);
+	spin_unlock_bh(&ht->hlock);
 }
 
 /* hash table garbage collector, run by timer */
@@ -417,109 +440,54 @@ static void htable_put(struct xt_hashlimit_htable *hinfo)
 	mutex_unlock(&hashlimit_mutex);
 }
 
-/* The algorithm used is the Simple Token Bucket Filter (TBF)
- * see net/sched/sch_tbf.c in the linux source tree
- */
-
-/* Rusty: This is my (non-mathematically-inclined) understanding of
-   this algorithm.  The `average rate' in jiffies becomes your initial
-   amount of credit `credit' and the most credit you can ever have
-   `credit_cap'.  The `peak rate' becomes the cost of passing the
-   test, `cost'.
-
-   `prev' tracks the last packet hit: you gain one credit per jiffy.
-   If you get credit balance more than this, the extra credit is
-   discarded.  Every time the match passes, you lose `cost' credits;
-   if you don't have that many, the test fails.
-
-   See Alexey's formal explanation in net/sched/sch_tbf.c.
-
-   To get the maximum range, we multiply by this factor (ie. you get N
-   credits per jiffy).  We want to allow a rate as low as 1 per day
-   (slowest userspace tool allows), which means
-   CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32 ie.
-*/
-#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24))
-
-/* Repeated shift and or gives us all 1s, final shift and add 1 gives
- * us the power of 2 below the theoretical max, so GCC simply does a
- * shift. */
-#define _POW2_BELOW2(x) ((x)|((x)>>1))
-#define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2))
-#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4))
-#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8))
-#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
-#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1)
-
-#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)
-
-/* Precision saver. */
-static inline u_int32_t
-user2credits(u_int32_t user)
-{
-	/* If multiplying would overflow... */
-	if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY))
-		/* Divide first. */
-		return (user / XT_HASHLIMIT_SCALE) * HZ * CREDITS_PER_JIFFY;
-
-	return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE;
-}
-
-static inline void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now)
+static inline void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now,
+				   struct xt_hashlimit_htable *hinfo)
 {
 	dh->rateinfo.credit += (now - dh->rateinfo.prev) * CREDITS_PER_JIFFY;
-	if (dh->rateinfo.credit > dh->rateinfo.credit_cap)
-		dh->rateinfo.credit = dh->rateinfo.credit_cap;
+	if (dh->rateinfo.credit > hinfo->credit_cap)
+		dh->rateinfo.credit = hinfo->credit_cap;
 	dh->rateinfo.prev = now;
 }
 
-static inline __be32 maskl(__be32 a, unsigned int l)
+#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+static u32 *ipv6_addr_partial_copy(u32 *key, __be32 *addr,
+				   unsigned int leading_words, __be32 lastmask)
 {
-	return l ? htonl(ntohl(a) & ~0 << (32 - l)) : 0;
+	switch (leading_words) {
+	case 3: *key++ = *addr++;
+	case 2: *key++ = *addr++;
+	case 1: *key++ = *addr++;
+	}
+	*key++ = lastmask & *addr;
+	return key;
 }
 
-#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
-static void hashlimit_ipv6_mask(__be32 *i, unsigned int p)
+static const u32 *ipv6_addr_rebuild(const u32 *key, __be32 *addr, unsigned int leading_words)
 {
-	switch (p) {
-	case 0 ... 31:
-		i[0] = maskl(i[0], p);
-		i[1] = i[2] = i[3] = 0;
-		break;
-	case 32 ... 63:
-		i[1] = maskl(i[1], p - 32);
-		i[2] = i[3] = 0;
-		break;
-	case 64 ... 95:
-		i[2] = maskl(i[2], p - 64);
-		i[3] = 0;
-	case 96 ... 127:
-		i[3] = maskl(i[3], p - 96);
-		break;
-	case 128:
-		break;
+	memset(addr, 0, 16);
+
+	while (leading_words) {
+		*addr++ = *key++;
+		leading_words--;
 	}
+	*addr = *key++;
+	return key;
 }
 #endif
 
 static int
-hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,
-		   struct dsthash_dst *dst,
+hashlimit_init_key(const struct xt_hashlimit_htable *hinfo, u32 *key,
 		   const struct sk_buff *skb, unsigned int protoff)
 {
 	__be16 _ports[2], *ports;
 	u8 nexthdr;
 
-	memset(dst, 0, sizeof(*dst));
-
 	switch (hinfo->family) {
 	case NFPROTO_IPV4:
 		if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_DIP)
-			dst->ip.dst = maskl(ip_hdr(skb)->daddr,
-			              hinfo->cfg.dstmask);
+			*key++ = ip_hdr(skb)->daddr & hinfo->ndstmask;
 		if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_SIP)
-			dst->ip.src = maskl(ip_hdr(skb)->saddr,
-			              hinfo->cfg.srcmask);
+			*key++ = ip_hdr(skb)->saddr & hinfo->nsrcmask;
 
 		if (!(hinfo->cfg.mode &
 		      (XT_HASHLIMIT_HASH_DPT | XT_HASHLIMIT_HASH_SPT)))
@@ -528,16 +496,14 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,
 		break;
 #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
 	case NFPROTO_IPV6:
-		if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_DIP) {
-			memcpy(&dst->ip6.dst, &ipv6_hdr(skb)->daddr,
-			       sizeof(dst->ip6.dst));
-			hashlimit_ipv6_mask(dst->ip6.dst, hinfo->cfg.dstmask);
-		}
-		if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_SIP) {
-			memcpy(&dst->ip6.src, &ipv6_hdr(skb)->saddr,
-			       sizeof(dst->ip6.src));
-			hashlimit_ipv6_mask(dst->ip6.src, hinfo->cfg.srcmask);
-		}
+		if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_DIP)
+			key = ipv6_addr_partial_copy(key, ipv6_hdr(skb)->daddr.s6_addr32,
+						     hinfo->cfg.dstmask >> 5,
+						     hinfo->ndstmask);
+		if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_SIP)
+			key = ipv6_addr_partial_copy(key, ipv6_hdr(skb)->saddr.s6_addr32,
+						     hinfo->cfg.srcmask >> 5,
+						     hinfo->nsrcmask);
 
 		if (!(hinfo->cfg.mode &
 		      (XT_HASHLIMIT_HASH_DPT | XT_HASHLIMIT_HASH_SPT)))
@@ -563,19 +529,60 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,
 					   &_ports);
 		break;
 	default:
-		_ports[0] = _ports[1] = 0;
-		ports = _ports;
-		break;
+		return 0;
 	}
 	if (!ports)
 		return -1;
-	if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_SPT)
-		dst->src_port = ports[0];
+	*key = (hinfo->cfg.mode & XT_HASHLIMIT_HASH_SPT) ? (ports[0] << 16) : 0;
 	if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_DPT)
-		dst->dst_port = ports[1];
+		*key |= ports[1];
 	return 0;
 }
 
+/* allocate dsthash_ent, initialize dst, put in htable and lock it */
+static struct dsthash_ent *
+dsthash_alloc_init(struct xt_hashlimit_htable *ht,
+		   const u32 *key, u32 hash)
+{
+	struct dsthash_ent *ent = NULL;
+
+	spin_lock(&ht->hlock);
+
+	/* initialize hash with random val at the time we allocate
+	 * the first hashtable entry. In this case, recompute hash value.
+	 */
+	if (unlikely(!ht->rnd_initialized)) {
+		get_random_bytes(&ht->rnd, sizeof(ht->rnd));
+		ht->rnd_initialized = true;
+		hash = hash_func(ht, key);
+	}
+
+	if (ht->cfg.max && ht->count >= ht->cfg.max) {
+		/* FIXME: do something. question is what.. */
+		if (net_ratelimit())
+			pr_warning("xt_hashlimit: max count of %u reached\n",
+				   ht->cfg.max);
+	} else 
+		ent = kmem_cache_alloc(hashlimit_cachep, GFP_ATOMIC);
+	if (!ent) {
+		if (net_ratelimit())
+			pr_err("xt_hashlimit: can't allocate dsthash_ent\n");
+	} else {
+		memcpy(&ent->key, key, ht->sz32 * sizeof(u32));
+		spin_lock_init(&ent->lock);
+		ent->expires = jiffies + msecs_to_jiffies(ht->cfg.expire);
+		ent->rateinfo.prev = jiffies;
+		ent->rateinfo.credit = ht->credit_cap;
+		ent->rateinfo.cost = user2credits(ht->cfg.avg);
+		spin_lock(&ent->lock);
+
+		hlist_add_head_rcu(&ent->node, &ht->hash[hash]);
+		ht->count++;
+	}
+	spin_unlock(&ht->hlock);
+	return ent;
+}
+
 static bool
 hashlimit_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
 {
@@ -583,43 +590,36 @@ hashlimit_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
 	struct xt_hashlimit_htable *hinfo = r->hinfo;
 	unsigned long now = jiffies;
 	struct dsthash_ent *dh;
-	struct dsthash_dst dst;
+	u32 key[MAXKEYSIZE], hash;
 
-	if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0)
+	if (hashlimit_init_key(hinfo, key, skb, par->thoff) < 0)
 		goto hotdrop;
 
-	spin_lock_bh(&hinfo->lock);
-	dh = dsthash_find(hinfo, &dst);
+	hash = hash_func(hinfo, key);
+	rcu_read_lock_bh();
+	dh = dsthash_find(hinfo, key, hash);
 	if (!dh) {
-		dh = dsthash_alloc_init(hinfo, &dst);
+		dh = dsthash_alloc_init(hinfo, key, hash);
 		if (!dh) {
-			spin_unlock_bh(&hinfo->lock);
+			rcu_read_unlock_bh();
 			goto hotdrop;
 		}
-
-		dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire);
-		dh->rateinfo.prev = jiffies;
-		dh->rateinfo.credit = user2credits(hinfo->cfg.avg *
-						   hinfo->cfg.burst);
-		dh->rateinfo.credit_cap = user2credits(hinfo->cfg.avg *
-						       hinfo->cfg.burst);
-		dh->rateinfo.cost = user2credits(hinfo->cfg.avg);
 	} else {
 		/* update expiration timeout */
 		dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire);
-		rateinfo_recalc(dh, now);
+		rateinfo_recalc(dh, now, hinfo);
 	}
 
 	if (dh->rateinfo.credit >= dh->rateinfo.cost) {
 		/* We're underlimit. */
 		dh->rateinfo.credit -= dh->rateinfo.cost;
-		spin_unlock_bh(&hinfo->lock);
+		spin_unlock(&dh->lock);
+		rcu_read_unlock_bh();
 		return true;
 	}
 
-	spin_unlock_bh(&hinfo->lock);
-
-	/* default case: we're overlimit, thus don't match */
+	spin_unlock(&dh->lock);
+	rcu_read_unlock_bh();
 	return false;
 
 hotdrop:
@@ -634,45 +634,41 @@ hashlimit_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	struct xt_hashlimit_htable *hinfo = info->hinfo;
 	unsigned long now = jiffies;
 	struct dsthash_ent *dh;
-	struct dsthash_dst dst;
+	u32 key[MAXKEYSIZE], hash;
 
-	if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0)
+	if (hashlimit_init_key(hinfo, key, skb, par->thoff) < 0)
 		goto hotdrop;
 
-	spin_lock_bh(&hinfo->lock);
-	dh = dsthash_find(hinfo, &dst);
+	hash = hash_func(hinfo, key);
+	rcu_read_lock_bh();
+	dh = dsthash_find(hinfo, key, hash);
 	if (dh == NULL) {
-		dh = dsthash_alloc_init(hinfo, &dst);
+		dh = dsthash_alloc_init(hinfo, key, hash);
 		if (dh == NULL) {
-			spin_unlock_bh(&hinfo->lock);
+			rcu_read_unlock_bh();
 			goto hotdrop;
 		}
 
-		dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire);
-		dh->rateinfo.prev = jiffies;
-		dh->rateinfo.credit = user2credits(hinfo->cfg.avg *
-		                      hinfo->cfg.burst);
-		dh->rateinfo.credit_cap = user2credits(hinfo->cfg.avg *
-		                          hinfo->cfg.burst);
-		dh->rateinfo.cost = user2credits(hinfo->cfg.avg);
 	} else {
 		/* update expiration timeout */
 		dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire);
-		rateinfo_recalc(dh, now);
+		rateinfo_recalc(dh, now, hinfo);
 	}
 
 	if (dh->rateinfo.credit >= dh->rateinfo.cost) {
 		/* below the limit */
 		dh->rateinfo.credit -= dh->rateinfo.cost;
-		spin_unlock_bh(&hinfo->lock);
+		spin_unlock(&dh->lock);
+		rcu_read_unlock_bh();
 		return !(info->cfg.mode & XT_HASHLIMIT_INVERT);
 	}
 
-	spin_unlock_bh(&hinfo->lock);
+	spin_unlock(&dh->lock);
+	rcu_read_unlock_bh();
 	/* default match is underlimit - so over the limit, we need to invert */
 	return info->cfg.mode & XT_HASHLIMIT_INVERT;
 
- hotdrop:
+hotdrop:
 	*par->hotdrop = true;
 	return false;
 }
@@ -685,7 +681,7 @@ static bool hashlimit_mt_check_v0(const struct xt_mtchk_param *par)
 	/* Check for overflow. */
 	if (r->cfg.burst == 0 ||
 	    user2credits(r->cfg.avg * r->cfg.burst) < user2credits(r->cfg.avg)) {
-		printk(KERN_ERR "xt_hashlimit: overflow, try lower: %u/%u\n",
+		pr_err("xt_hashlimit: overflow, try lower: %u/%u\n",
 		       r->cfg.avg, r->cfg.burst);
 		return false;
 	}
@@ -722,7 +718,7 @@ static bool hashlimit_mt_check(const struct xt_mtchk_param *par)
 	if (info->cfg.burst == 0 ||
 	    user2credits(info->cfg.avg * info->cfg.burst) <
 	    user2credits(info->cfg.avg)) {
-		printk(KERN_ERR "xt_hashlimit: overflow, try lower: %u/%u\n",
+		pr_err("xt_hashlimit: overflow, try lower: %u/%u\n",
 		       info->cfg.avg, info->cfg.burst);
 		return false;
 	}
@@ -843,16 +839,17 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
 
 /* PROC stuff */
 static void *dl_seq_start(struct seq_file *s, loff_t *pos)
-	__acquires(htable->lock)
+	__acquires(htable->hlock)
 {
 	struct xt_hashlimit_htable *htable = s->private;
-	unsigned int *bucket;
+	unsigned int *bucket = kmalloc(sizeof(unsigned int), GFP_KERNEL);
 
-	spin_lock_bh(&htable->lock);
-	if (*pos >= htable->cfg.size)
-		return NULL;
+	spin_lock_bh(&htable->hlock);
 
-	bucket = kmalloc(sizeof(unsigned int), GFP_ATOMIC);
+	if (*pos >= htable->cfg.size) {
+		kfree(bucket);
+		return NULL;
+	}
 	if (!bucket)
 		return ERR_PTR(-ENOMEM);
 
@@ -874,46 +871,68 @@ static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos)
 }
 
 static void dl_seq_stop(struct seq_file *s, void *v)
-	__releases(htable->lock)
+	__releases(htable->hlock)
 {
 	struct xt_hashlimit_htable *htable = s->private;
 	unsigned int *bucket = (unsigned int *)v;
 
-	kfree(bucket);
-	spin_unlock_bh(&htable->lock);
+	if (!IS_ERR(bucket))
+		kfree(bucket);
+	spin_unlock_bh(&htable->hlock);
 }
 
 static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
-				   struct seq_file *s)
+				   struct seq_file *s, struct xt_hashlimit_htable *htable)
 {
+	int res = 0;
+	__be32 ports;
+	const u32 *key = ent->key;
+
+	spin_lock(&ent->lock);
 	/* recalculate to show accurate numbers */
-	rateinfo_recalc(ent, jiffies);
+	rateinfo_recalc(ent, jiffies, htable);
 
 	switch (family) {
 	case NFPROTO_IPV4:
-		return seq_printf(s, "%ld %pI4:%u->%pI4:%u %u %u %u\n",
+		{
+		__be32 src = (htable->cfg.mode & XT_HASHLIMIT_HASH_DIP) ? *key++ : 0;
+		__be32 dst = (htable->cfg.mode & XT_HASHLIMIT_HASH_SIP) ? *key++ : 0;
+
+		ports = (htable->cfg.mode & (XT_HASHLIMIT_HASH_DPT | XT_HASHLIMIT_HASH_SPT)) ?
+			*key : 0;
+		res = seq_printf(s, "%ld %pI4:%u->%pI4:%u %u %u %u\n",
 				 (long)(ent->expires - jiffies)/HZ,
-				 &ent->dst.ip.src,
-				 ntohs(ent->dst.src_port),
-				 &ent->dst.ip.dst,
-				 ntohs(ent->dst.dst_port),
-				 ent->rateinfo.credit, ent->rateinfo.credit_cap,
+				 &src, ntohs(ports >> 16),
+				 &dst, ntohs(ports & 0xFFFF),
+				 ent->rateinfo.credit, htable->credit_cap,
 				 ent->rateinfo.cost);
+		break;
+		}
 #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
 	case NFPROTO_IPV6:
-		return seq_printf(s, "%ld %pI6:%u->%pI6:%u %u %u %u\n",
+		{
+		struct in6_addr src;
+		struct in6_addr dst;
+		if (htable->cfg.mode & XT_HASHLIMIT_HASH_DIP)
+			key = ipv6_addr_rebuild(key, src.s6_addr32, htable->cfg.dstmask >> 5);
+		if (htable->cfg.mode & XT_HASHLIMIT_HASH_SIP)
+			key = ipv6_addr_rebuild(key, dst.s6_addr32, htable->cfg.srcmask >> 5);
+		ports = (htable->cfg.mode & (XT_HASHLIMIT_HASH_DPT | XT_HASHLIMIT_HASH_SPT)) ?
+			*key : 0;
+		res = seq_printf(s, "%ld %pI6:%u->%pI6:%u %u %u %u\n",
 				 (long)(ent->expires - jiffies)/HZ,
-				 &ent->dst.ip6.src,
-				 ntohs(ent->dst.src_port),
-				 &ent->dst.ip6.dst,
-				 ntohs(ent->dst.dst_port),
-				 ent->rateinfo.credit, ent->rateinfo.credit_cap,
+				 &src, ntohs(ports >> 16),
+				 &dst, ntohs(ports & 0xFFFF),
+				 ent->rateinfo.credit, htable->credit_cap,
 				 ent->rateinfo.cost);
+		break;
+		}
 #endif
 	default:
 		BUG();
-		return 0;
 	}
+	spin_unlock(&ent->lock);
+	return res;
 }
 
 static int dl_seq_show(struct seq_file *s, void *v)
@@ -925,7 +944,7 @@ static int dl_seq_show(struct seq_file *s, void *v)
 
 	if (!hlist_empty(&htable->hash[*bucket])) {
 		hlist_for_each_entry(ent, pos, &htable->hash[*bucket], node)
-			if (dl_seq_real_show(ent, htable->family, s))
+			if (dl_seq_real_show(ent, htable->family, s, htable))
 				return -1;
 	}
 	return 0;
@@ -1022,7 +1041,7 @@ static int __init hashlimit_mt_init(void)
 					    sizeof(struct dsthash_ent), 0, 0,
 					    NULL);
 	if (!hashlimit_cachep) {
-		printk(KERN_ERR "xt_hashlimit: unable to create slab cache\n");
+		pr_err("xt_hashlimit: unable to create slab cache\n");
 		goto err2;
 	}
 	return 0;
@@ -1037,9 +1056,9 @@ err1:
 
 static void __exit hashlimit_mt_exit(void)
 {
-	kmem_cache_destroy(hashlimit_cachep);
 	xt_unregister_matches(hashlimit_mt_reg, ARRAY_SIZE(hashlimit_mt_reg));
 	unregister_pernet_subsys(&hashlimit_net_ops);
+	kmem_cache_destroy(hashlimit_cachep);
 }
 
 module_init(hashlimit_mt_init);


--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html