On Thursday, 16 December 2010 at 16:02 +0100, Eric Dumazet wrote:
> On Thursday, 16 December 2010 at 15:29 +0100, Eric Dumazet wrote:
> > On Thursday, 16 December 2010 at 15:24 +0100, Jesper Dangaard Brouer
> > wrote:
> >
> > > In my case I think this will not help. I'll kill the cache anyway, as
> > > the ruleset is 19MB and my CPU cache is 8MB.
> >
> > Yep ;)
> >
> > By the way, you speak of a 'possible regression', but we always masked
> > BH while doing get_counters().
> >
> > Only very recent kernels mask them for each unit (cpu) of work.
> >
> > There was an attempt to use a lockless read for each counter (using a
> > seqlock), but it was not completed. I guess we could do something to
> > resurrect this idea.
>
> Something like the following patch :

Here is a tested version : no need for the (buggy in the previous patch)
memset() if we use vzalloc().

Note : We miss a this_cpu_write_seqcount_begin() interface. I'll bug lkml
to get it asap.

Thanks

[PATCH net-next-2.6] netfilter: ip_tables: don't block BH while reading counters

Using "iptables -L" with a lot of rules can cause a very long BH latency.
Jesper mentioned ~6 ms and worried about frame drops.

Switch to a per_cpu seqcount scheme, so that taking a snapshot of the
counters doesn't need to block BH (neither on this cpu nor on other cpus).

This slows down each counter update a bit, using two extra increments.

Note : We miss a this_cpu_write_seqcount_begin() interface, so we are forced
to compute the address of our per_cpu seqcount to call write_seqcount_begin().
Once available, the overhead will be exactly two "incl %gs:counters_seq"
instructions on x86.

Reported-by: Jesper Dangaard Brouer <hawk@xxxxxxx>
Signed-off-by: Eric Dumazet <eric.dumazet@xxxxxxxxx>
---
 net/ipv4/netfilter/ip_tables.c |   52 ++++++++++++------------------
 1 file changed, 21 insertions(+), 31 deletions(-)

diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index a846d63..ae18ead 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -293,6 +293,8 @@ struct ipt_entry *ipt_next_entry(const struct ipt_entry *entry)
 	return (void *)entry + entry->next_offset;
 }
 
+static DEFINE_PER_CPU(seqcount_t, counters_seq);
+
 /* Returns one of the generic firewall policies, like NF_ACCEPT. */
 unsigned int
 ipt_do_table(struct sk_buff *skb,
@@ -311,6 +313,7 @@ ipt_do_table(struct sk_buff *skb,
 	unsigned int *stackptr, origptr, cpu;
 	const struct xt_table_info *private;
 	struct xt_action_param acpar;
+	seqcount_t *seq;
 
 	/* Initialization */
 	ip = ip_hdr(skb);
@@ -364,7 +367,11 @@ ipt_do_table(struct sk_buff *skb,
 			goto no_match;
 		}
 
+		seq = &__get_cpu_var(counters_seq);
+		/* could be faster if we had this_cpu_write_seqcount_begin() */
+		write_seqcount_begin(seq);
 		ADD_COUNTER(e->counters, skb->len, 1);
+		write_seqcount_end(seq);
 
 		t = ipt_get_target(e);
 		IP_NF_ASSERT(t->u.kernel.target);
@@ -884,42 +891,25 @@ get_counters(const struct xt_table_info *t,
 	struct ipt_entry *iter;
 	unsigned int cpu;
 	unsigned int i;
-	unsigned int curcpu = get_cpu();
-
-	/* Instead of clearing (by a previous call to memset())
-	 * the counters and using adds, we set the counters
-	 * with data used by 'current' CPU.
-	 *
-	 * Bottom half has to be disabled to prevent deadlock
-	 * if new softirq were to run and call ipt_do_table
-	 */
-	local_bh_disable();
-	i = 0;
-	xt_entry_foreach(iter, t->entries[curcpu], t->size) {
-		SET_COUNTER(counters[i], iter->counters.bcnt,
-			    iter->counters.pcnt);
-		++i;
-	}
-	local_bh_enable();
-	/* Processing counters from other cpus, we can let bottom half enabled,
-	 * (preemption is disabled)
-	 */
 
 	for_each_possible_cpu(cpu) {
-		if (cpu == curcpu)
-			continue;
+		seqcount_t *seq = &per_cpu(counters_seq, cpu);
+
 		i = 0;
-		local_bh_disable();
-		xt_info_wrlock(cpu);
 		xt_entry_foreach(iter, t->entries[cpu], t->size) {
-			ADD_COUNTER(counters[i], iter->counters.bcnt,
-				    iter->counters.pcnt);
+			u64 bcnt, pcnt;
+			unsigned int start;
+
+			do {
+				start = read_seqcount_begin(seq);
+				bcnt = iter->counters.bcnt;
+				pcnt = iter->counters.pcnt;
+			} while (read_seqcount_retry(seq, start));
+
+			ADD_COUNTER(counters[i], bcnt, pcnt);
 			++i; /* macro does multi eval of i */
 		}
-		xt_info_wrunlock(cpu);
-		local_bh_enable();
 	}
-	put_cpu();
 }
 
 static struct xt_counters *alloc_counters(const struct xt_table *table)
@@ -932,7 +922,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
 	   (other than comefrom, which userspace doesn't care about). */
 	countersize = sizeof(struct xt_counters) * private->number;
-	counters = vmalloc(countersize);
+	counters = vzalloc(countersize);
 	if (counters == NULL)
 		return ERR_PTR(-ENOMEM);
@@ -1203,7 +1193,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 	struct ipt_entry *iter;
 
 	ret = 0;
-	counters = vmalloc(num_counters * sizeof(struct xt_counters));
+	counters = vzalloc(num_counters * sizeof(struct xt_counters));
 	if (!counters) {
 		ret = -ENOMEM;
 		goto out;
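
For readers who want to see the seqcount retry logic in isolation, here is a
minimal, self-contained user-space sketch of the same write/read pattern. It
is only an illustration of the technique, not the kernel API: the struct, the
function names and the use of the GCC __sync_synchronize() barrier are
assumptions made for this example (the kernel uses seqcount_t and per-cpu
accessors instead).

/* demo_seqcount.c - illustrative sketch only, not the kernel implementation */
#include <stdint.h>
#include <stdio.h>

struct demo_counter {
	unsigned int seq;	/* even: stable, odd: writer in progress */
	uint64_t bcnt;		/* byte count (64 bits, can be torn on 32-bit) */
	uint64_t pcnt;		/* packet count */
};

/* Writer side: the analogue of write_seqcount_begin()/end() around
 * ADD_COUNTER(): two increments plus barriers per update, no lock. */
static void demo_counter_add(struct demo_counter *c, uint64_t bytes)
{
	c->seq++;			/* begin: make seq odd */
	__sync_synchronize();		/* order seq update before data update */
	c->bcnt += bytes;
	c->pcnt += 1;
	__sync_synchronize();		/* order data update before seq update */
	c->seq++;			/* end: make seq even again */
}

/* Reader side: the analogue of the read_seqcount_begin()/retry() loop in
 * get_counters(): retry until a stable, even sequence brackets both reads. */
static void demo_counter_read(const struct demo_counter *c,
			      uint64_t *bcnt, uint64_t *pcnt)
{
	unsigned int start;

	do {
		start = c->seq;
		__sync_synchronize();
		*bcnt = c->bcnt;
		*pcnt = c->pcnt;
		__sync_synchronize();
	} while ((start & 1) || start != c->seq);
}

int main(void)
{
	struct demo_counter c = { 0, 0, 0 };
	uint64_t b, p;

	demo_counter_add(&c, 1500);	/* pretend one 1500-byte packet matched */
	demo_counter_read(&c, &b, &p);
	printf("bytes=%llu packets=%llu\n",
	       (unsigned long long)b, (unsigned long long)p);
	return 0;
}

The point of the scheme shows up here: the writer pays only two increments
(no lock, no BH masking), while a reader on any cpu can assemble a consistent
64-bit snapshot and simply retries in the unlikely case an update raced with
it. Real concurrent code would also need READ_ONCE()-style accesses on seq
and the counters; the sketch leaves that out for brevity.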