Stephen Hemminger wrote:
> Make netfilter tables that use x_tables (iptables, ip6tables, arptables)
> operate without locking on the receive path.
> Replace the existing reader/writer lock with Read-Copy-Update to
> eliminate the overhead of a read lock on each incoming packet.
> This should reduce the overhead of iptables, especially on SMP
> systems.
>
> The previous code used a reader-writer lock for two purposes.
> The first was to ensure that the xt_table_info reference was not in
> the process of being changed. Since xt_table_info is only freed via one
> routine, it was a direct conversion to RCU.
>
> The other use of the reader-writer lock was to block changes
> to counters while they were being read. This is handled by swapping in
> a new set of counter values and then summing the old ones. The sum
> is then restored back on a single cpu.
>
> Signed-off-by: Stephen Hemminger <shemminger@xxxxxxxxxx>
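
Before commenting on the details, let me restate the counter scheme to check
I read it right: a snapshot swaps fresh zeroed counter sets in, sums the
retired ones, and folds the total back onto one CPU so the next snapshot
still sees every packet counted so far. A small user-space model of that
idea (all names and sizes are illustrative, this is not the kernel code, and
the mutex/RCU grace period is elided):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NCPUS    4
#define NENTRIES 2

struct counter { unsigned long pcnt, bcnt; };

static struct counter *percpu[NCPUS];	/* one counter set per CPU */

static void snapshot(struct counter out[NENTRIES])
{
	struct counter *old[NCPUS];
	int cpu, i;

	/* 1. Swap in fresh zeroed sets; new packets hit these.
	 * (The patch does this under table->lock, then waits with
	 * synchronize_net() so no CPU still writes to old[].) */
	for (cpu = 0; cpu < NCPUS; cpu++) {
		old[cpu] = percpu[cpu];
		percpu[cpu] = calloc(NENTRIES, sizeof(struct counter));
	}

	/* 2. Sum the retired sets into the snapshot. */
	memset(out, 0, NENTRIES * sizeof(struct counter));
	for (cpu = 0; cpu < NCPUS; cpu++)
		for (i = 0; i < NENTRIES; i++) {
			out[i].pcnt += old[cpu][i].pcnt;
			out[i].bcnt += old[cpu][i].bcnt;
		}

	/* 3. Restore the totals onto a single CPU so nothing is lost. */
	for (i = 0; i < NENTRIES; i++) {
		percpu[0][i].pcnt += out[i].pcnt;
		percpu[0][i].bcnt += out[i].bcnt;
	}

	for (cpu = 0; cpu < NCPUS; cpu++)
		free(old[cpu]);
}

int main(void)
{
	struct counter snap[NENTRIES];
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++)
		percpu[cpu] = calloc(NENTRIES, sizeof(struct counter));

	percpu[1][0].pcnt = 3;	/* pretend CPU 1 matched entry 0 three times */
	percpu[2][0].pcnt = 2;	/* and CPU 2 twice */

	snapshot(snap);
	printf("entry 0: %lu packets\n", snap[0].pcnt);	/* prints 5 */
	return 0;
}

With that model in mind, my comments are inline below.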
>
> ---
>  include/linux/netfilter/x_tables.h |   13 ++++
>  net/ipv4/netfilter/arp_tables.c    |   92 ++++++++++++++++++++++++-----------
>  net/ipv4/netfilter/ip_tables.c     |   97 ++++++++++++++++++++++++-------------
>  net/ipv6/netfilter/ip6_tables.c    |   97 +++++++++++++++++++++++--------------
>  net/netfilter/x_tables.c           |   67 +++++++++++++++++++++----
>  5 files changed, 258 insertions(+), 108 deletions(-)
>
> --- a/include/linux/netfilter/x_tables.h	2009-02-02 15:06:39.893751845 -0800
> +++ b/include/linux/netfilter/x_tables.h	2009-02-03 15:44:21.743663216 -0800
> @@ -353,7 +353,7 @@ struct xt_table
>  	unsigned int valid_hooks;
>
>  	/* Lock for the curtain */
> -	rwlock_t lock;
> +	struct mutex lock;
>
>  	/* Man behind the curtain... */
>  	struct xt_table_info *private;
> @@ -383,9 +383,15 @@ struct xt_table_info
>  	unsigned int hook_entry[NF_INET_NUMHOOKS];
>  	unsigned int underflow[NF_INET_NUMHOOKS];
>
> +	/* For the dustman... */
> +	union {
> +		struct rcu_head rcu;
> +		struct work_struct work;
> +	};
> +
>  	/* ipt_entry tables: one per CPU */
>  	/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
> -	char *entries[1];
> +	void *entries[1];
>  };
>
>  #define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \
> @@ -432,6 +438,9 @@ extern void xt_proto_fini(struct net *ne
>
>  extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
>  extern void xt_free_table_info(struct xt_table_info *info);
> +extern void xt_zero_table_entries(struct xt_table_info *info);
> +extern void xt_swap_table_entries(struct xt_table_info *old,
> +				  struct xt_table_info *new);
>
>  #ifdef CONFIG_COMPAT
>  #include <net/compat.h>
> --- a/net/ipv4/netfilter/ip_tables.c	2009-02-02 15:06:29.684249364 -0800
> +++ b/net/ipv4/netfilter/ip_tables.c	2009-02-03 15:52:32.047583686 -0800
> @@ -347,10 +347,12 @@ ipt_do_table(struct sk_buff *skb,
>  	mtpar.family = tgpar.family = NFPROTO_IPV4;
>  	tgpar.hooknum = hook;
>
> -	read_lock_bh(&table->lock);
>  	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
> -	private = table->private;
> -	table_base = (void *)private->entries[smp_processor_id()];
> +
> +	rcu_read_lock_bh();
> +	private = rcu_dereference(table->private);
> +	table_base = rcu_dereference(private->entries[smp_processor_id()]);
> +
>  	e = get_entry(table_base, private->hook_entry[hook]);
>
>  	/* For return from builtin chain */
> @@ -445,7 +447,7 @@ ipt_do_table(struct sk_buff *skb,
>  		}
>  	} while (!hotdrop);
>
> -	read_unlock_bh(&table->lock);
> +	rcu_read_unlock_bh();
>
>  #ifdef DEBUG_ALLOW_ALL
>  	return NF_ACCEPT;
> @@ -924,13 +926,45 @@ get_counters(const struct xt_table_info
>  			  counters,
>  			  &i);
>  	}
> +
> +}
> +
> +/* We're lazy, and add to the first CPU; overflow works its fey magic
> + * and everything is OK. */
> +static int
> +add_counter_to_entry(struct ipt_entry *e,
> +		     const struct xt_counters addme[],
> +		     unsigned int *i)
> +{
> +	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> +
> +	(*i)++;
> +	return 0;
> +}
> +
> +/* Take values from counters and add them back onto the current cpu */
> +static void put_counters(struct xt_table_info *t,
> +			 const struct xt_counters counters[])
> +{
> +	unsigned int i, cpu;
> +
> +	local_bh_disable();
> +	cpu = smp_processor_id();
> +	i = 0;
> +	IPT_ENTRY_ITERATE(t->entries[cpu],
> +			  t->size,
> +			  add_counter_to_entry,
> +			  counters,
> +			  &i);
> +	local_bh_enable();
>  }
>
>  static struct xt_counters * alloc_counters(struct xt_table *table)
>  {
>  	unsigned int countersize;
>  	struct xt_counters *counters;
> -	const struct xt_table_info *private = table->private;
> +	struct xt_table_info *private = table->private;
> +	struct xt_table_info *tmp;
>
>  	/* We need atomic snapshot of counters: rest doesn't change
>  	   (other than comefrom, which userspace doesn't care
> @@ -939,14 +973,30 @@ static struct xt_counters * alloc_counte
>  	counters = vmalloc_node(countersize, numa_node_id());
>
>  	if (counters == NULL)
> -		return ERR_PTR(-ENOMEM);
> +		goto nomem;
> +
> +	tmp = xt_alloc_table_info(private->size);
> +	if (!tmp)
> +		goto free_counters;
> +
> +	xt_zero_table_entries(tmp);

This is not correct: we must copy the rules into tmp and zero the counters on
that copy. After the swap below, the areas you just zeroed become the live
rule set, so they cannot simply be memset() to zero.
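
Something along these lines instead, I think. Completely untested sketch, and
copy_table_entries() is a name I am inventing here; the counter zeroing it
still needs is sketched in the P.S. at the end of this mail:

/* Duplicate each CPU's rule blob into the scratch table. After the
 * swap these copies become the live rule set, so they must be real
 * rules; only the counters embedded in them should be cleared. */
static void copy_table_entries(struct xt_table_info *dst,
			       const struct xt_table_info *src)
{
	unsigned int cpu;

	for_each_possible_cpu(cpu)
		memcpy(dst->entries[cpu], src->entries[cpu], src->size);
}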
> +
> +	mutex_lock(&table->lock);
> +	xt_swap_table_entries(private, tmp);
> +	synchronize_net();	/* Wait until smoke has cleared */
>
> -	/* First, sum counters... */
> -	write_lock_bh(&table->lock);
> -	get_counters(private, counters);
> -	write_unlock_bh(&table->lock);
> +	get_counters(tmp, counters);

Yes: after the swap, tmp holds the previous entry pointers, so this sums the
old counters.

> +	put_counters(private, counters);
> +	mutex_unlock(&table->lock);
> +
> +	xt_free_table_info(tmp);
>
>  	return counters;
> +
> + free_counters:
> +	vfree(counters);
> + nomem:
> +	return ERR_PTR(-ENOMEM);
>  }
>
>  static int
> @@ -1312,27 +1362,6 @@ do_replace(struct net *net, void __user
>  	return ret;
>  }
>
> -/* We're lazy, and add to the first CPU; overflow works its fey magic
> - * and everything is OK. */
> -static int
> -add_counter_to_entry(struct ipt_entry *e,
> -		     const struct xt_counters addme[],
> -		     unsigned int *i)
> -{
> -#if 0
> -	duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n",
> -		 *i,
> -		 (long unsigned int)e->counters.pcnt,
> -		 (long unsigned int)e->counters.bcnt,
> -		 (long unsigned int)addme[*i].pcnt,
> -		 (long unsigned int)addme[*i].bcnt);
> -#endif
> -
> -	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> -
> -	(*i)++;
> -	return 0;
> -}
>
>  static int
>  do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
> @@ -1393,13 +1422,14 @@ do_add_counters(struct net *net, void __
>  		goto free;
>  	}
>
> -	write_lock_bh(&t->lock);
> +	mutex_lock(&t->lock);
>  	private = t->private;
>  	if (private->number != num_counters) {
>  		ret = -EINVAL;
>  		goto unlock_up_free;
>  	}
>
> +	preempt_disable();
>  	i = 0;
>  	/* Choose the copy that is on our node */
>  	loc_cpu_entry = private->entries[raw_smp_processor_id()];
> @@ -1408,8 +1438,9 @@ do_add_counters(struct net *net, void __
>  			  add_counter_to_entry,
>  			  paddc,
>  			  &i);
> +	preempt_enable();
>  unlock_up_free:
> -	write_unlock_bh(&t->lock);
> +	mutex_unlock(&t->lock);
>  	xt_table_unlock(t);
>  	module_put(t->me);
>  free:
> --- a/net/netfilter/x_tables.c	2009-02-02 15:06:29.708249745 -0800
> +++ b/net/netfilter/x_tables.c	2009-02-03 15:44:21.743663216 -0800
> @@ -611,18 +611,61 @@ struct xt_table_info *xt_alloc_table_inf
>  }
>  EXPORT_SYMBOL(xt_alloc_table_info);
>
> -void xt_free_table_info(struct xt_table_info *info)
> +/* Zero out entries */
> +void xt_zero_table_entries(struct xt_table_info *info)
>  {
> -	int cpu;
> +	unsigned int cpu;
> +
> +	for_each_possible_cpu(cpu)
> +		memset(info->entries[cpu], 0, info->size);
> +}
> +EXPORT_SYMBOL_GPL(xt_zero_table_entries);

Hmm, you forgot that entries[cpu] points to a quite complex data set: the
iptables rules themselves, which contain the counters. Only the counters must
be zeroed, one by one, so you need an ITERATE invocation here (rough shape in
the P.S. below).

I won't be able to make the incremental patch (too busy @work at the moment),
I am sorry :(
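
P.S. Untested, and both names below are invented, but the counter-only
zeroing I mean is roughly this (it would have to live in ip_tables.c, since
the iterator is per-protocol; arp_tables and ip6_tables would need their own
copies):

/* Clear just the counters embedded in one rule, leaving the rule
 * data itself intact. */
static inline int
zero_entry_counter(struct ipt_entry *e, void *arg)
{
	e->counters.pcnt = 0;
	e->counters.bcnt = 0;
	return 0;
}

/* Walk every rule on every CPU and zero its counters. */
static void zero_table_counters(struct xt_table_info *info)
{
	unsigned int cpu;

	for_each_possible_cpu(cpu)
		IPT_ENTRY_ITERATE(info->entries[cpu], info->size,
				  zero_entry_counter, NULL);
}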