On Tue, Feb 10, 2009 at 9:52 AM, Stephen Hemminger <shemminger@xxxxxxxxxx> wrote:
> The reader/writer lock in ip_tables is acquired in the critical path of
> processing packets and is one of the reasons just loading iptables can cause
> a 20% performance loss. The rwlock serves two functions:
>
> 1) it prevents changes to table state (xt_replace) while table is in use.
> This is now handled by doing rcu on the xt_table. When table is
> replaced, the new table(s) are put in and the old table(s) are freed
> after RCU period.
>
> 2) it provides synchronization when accessing the counter values.
> This is now handled by swapping in new table_info entries for each cpu
> then summing the old values, and putting the result back onto one
> cpu. On a busy system it may cause sampling to occur at different
> times on each cpu, but no packet/byte counts are lost in the process.
>
> Signed-off-by: Stephen Hemminger <shemminger@xxxxxxxxxx>
>
> ---
> include/linux/netfilter/x_tables.h | 6 +
> net/ipv4/netfilter/arp_tables.c | 114 ++++++++++++++++++++++++++---------
> net/ipv4/netfilter/ip_tables.c | 120 ++++++++++++++++++++++++++-----------
> net/ipv6/netfilter/ip6_tables.c | 119 +++++++++++++++++++++++++-----------
> net/netfilter/x_tables.c | 26 ++++++--
> 5 files changed, 283 insertions(+), 102 deletions(-)
>
> --- a/include/linux/netfilter/x_tables.h 2009-02-09 08:31:47.955781543 -0800
> +++ b/include/linux/netfilter/x_tables.h 2009-02-09 08:32:41.202805664 -0800
> @@ -353,7 +353,7 @@ struct xt_table
> unsigned int valid_hooks;
>
> /* Lock for the curtain */
> - rwlock_t lock;
> + struct mutex lock;
>
> /* Man behind the curtain... */
> struct xt_table_info *private;
> @@ -385,7 +385,7 @@ struct xt_table_info
>
> /* ipt_entry tables: one per CPU */
> /* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
> - char *entries[1];
> + void *entries[1];
> };
>
> #define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \
> @@ -432,6 +432,8 @@ extern void xt_proto_fini(struct net *ne
>
> extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
> extern void xt_free_table_info(struct xt_table_info *info);
> +extern void xt_table_entry_swap_rcu(struct xt_table_info *old,
> + struct xt_table_info *new);
>
> #ifdef CONFIG_COMPAT
> #include <net/compat.h>
> --- a/net/ipv4/netfilter/ip_tables.c 2009-02-09 08:30:58.606781650 -0800
> +++ b/net/ipv4/netfilter/ip_tables.c 2009-02-09 09:00:59.651532539 -0800
> @@ -347,10 +347,12 @@ ipt_do_table(struct sk_buff *skb,
> mtpar.family = tgpar.family = NFPROTO_IPV4;
> tgpar.hooknum = hook;
>
> - read_lock_bh(&table->lock);
> IP_NF_ASSERT(table->valid_hooks & (1 << hook));
> - private = table->private;
> - table_base = (void *)private->entries[smp_processor_id()];
> +
> + rcu_read_lock();

Any reason why we don't disable bh here? Is it because the per-cpu counter
entries are not touched by the receive path? It seems inconsistent with the
arp/ip6 _do_table routines below, which use the bh-disabling variant.
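
To make the question concrete, here is roughly what I would have expected,
mirroring the arp/ip6 versions further down. This is only an untested sketch
of the alternative, not a claim that it is required; the point is that
rcu_read_lock_bh() would keep softirq packet processing off this cpu while
its per-cpu entries are being walked:

    /* Sketch: take the RCU read lock with bottom halves disabled, as
     * arpt_do_table()/ip6t_do_table() do, so softirq traffic on this
     * cpu cannot touch the same per-cpu entries/counters meanwhile. */
    rcu_read_lock_bh();
    private = rcu_dereference(table->private);
    table_base = rcu_dereference(private->entries[smp_processor_id()]);
    ...
    rcu_read_unlock_bh();
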
> + private = rcu_dereference(table->private);
> + table_base = rcu_dereference(private->entries[smp_processor_id()]);
> +
> e = get_entry(table_base, private->hook_entry[hook]);
>
> /* For return from builtin chain */
> @@ -445,7 +447,7 @@ ipt_do_table(struct sk_buff *skb,
> }
> } while (!hotdrop);
>
> - read_unlock_bh(&table->lock);
> + rcu_read_unlock();
>
> #ifdef DEBUG_ALLOW_ALL
> return NF_ACCEPT;
> @@ -924,13 +926,68 @@ get_counters(const struct xt_table_info
> counters,
> &i);
> }
> +
> +}
> +
> +/* We're lazy, and add to the first CPU; overflow works its fey magic
> + * and everything is OK. */
> +static int
> +add_counter_to_entry(struct ipt_entry *e,
> + const struct xt_counters addme[],
> + unsigned int *i)
> +{
> + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> +
> + (*i)++;
> + return 0;
> +}
> +
> +/* Take values from counters and add them back onto the current cpu */
> +static void put_counters(struct xt_table_info *t,
> + const struct xt_counters counters[])
> +{
> + unsigned int i, cpu;
> +
> + local_bh_disable();
> + cpu = smp_processor_id();
> + i = 0;
> + IPT_ENTRY_ITERATE(t->entries[cpu],
> + t->size,
> + add_counter_to_entry,
> + counters,
> + &i);
> + local_bh_enable();
> +}
> +
> +
> +static inline int
> +zero_entry_counter(struct ipt_entry *e, void *arg)
> +{
> + e->counters.bcnt = 0;
> + e->counters.pcnt = 0;
> + return 0;
> +}
> +
> +static void
> +clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
> +{

This should probably be named clone_entries, since it copies the entries and
only zeroes the counters?

> + unsigned int cpu;
> + const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
> +
> + memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
> + for_each_possible_cpu(cpu) {
> + memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
> + IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
> + zero_entry_counter, NULL);
> + }
> }
>
> static struct xt_counters * alloc_counters(struct xt_table *table)
> {
> unsigned int countersize;
> struct xt_counters *counters;
> - const struct xt_table_info *private = table->private;
> + struct xt_table_info *private = table->private;
> + struct xt_table_info *info;
>
> /* We need atomic snapshot of counters: rest doesn't change
> (other than comefrom, which userspace doesn't care
> @@ -939,14 +996,30 @@ static struct xt_counters * alloc_counte
> counters = vmalloc_node(countersize, numa_node_id());
>
> if (counters == NULL)
> - return ERR_PTR(-ENOMEM);
> + goto nomem;
> +
> + info = xt_alloc_table_info(private->size);
> + if (!info)
> + goto free_counters;
> +
> + clone_counters(info, private);
>
> - /* First, sum counters... */
> - write_lock_bh(&table->lock);
> - get_counters(private, counters);
> - write_unlock_bh(&table->lock);
> + mutex_lock(&table->lock);
> + xt_table_entry_swap_rcu(private, info);
> + synchronize_net(); /* Wait until smoke has cleared */
> +
> + get_counters(info, counters);
> + put_counters(private, counters);
> + mutex_unlock(&table->lock);
> +
> + xt_free_table_info(info);
>
> return counters;
> +
> + free_counters:
> + vfree(counters);
> + nomem:
> + return ERR_PTR(-ENOMEM);
> }
>
> static int
> @@ -1312,27 +1385,6 @@ do_replace(struct net *net, void __user
> return ret;
> }
>
> -/* We're lazy, and add to the first CPU; overflow works its fey magic
> - * and everything is OK. */
> -static int
> -add_counter_to_entry(struct ipt_entry *e,
> - const struct xt_counters addme[],
> - unsigned int *i)
> -{
> -#if 0
> - duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n",
> - *i,
> - (long unsigned int)e->counters.pcnt,
> - (long unsigned int)e->counters.bcnt,
> - (long unsigned int)addme[*i].pcnt,
> - (long unsigned int)addme[*i].bcnt);
> -#endif
> -
> - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> -
> - (*i)++;
> - return 0;
> -}
>
> static int
> do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
> @@ -1393,13 +1445,14 @@ do_add_counters(struct net *net, void __
> goto free;
> }
>
> - write_lock_bh(&t->lock);
> + mutex_lock(&t->lock);
> private = t->private;
> if (private->number != num_counters) {
> ret = -EINVAL;
> goto unlock_up_free;
> }
>
> + preempt_disable();

Should this be local_bh_disable(), since the counters could also be touched
from the packet path on this cpu?
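
Something like the following is what I have in mind -- an untested sketch of
the suggestion only, reusing the names already in this function; it would
match what put_counters() above does when folding the sampled counters back
onto one cpu:

    /* Keep softirqs off while adding to this cpu's entries, so that a
     * concurrent ipt_do_table() on this cpu cannot bump the same
     * counters underneath us; preempt_disable() alone does not
     * exclude the packet path. */
    local_bh_disable();
    i = 0;
    /* Choose the copy that is on our node */
    loc_cpu_entry = private->entries[raw_smp_processor_id()];
    IPT_ENTRY_ITERATE(loc_cpu_entry,
                      private->size,
                      add_counter_to_entry,
                      paddc,
                      &i);
    local_bh_enable();
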
> i = 0;
> /* Choose the copy that is on our node */
> loc_cpu_entry = private->entries[raw_smp_processor_id()];
> @@ -1408,8 +1461,9 @@ do_add_counters(struct net *net, void __
> add_counter_to_entry,
> paddc,
> &i);
> + preempt_enable();
> unlock_up_free:
> - write_unlock_bh(&t->lock);
> + mutex_unlock(&t->lock);
> xt_table_unlock(t);
> module_put(t->me);
> free:
> --- a/net/netfilter/x_tables.c 2009-02-09 08:30:58.642782131 -0800
> +++ b/net/netfilter/x_tables.c 2009-02-09 08:32:24.018308401 -0800
> @@ -625,6 +625,20 @@ void xt_free_table_info(struct xt_table_
> }
> EXPORT_SYMBOL(xt_free_table_info);
>
> +void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo,
> + struct xt_table_info *newinfo)
> +{
> + unsigned int cpu;
> +
> + for_each_possible_cpu(cpu) {
> + void *p = oldinfo->entries[cpu];
> + rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]);
> + newinfo->entries[cpu] = p;
> + }
> +
> +}
> +EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu);
> +
> /* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */
> struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
> const char *name)
> @@ -671,21 +685,22 @@ xt_replace_table(struct xt_table *table,
> struct xt_table_info *oldinfo, *private;
>
> /* Do the substitution. */
> - write_lock_bh(&table->lock);
> + mutex_lock(&table->lock);
> private = table->private;
> /* Check inside lock: is the old number correct? */
> if (num_counters != private->number) {
> duprintf("num_counters != table->private->number (%u/%u)\n",
> num_counters, private->number);
> - write_unlock_bh(&table->lock);
> + mutex_unlock(&table->lock);
> *error = -EAGAIN;
> return NULL;
> }
> oldinfo = private;
> - table->private = newinfo;
> + rcu_assign_pointer(table->private, newinfo);
> newinfo->initial_entries = oldinfo->initial_entries;
> - write_unlock_bh(&table->lock);
> + mutex_unlock(&table->lock);
>
> + synchronize_net();
> return oldinfo;
> }
> EXPORT_SYMBOL_GPL(xt_replace_table);
> @@ -719,7 +734,8 @@ struct xt_table *xt_register_table(struc
>
> /* Simplifies replace_table code. */
> table->private = bootstrap;
> - rwlock_init(&table->lock);
> + mutex_init(&table->lock);
> +
> if (!xt_replace_table(table, 0, newinfo, &ret))
> goto unlock;
>
> --- a/net/ipv4/netfilter/arp_tables.c 2009-02-09 08:30:58.630817607 -0800
> +++ b/net/ipv4/netfilter/arp_tables.c 2009-02-09 09:07:18.120931959 -0800
> @@ -237,9 +237,10 @@ unsigned int arpt_do_table(struct sk_buf
> indev = in ? in->name : nulldevname;
> outdev = out ? out->name : nulldevname;
>
> - read_lock_bh(&table->lock);
> - private = table->private;
> - table_base = (void *)private->entries[smp_processor_id()];
> + rcu_read_lock_bh();
> + private = rcu_dereference(table->private);
> + table_base = rcu_dereference(private->entries[smp_processor_id()]);
> +
> e = get_entry(table_base, private->hook_entry[hook]);
> back = get_entry(table_base, private->underflow[hook]);
>
> @@ -311,7 +312,8 @@ unsigned int arpt_do_table(struct sk_buf
> e = (void *)e + e->next_offset;
> }
> } while (!hotdrop);
> - read_unlock_bh(&table->lock);
> +
> + rcu_read_unlock_bh();
>
> if (hotdrop)
> return NF_DROP;
> @@ -714,11 +716,65 @@ static void get_counters(const struct xt
> }
> }
>
> -static inline struct xt_counters *alloc_counters(struct xt_table *table)
> +
> +/* We're lazy, and add to the first CPU; overflow works its fey magic
> + * and everything is OK. */
> +static int
> +add_counter_to_entry(struct arpt_entry *e,
> + const struct xt_counters addme[],
> + unsigned int *i)
> +{
> + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> +
> + (*i)++;
> + return 0;
> +}
> +
> +/* Take values from counters and add them back onto the current cpu */
> +static void put_counters(struct xt_table_info *t,
> + const struct xt_counters counters[])
> +{
> + unsigned int i, cpu;
> +
> + local_bh_disable();
> + cpu = smp_processor_id();
> + i = 0;
> + ARPT_ENTRY_ITERATE(t->entries[cpu],
> + t->size,
> + add_counter_to_entry,
> + counters,
> + &i);
> + local_bh_enable();
> +}
> +
> +static inline int
> +zero_entry_counter(struct arpt_entry *e, void *arg)
> +{
> + e->counters.bcnt = 0;
> + e->counters.pcnt = 0;
> + return 0;
> +}
> +
> +static void
> +clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
> +{
> + unsigned int cpu;
> + const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
> +
> + memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
> + for_each_possible_cpu(cpu) {
> + memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
> + ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
> + zero_entry_counter, NULL);
> + }
> +}
> +
> +static struct xt_counters *alloc_counters(struct xt_table *table)
> {
> unsigned int countersize;
> struct xt_counters *counters;
> - const struct xt_table_info *private = table->private;
> + struct xt_table_info *private = table->private;
> + struct xt_table_info *info;
>
> /* We need atomic snapshot of counters: rest doesn't change
> * (other than comefrom, which userspace doesn't care
> @@ -728,14 +784,30 @@ static inline struct xt_counters *alloc_
> counters = vmalloc_node(countersize, numa_node_id());
>
> if (counters == NULL)
> - return ERR_PTR(-ENOMEM);
> + goto nomem;
>
> - /* First, sum counters... */
> - write_lock_bh(&table->lock);
> - get_counters(private, counters);
> - write_unlock_bh(&table->lock);
> + info = xt_alloc_table_info(private->size);
> + if (!info)
> + goto free_counters;
> +
> + clone_counters(info, private);
> +
> + mutex_lock(&table->lock);
> + xt_table_entry_swap_rcu(private, info);
> + synchronize_net(); /* Wait until smoke has cleared */
> +
> + get_counters(info, counters);
> + put_counters(private, counters);
> + mutex_unlock(&table->lock);
> +
> + xt_free_table_info(info);
>
> return counters;
> +
> + free_counters:
> + vfree(counters);
> + nomem:
> + return ERR_PTR(-ENOMEM);
> }
>
> static int copy_entries_to_user(unsigned int total_size,
> @@ -1075,20 +1147,6 @@ static int do_replace(struct net *net, v
> return ret;
> }
>
> -/* We're lazy, and add to the first CPU; overflow works its fey magic
> - * and everything is OK.
> - */
> -static inline int add_counter_to_entry(struct arpt_entry *e,
> - const struct xt_counters addme[],
> - unsigned int *i)
> -{
> -
> - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> -
> - (*i)++;
> - return 0;
> -}
> -
> static int do_add_counters(struct net *net, void __user *user, unsigned int len,
> int compat)
> {
> @@ -1148,13 +1206,14 @@ static int do_add_counters(struct net *n
> goto free;
> }
>
> - write_lock_bh(&t->lock);
> + mutex_lock(&t->lock);
> private = t->private;
> if (private->number != num_counters) {
> ret = -EINVAL;
> goto unlock_up_free;
> }
>
> + preempt_disable();
> i = 0;
> /* Choose the copy that is on our node */
> loc_cpu_entry = private->entries[smp_processor_id()];
> @@ -1164,7 +1223,8 @@ static int do_add_counters(struct net *n
> paddc,
> &i);
> unlock_up_free:
> - write_unlock_bh(&t->lock);
> + mutex_unlock(&t->lock);
> +
> xt_table_unlock(t);
> module_put(t->me);
> free:
> --- a/net/ipv6/netfilter/ip6_tables.c 2009-02-09 08:30:58.662807566 -0800
> +++ b/net/ipv6/netfilter/ip6_tables.c 2009-02-09 09:00:50.195531719 -0800
> @@ -373,10 +373,12 @@ ip6t_do_table(struct sk_buff *skb,
> mtpar.family = tgpar.family = NFPROTO_IPV6;
> tgpar.hooknum = hook;
>
> - read_lock_bh(&table->lock);
> IP_NF_ASSERT(table->valid_hooks & (1 << hook));
> - private = table->private;
> - table_base = (void *)private->entries[smp_processor_id()];
> +
> + rcu_read_lock_bh();
> + private = rcu_dereference(table->private);
> + table_base = rcu_dereference(private->entries[smp_processor_id()]);
> +
> e = get_entry(table_base, private->hook_entry[hook]);
>
> /* For return from builtin chain */
> @@ -474,7 +476,7 @@ ip6t_do_table(struct sk_buff *skb,
> #ifdef CONFIG_NETFILTER_DEBUG
> ((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
> #endif
> - read_unlock_bh(&table->lock);
> + rcu_read_unlock_bh();
>
> #ifdef DEBUG_ALLOW_ALL
> return NF_ACCEPT;
> @@ -955,11 +957,64 @@ get_counters(const struct xt_table_info
> }
> }
>
> +/* We're lazy, and add to the first CPU; overflow works its fey magic
> + * and everything is OK. */
> +static int
> +add_counter_to_entry(struct ip6t_entry *e,
> + const struct xt_counters addme[],
> + unsigned int *i)
> +{
> + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> +
> + (*i)++;
> + return 0;
> +}
> +
> +/* Take values from counters and add them back onto the current cpu */
> +static void put_counters(struct xt_table_info *t,
> + const struct xt_counters counters[])
> +{
> + unsigned int i, cpu;
> +
> + local_bh_disable();
> + cpu = smp_processor_id();
> + i = 0;
> + IP6T_ENTRY_ITERATE(t->entries[cpu],
> + t->size,
> + add_counter_to_entry,
> + counters,
> + &i);
> + local_bh_enable();
> +}
> +
> +static inline int
> +zero_entry_counter(struct ip6t_entry *e, void *arg)
> +{
> + e->counters.bcnt = 0;
> + e->counters.pcnt = 0;
> + return 0;
> +}
> +
> +static void
> +clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
> +{
> + unsigned int cpu;
> + const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
> +
> + memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
> + for_each_possible_cpu(cpu) {
> + memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
> + IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
> + zero_entry_counter, NULL);
> + }
> +}
> +
> static struct xt_counters *alloc_counters(struct xt_table *table)
> {
> unsigned int countersize;
> struct xt_counters *counters;
> - const struct xt_table_info *private = table->private;
> + struct xt_table_info *private = table->private;
> + struct xt_table_info *info;
>
> /* We need atomic snapshot of counters: rest doesn't change
> (other than comefrom, which userspace doesn't care
> @@ -968,14 +1023,28 @@ static struct xt_counters *alloc_counter
> counters = vmalloc_node(countersize, numa_node_id());
>
> if (counters == NULL)
> - return ERR_PTR(-ENOMEM);
> + goto nomem;
>
> - /* First, sum counters... */
> - write_lock_bh(&table->lock);
> - get_counters(private, counters);
> - write_unlock_bh(&table->lock);
> + info = xt_alloc_table_info(private->size);
> + if (!info)
> + goto free_counters;
> +
> + clone_counters(info, private);
>
> - return counters;
> + mutex_lock(&table->lock);
> + xt_table_entry_swap_rcu(private, info);
> + synchronize_net(); /* Wait until smoke has cleared */
> +
> + get_counters(info, counters);
> + put_counters(private, counters);
> + mutex_unlock(&table->lock);
> +
> + xt_free_table_info(info);
> +
> + free_counters:
> + vfree(counters);
> + nomem:
> + return ERR_PTR(-ENOMEM);
> }
>
> static int
> @@ -1342,28 +1411,6 @@ do_replace(struct net *net, void __user
> return ret;
> }
>
> -/* We're lazy, and add to the first CPU; overflow works its fey magic
> - * and everything is OK. */
> -static inline int
> -add_counter_to_entry(struct ip6t_entry *e,
> - const struct xt_counters addme[],
> - unsigned int *i)
> -{
> -#if 0
> - duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n",
> - *i,
> - (long unsigned int)e->counters.pcnt,
> - (long unsigned int)e->counters.bcnt,
> - (long unsigned int)addme[*i].pcnt,
> - (long unsigned int)addme[*i].bcnt);
> -#endif
> -
> - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> -
> - (*i)++;
> - return 0;
> -}
> -
> static int
> do_add_counters(struct net *net, void __user *user, unsigned int len,
> int compat)
> @@ -1424,13 +1471,14 @@ do_add_counters(struct net *net, void __
> goto free;
> }
>
> - write_lock_bh(&t->lock);
> + mutex_lock(&t->lock);
> private = t->private;
> if (private->number != num_counters) {
> ret = -EINVAL;
> goto unlock_up_free;
> }
>
> + preempt_disable();
> i = 0;
> /* Choose the copy that is on our node */
> loc_cpu_entry = private->entries[raw_smp_processor_id()];
> @@ -1439,8 +1487,9 @@ do_add_counters(struct net *net, void __
> add_counter_to_entry,
> paddc,
> &i);
> + preempt_enable();
> unlock_up_free:
> - write_unlock_bh(&t->lock);
> + mutex_unlock(&t->lock);
> xt_table_unlock(t);
> module_put(t->me);
> free:
> --

I applied and tested the patch and saw no issues. Oprofile results confirm a
significant reduction in CPU cycles spent in ipt_do_table while running
hundreds of threads of netperf traffic. Thanks!

-Ranjit