Yi-Hung Wei <yihung.wei@xxxxxxxxx> wrote:
> From: Florian Westphal <fw@xxxxxxxxx>
>
> This patch adds a list lock to 'struct nf_conncount_list' so that we can
> alter the lists containing the individual connections without holding the
> main tree lock. This is useful when we only need to add to or remove from
> a list without allocating or removing a node in the tree.
>
> This patch also uses RCU for the initial tree search.
>
> We also update nft_connlimit accordingly, since we no longer need to
> maintain a separate list lock in nft_connlimit.
>
> Signed-off-by: Florian Westphal <fw@xxxxxxxxx>
> Signed-off-by: Yi-Hung Wei <yihung.wei@xxxxxxxxx>
> ---
>  include/net/netfilter/nf_conntrack_count.h |  3 +-
>  net/netfilter/nf_conncount.c               | 98 +++++++++++++++++++-----------
>  net/netfilter/nft_connlimit.c              | 15 +----
>  3 files changed, 66 insertions(+), 50 deletions(-)
>
> +static void __tree_nodes_free(struct rcu_head *h)
> +{
> +	struct nf_conncount_rb *rbconn;
> +
> +	rbconn = container_of(h, struct nf_conncount_rb, rcu_head);
> +	kmem_cache_free(conncount_rb_cachep, rbconn);
> +}
> +
>  static void tree_nodes_free(struct rb_root *root,
>  			    struct nf_conncount_rb *gc_nodes[],
>  			    unsigned int gc_count)
> @@ -215,7 +251,7 @@ static void tree_nodes_free(struct rb_root *root,
>  	while (gc_count) {
>  		rbconn = gc_nodes[--gc_count];
>  		rb_erase(&rbconn->node, root);
> -		kmem_cache_free(conncount_rb_cachep, rbconn);
> +		call_rcu(&rbconn->rcu_head, __tree_nodes_free);
>  	}
>  }
>
> @@ -293,62 +329,59 @@ count_tree(struct net *net,
>  {
>  	struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES];
>  	struct rb_root *root;
> -	struct rb_node **rbnode, *parent;
> +	struct rb_node *parent;
>  	struct nf_conncount_rb *rbconn;
>  	unsigned int gc_count, hash;
>  	bool no_gc = false;
> -	unsigned int count = 0;
>  	u8 keylen = data->keylen;
>
>  	hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS;
>  	root = &data->root[hash];
>
> -	spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
>  restart:
>  	gc_count = 0;
> -	parent = NULL;
> -	rbnode = &(root->rb_node);
> -	while (*rbnode) {
> +	parent = rcu_dereference_raw(root->rb_node);
> +	while (parent) {
>  		int diff;
>  		bool addit;
>
> -		rbconn = rb_entry(*rbnode, struct nf_conncount_rb, node);
> +		rbconn = rb_entry(parent, struct nf_conncount_rb, node);
>
> -		parent = *rbnode;
>  		diff = key_diff(key, rbconn->key, keylen);
>  		if (diff < 0) {
> -			rbnode = &((*rbnode)->rb_left);
> +			parent = rcu_dereference_raw(parent->rb_left);
>  		} else if (diff > 0) {
> -			rbnode = &((*rbnode)->rb_right);
> +			parent = rcu_dereference_raw(parent->rb_right);
>  		} else {
>  			/* same source network -> be counted! */
>  			nf_conncount_lookup(net, &rbconn->list, tuple, zone,
>  					    &addit);
> -			count = rbconn->list.count;
>
> +			spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
>  			tree_nodes_free(root, gc_nodes, gc_count);
> +			spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);

This looks racy to me: another cpu could be calling count_tree() at the
same time and place the same node in its gc_nodes[].

I'd suggest getting rid of the tree_nodes freeing here and only doing it
in insert_tree(), where the entire traversal is protected by the
spinlock.
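To illustrate the hazard (made-up interleaving, not from the patch; both
cpus run count_tree() on the same hash slot under rcu_read_lock(), and
neither holds the per-slot lock while collecting gc candidates):

	cpu0					cpu1
	----					----
	nf_conncount_gc_list() finds
	rbconn's list empty
	gc_nodes[gc_count++] = rbconn
						nf_conncount_gc_list() finds
						the same list empty
						gc_nodes[gc_count++] = rbconn
	spin_lock_bh(slot lock)
	rb_erase(&rbconn->node, root)
	call_rcu(&rbconn->rcu_head, ...)
	spin_unlock_bh(slot lock)
						spin_lock_bh(slot lock)
						rb_erase(&rbconn->node, root)
						  <- erases a node that is
						     no longer in the tree
						call_rcu(&rbconn->rcu_head, ...)
						  <- queues the same rcu_head
						     twice -> double free

Taking the lock around tree_nodes_free() only serializes the erase; it
does not stop both cpus from collecting the same node beforehand.

Handwavy, completely untested sketch of what I mean for insert_tree(),
reusing the identifiers from your patch (the 'restart' label mirrors the
one in count_tree(); insert_tree() would also need to be passed 'net'
for the gc):

	spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
restart:
	gc_count = 0;
	parent = NULL;
	rbnode = &(root->rb_node);
	while (*rbnode) {
		int diff;

		rbconn = rb_entry(*rbnode, struct nf_conncount_rb, node);

		parent = *rbnode;
		diff = key_diff(key, rbconn->key, keylen);
		if (diff < 0) {
			rbnode = &((*rbnode)->rb_left);
		} else if (diff > 0) {
			rbnode = &((*rbnode)->rb_right);
		} else {
			/* unlikely: another cpu inserted this key while we
			 * did not hold the lock -- count/add on the existing
			 * list like count_tree() does, then unlock and return
			 */
			goto out_unlock;
		}

		/* collecting here is safe: the slot lock is held across the
		 * whole traversal, so no other cpu can place the same node
		 * in its gc_nodes[] or erase it under us
		 */
		if (gc_count < ARRAY_SIZE(gc_nodes) &&
		    nf_conncount_gc_list(net, &rbconn->list))
			gc_nodes[gc_count++] = rbconn;
	}

	if (gc_count) {
		tree_nodes_free(root, gc_nodes, gc_count);
		goto restart;	/* erased nodes invalidated the walk */
	}

	/* allocate the new nf_conncount_rb and rb_link_node() +
	 * rb_insert_color() it, still under the slot lock
	 */
out_unlock:
	spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);

That keeps the rcu-only fast path in count_tree() intact and confines all
tree mutation to the locked slow path.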
>  			if (!addit)
> -				goto out_unlock;
> +				return rbconn->list.count;
>
>  			if (!nf_conncount_add(&rbconn->list, tuple, zone))
> -				count = 0; /* hotdrop */
> -			goto out_unlock;
> +				return 0; /* hotdrop */
>
> -			count++;
> -			goto out_unlock;
> +			return rbconn->list.count;
>  		}
>
>  		if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes))
>  			continue;
>
> -		nf_conncount_gc_list(net, &rbconn->list);
> -		if (list_empty(&rbconn->list.head))
> +		if (nf_conncount_gc_list(net, &rbconn->list))
>  			gc_nodes[gc_count++] = rbconn;
>  	}
>
>  	if (gc_count) {
>  		no_gc = true;
> +		spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
>  		tree_nodes_free(root, gc_nodes, gc_count);
> +		spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
>  		/* tree_node_free before new allocation permits
>  		 * allocator to re-use newly free'd object.
>  		 *
> @@ -358,20 +391,15 @@ count_tree(struct net *net,
>  		goto restart;
>  	}
>
> -	count = 0;
>  	if (!tuple)
> -		goto out_unlock;
> +		return 0;
>
> -	spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
>  	return insert_tree(root, hash, key, keylen, tuple, zone);
> -
> -out_unlock:
> -	spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
> -	return count;
>  }
>
>  /* Count and return number of conntrack entries in 'net' with particular 'key'.
>   * If 'tuple' is not null, insert it into the accounting data structure.
> + * Call with RCU read lock.
>   */
>  unsigned int nf_conncount_count(struct net *net,
>  				struct nf_conncount_data *data,
> diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c
> index 37c52ae06741..3dbc737915da 100644
> --- a/net/netfilter/nft_connlimit.c
> +++ b/net/netfilter/nft_connlimit.c
> @@ -14,7 +14,6 @@
>  #include <net/netfilter/nf_conntrack_zones.h>
>
>  struct nft_connlimit {
> -	spinlock_t lock;
>  	struct nf_conncount_list list;
>  	u32 limit;
>  	bool invert;
> @@ -45,7 +44,6 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv,
>  		return;
>  	}
>
> -	spin_lock_bh(&priv->lock);
>  	nf_conncount_lookup(nft_net(pkt), &priv->list, tuple_ptr, zone,
>  			    &addit);
>  	count = priv->list.count;
> @@ -55,12 +53,10 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv,
>
>  	if (!nf_conncount_add(&priv->list, tuple_ptr, zone)) {
>  		regs->verdict.code = NF_DROP;
> -		spin_unlock_bh(&priv->lock);
>  		return;
>  	}
>  	count++;
>  out:
> -	spin_unlock_bh(&priv->lock);
>
>  	if ((count > priv->limit) ^ priv->invert) {
>  		regs->verdict.code = NFT_BREAK;
> @@ -88,7 +84,6 @@ static int nft_connlimit_do_init(const struct nft_ctx *ctx,
>  		invert = true;
>  	}
>
> -	spin_lock_init(&priv->lock);
>  	nf_conncount_list_init(&priv->list);
>  	priv->limit = limit;
>  	priv->invert = invert;
> @@ -213,7 +208,6 @@ static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src)
>  	struct nft_connlimit *priv_dst = nft_expr_priv(dst);
>  	struct nft_connlimit *priv_src = nft_expr_priv(src);
>
> -	spin_lock_init(&priv_dst->lock);
>  	nf_conncount_list_init(&priv_dst->list);
>  	priv_dst->limit = priv_src->limit;
>  	priv_dst->invert = priv_src->invert;
> @@ -232,15 +226,8 @@ static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx,
>  static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr)
>  {
>  	struct nft_connlimit *priv = nft_expr_priv(expr);
> -	bool ret;
>
> -	spin_lock_bh(&priv->lock);
> -	nf_conncount_gc_list(net, &priv->list);
> -
> -	ret = list_empty(&priv->list.head);
> -	spin_unlock_bh(&priv->lock);
> -
> -	return ret;
> +	return nf_conncount_gc_list(net, &priv->list);
>  }
>
>  static struct nft_expr_type nft_connlimit_type;
> --
> 2.7.4

--
Florian Westphal <fw@xxxxxxxxx> 4096R/AD5FF600 2015-09-13
Key fingerprint = 80A9 20C5 B203 E069 F586 AE9F 7091 A8D9 AD5F F600
Phone: +49 151 11132303