From: Florian Westphal <fw@xxxxxxxxx> This patch adds list lock to 'struct nf_conncount_list' so that we can alter the lists containing the individual connections without holding the main tree lock. It would be useful when we only need to add/remove to/from a list without allocate/remove a node in the tree. This patch also use RCU for the initial tree search. We also update nft_connlimit accordingly since we longer need to maintain a list lock in nft_connlimit now. Signed-off-by: Florian Westphal <fw@xxxxxxxxx> Signed-off-by: Yi-Hung Wei <yihung.wei@xxxxxxxxx> --- include/net/netfilter/nf_conntrack_count.h | 3 +- net/netfilter/nf_conncount.c | 98 +++++++++++++++++++----------- net/netfilter/nft_connlimit.c | 15 +---- 3 files changed, 66 insertions(+), 50 deletions(-) diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h index dbec17f674b7..af0c1c95eccd 100644 --- a/include/net/netfilter/nf_conntrack_count.h +++ b/include/net/netfilter/nf_conntrack_count.h @@ -6,6 +6,7 @@ struct nf_conncount_data; struct nf_conncount_list { + spinlock_t list_lock; struct list_head head; /* connections with the same filtering key */ unsigned int count; /* length of list */ }; @@ -32,7 +33,7 @@ bool nf_conncount_add(struct nf_conncount_list *list, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone); -void nf_conncount_gc_list(struct net *net, +bool nf_conncount_gc_list(struct net *net, struct nf_conncount_list *list); void nf_conncount_cache_free(struct nf_conncount_list *list); diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index e6f854cc268e..760ba24db735 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -47,12 +47,14 @@ struct nf_conncount_tuple { struct list_head node; struct nf_conntrack_tuple tuple; struct nf_conntrack_zone zone; + struct rcu_head rcu_head; }; struct nf_conncount_rb { struct rb_node node; struct nf_conncount_list list; u32 key[MAX_KEYLEN]; + struct rcu_head rcu_head; }; static spinlock_t nf_conncount_locks[CONNCOUNT_LOCK_SLOTS] __cacheline_aligned_in_smp; @@ -94,21 +96,42 @@ bool nf_conncount_add(struct nf_conncount_list *list, return false; conn->tuple = *tuple; conn->zone = *zone; + spin_lock(&list->list_lock); list_add_tail(&conn->node, &list->head); list->count++; + spin_unlock(&list->list_lock); return true; } EXPORT_SYMBOL_GPL(nf_conncount_add); -static void conn_free(struct nf_conncount_list *list, +static void __conn_free(struct rcu_head *h) +{ + struct nf_conncount_tuple *conn; + + conn = container_of(h, struct nf_conncount_tuple, rcu_head); + kmem_cache_free(conncount_conn_cachep, conn); +} + +static bool conn_free(struct nf_conncount_list *list, struct nf_conncount_tuple *conn) { - if (WARN_ON_ONCE(list->count == 0)) - return; + bool free_entry = false; + + spin_lock(&list->list_lock); + + if (list->count == 0) { + spin_unlock(&list->list_lock); + return free_entry; + } list->count--; - list_del(&conn->node); - kmem_cache_free(conncount_conn_cachep, conn); + list_del_rcu(&conn->node); + if (list->count == 0) + free_entry = true; + + spin_unlock(&list->list_lock); + call_rcu(&conn->rcu_head, __conn_free); + return free_entry; } void nf_conncount_lookup(struct net *net, @@ -166,12 +189,14 @@ EXPORT_SYMBOL_GPL(nf_conncount_lookup); void nf_conncount_list_init(struct nf_conncount_list *list) { + spin_lock_init(&list->list_lock); INIT_LIST_HEAD(&list->head); list->count = 1; } EXPORT_SYMBOL_GPL(nf_conncount_list_init); -void nf_conncount_gc_list(struct net *net, +/* Return true if the list is empty */ +bool nf_conncount_gc_list(struct net *net, struct nf_conncount_list *list) { const struct nf_conntrack_tuple_hash *found; @@ -182,7 +207,8 @@ void nf_conncount_gc_list(struct net *net, list_for_each_entry_safe(conn, conn_n, &list->head, node) { found = nf_conntrack_find_get(net, &conn->zone, &conn->tuple); if (found == NULL) { - conn_free(list, conn); + if(conn_free(list, conn)) + return true; collected++; continue; } @@ -194,18 +220,28 @@ void nf_conncount_gc_list(struct net *net, * closed already -> ditch it */ nf_ct_put(found_ct); - conn_free(list, conn); + if (conn_free(list, conn)) + return true; collected++; continue; } nf_ct_put(found_ct); if (collected > CONNCOUNT_GC_MAX_NODES) - return; + return false; } + return false; } EXPORT_SYMBOL_GPL(nf_conncount_gc_list); +static void __tree_nodes_free(struct rcu_head *h) +{ + struct nf_conncount_rb *rbconn; + + rbconn = container_of(h, struct nf_conncount_rb, rcu_head); + kmem_cache_free(conncount_rb_cachep, rbconn); +} + static void tree_nodes_free(struct rb_root *root, struct nf_conncount_rb *gc_nodes[], unsigned int gc_count) @@ -215,7 +251,7 @@ static void tree_nodes_free(struct rb_root *root, while (gc_count) { rbconn = gc_nodes[--gc_count]; rb_erase(&rbconn->node, root); - kmem_cache_free(conncount_rb_cachep, rbconn); + call_rcu(&rbconn->rcu_head, __tree_nodes_free); } } @@ -293,62 +329,59 @@ count_tree(struct net *net, { struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES]; struct rb_root *root; - struct rb_node **rbnode, *parent; + struct rb_node *parent; struct nf_conncount_rb *rbconn; unsigned int gc_count, hash; bool no_gc = false; - unsigned int count = 0; u8 keylen = data->keylen; hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS; root = &data->root[hash]; - spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); restart: gc_count = 0; - parent = NULL; - rbnode = &(root->rb_node); - while (*rbnode) { + parent = rcu_dereference_raw(root->rb_node); + while (parent) { int diff; bool addit; - rbconn = rb_entry(*rbnode, struct nf_conncount_rb, node); + rbconn = rb_entry(parent, struct nf_conncount_rb, node); - parent = *rbnode; diff = key_diff(key, rbconn->key, keylen); if (diff < 0) { - rbnode = &((*rbnode)->rb_left); + parent = rcu_dereference_raw(parent->rb_left); } else if (diff > 0) { - rbnode = &((*rbnode)->rb_right); + parent = rcu_dereference_raw(parent->rb_right); } else { /* same source network -> be counted! */ nf_conncount_lookup(net, &rbconn->list, tuple, zone, &addit); - count = rbconn->list.count; + spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); tree_nodes_free(root, gc_nodes, gc_count); + spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); + if (!addit) - goto out_unlock; + return rbconn->list.count; if (!nf_conncount_add(&rbconn->list, tuple, zone)) - count = 0; /* hotdrop */ - goto out_unlock; + return 0; /* hotdrop */ - count++; - goto out_unlock; + return rbconn->list.count; } if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes)) continue; - nf_conncount_gc_list(net, &rbconn->list); - if (list_empty(&rbconn->list.head)) + if (nf_conncount_gc_list(net, &rbconn->list)) gc_nodes[gc_count++] = rbconn; } if (gc_count) { no_gc = true; + spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); tree_nodes_free(root, gc_nodes, gc_count); + spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); /* tree_node_free before new allocation permits * allocator to re-use newly free'd object. * @@ -358,20 +391,15 @@ count_tree(struct net *net, goto restart; } - count = 0; if (!tuple) - goto out_unlock; + return 0; - spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); return insert_tree(root, hash, key, keylen, tuple, zone); - -out_unlock: - spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); - return count; } /* Count and return number of conntrack entries in 'net' with particular 'key'. * If 'tuple' is not null, insert it into the accounting data structure. + * Call with RCU read lock. */ unsigned int nf_conncount_count(struct net *net, struct nf_conncount_data *data, diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index 37c52ae06741..3dbc737915da 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -14,7 +14,6 @@ #include <net/netfilter/nf_conntrack_zones.h> struct nft_connlimit { - spinlock_t lock; struct nf_conncount_list list; u32 limit; bool invert; @@ -45,7 +44,6 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv, return; } - spin_lock_bh(&priv->lock); nf_conncount_lookup(nft_net(pkt), &priv->list, tuple_ptr, zone, &addit); count = priv->list.count; @@ -55,12 +53,10 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv, if (!nf_conncount_add(&priv->list, tuple_ptr, zone)) { regs->verdict.code = NF_DROP; - spin_unlock_bh(&priv->lock); return; } count++; out: - spin_unlock_bh(&priv->lock); if ((count > priv->limit) ^ priv->invert) { regs->verdict.code = NFT_BREAK; @@ -88,7 +84,6 @@ static int nft_connlimit_do_init(const struct nft_ctx *ctx, invert = true; } - spin_lock_init(&priv->lock); nf_conncount_list_init(&priv->list); priv->limit = limit; priv->invert = invert; @@ -213,7 +208,6 @@ static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src) struct nft_connlimit *priv_dst = nft_expr_priv(dst); struct nft_connlimit *priv_src = nft_expr_priv(src); - spin_lock_init(&priv_dst->lock); nf_conncount_list_init(&priv_dst->list); priv_dst->limit = priv_src->limit; priv_dst->invert = priv_src->invert; @@ -232,15 +226,8 @@ static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx, static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr) { struct nft_connlimit *priv = nft_expr_priv(expr); - bool ret; - spin_lock_bh(&priv->lock); - nf_conncount_gc_list(net, &priv->list); - - ret = list_empty(&priv->list.head); - spin_unlock_bh(&priv->lock); - - return ret; + return nf_conncount_gc_list(net, &priv->list); } static struct nft_expr_type nft_connlimit_type; -- 2.7.4 -- To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html