From: Florian Westphal <fw@xxxxxxxxx> There is no need for asynchronous garbage collection, rbtree inserts can only happen from the netlink control plane. We already perform on-demand gc on insertion, in the area of the tree where the insertion takes place, but we don't do a full tree walk there for performance reasons. Do a full gc walk at the end of the transaction instead and remove the async worker. Signed-off-by: Florian Westphal <fw@xxxxxxxxx> Signed-off-by: Pablo Neira Ayuso <pablo@xxxxxxxxxxxxx> --- net/netfilter/nft_set_rbtree.c | 124 +++++++++++++++++---------------- 1 file changed, 65 insertions(+), 59 deletions(-) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index d59be2bc6e6c..7d1004f9e7d2 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -19,7 +19,7 @@ struct nft_rbtree { struct rb_root root; rwlock_t lock; seqcount_rwlock_t count; - struct delayed_work gc_work; + unsigned long last_gc; }; struct nft_rbtree_elem { @@ -48,8 +48,7 @@ static int nft_rbtree_cmp(const struct nft_set *set, static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe) { - return nft_set_elem_expired(&rbe->ext) || - nft_set_elem_is_dead(&rbe->ext); + return nft_set_elem_expired(&rbe->ext); } static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, @@ -508,6 +507,15 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, return err; } +static void nft_rbtree_erase(struct nft_rbtree *priv, struct nft_rbtree_elem *rbe) +{ + write_lock_bh(&priv->lock); + write_seqcount_begin(&priv->count); + rb_erase(&rbe->node, &priv->root); + write_seqcount_end(&priv->count); + write_unlock_bh(&priv->lock); +} + static void nft_rbtree_remove(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) @@ -515,11 +523,7 @@ static void nft_rbtree_remove(const struct net *net, struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe = elem->priv; - write_lock_bh(&priv->lock); - write_seqcount_begin(&priv->count); - rb_erase(&rbe->node, &priv->root); - write_seqcount_end(&priv->count); - write_unlock_bh(&priv->lock); + nft_rbtree_erase(priv, rbe); } static void nft_rbtree_activate(const struct net *net, @@ -613,45 +617,40 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, read_unlock_bh(&priv->lock); } -static void nft_rbtree_gc(struct work_struct *work) +static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set, + struct nft_rbtree *priv, + struct nft_rbtree_elem *rbe) { + struct nft_set_elem elem = { + .priv = rbe, + }; + + nft_setelem_data_deactivate(net, set, &elem); + nft_rbtree_erase(priv, rbe); +} + +static void nft_rbtree_gc(struct nft_set *set) +{ + struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe, *rbe_end = NULL; struct nftables_pernet *nft_net; - struct nft_rbtree *priv; + struct rb_node *node, *next; struct nft_trans_gc *gc; - struct rb_node *node; - struct nft_set *set; - unsigned int gc_seq; struct net *net; - priv = container_of(work, struct nft_rbtree, gc_work.work); set = nft_set_container_of(priv); net = read_pnet(&set->net); nft_net = nft_pernet(net); - gc_seq = READ_ONCE(nft_net->gc_seq); - if (nft_set_gc_is_pending(set)) - goto done; - - gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL); + gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL); if (!gc) - goto done; - - read_lock_bh(&priv->lock); - for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { + return; - /* Ruleset has been updated, try later. */ - if (READ_ONCE(nft_net->gc_seq) != gc_seq) { - nft_trans_gc_destroy(gc); - gc = NULL; - goto try_later; - } + for (node = rb_first(&priv->root); node ; node = next) { + next = rb_next(node); rbe = rb_entry(node, struct nft_rbtree_elem, node); - if (nft_set_elem_is_dead(&rbe->ext)) - goto dead_elem; - /* elements are reversed in the rbtree for historical reasons, * from highest to lowest value, that is why end element is * always visited before the start element. @@ -663,37 +662,34 @@ static void nft_rbtree_gc(struct work_struct *work) if (!nft_set_elem_expired(&rbe->ext)) continue; - nft_set_elem_dead(&rbe->ext); - - if (!rbe_end) - continue; - - nft_set_elem_dead(&rbe_end->ext); - - gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); if (!gc) goto try_later; - nft_trans_gc_elem_add(gc, rbe_end); - rbe_end = NULL; -dead_elem: - gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + /* end element needs to be removed first, it has + * no timeout extension. + */ + if (rbe_end) { + nft_rbtree_gc_remove(net, set, priv, rbe_end); + nft_trans_gc_elem_add(gc, rbe_end); + rbe_end = NULL; + } + + gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); if (!gc) goto try_later; + nft_rbtree_gc_remove(net, set, priv, rbe); nft_trans_gc_elem_add(gc, rbe); } - gc = nft_trans_gc_catchall_async(gc, gc_seq); - try_later: - read_unlock_bh(&priv->lock); - if (gc) - nft_trans_gc_queue_async_done(gc); -done: - queue_delayed_work(system_power_efficient_wq, &priv->gc_work, - nft_set_gc_interval(set)); + if (gc) { + gc = nft_trans_gc_catchall_sync(gc); + nft_trans_gc_queue_sync_done(gc); + priv->last_gc = jiffies; + } } static u64 nft_rbtree_privsize(const struct nlattr * const nla[], @@ -712,11 +708,6 @@ static int nft_rbtree_init(const struct nft_set *set, seqcount_rwlock_init(&priv->count, &priv->lock); priv->root = RB_ROOT; - INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rbtree_gc); - if (set->flags & NFT_SET_TIMEOUT) - queue_delayed_work(system_power_efficient_wq, &priv->gc_work, - nft_set_gc_interval(set)); - return 0; } @@ -727,8 +718,6 @@ static void nft_rbtree_destroy(const struct nft_ctx *ctx, struct nft_rbtree_elem *rbe; struct rb_node *node; - cancel_delayed_work_sync(&priv->gc_work); - rcu_barrier(); while ((node = priv->root.rb_node) != NULL) { rb_erase(node, &priv->root); rbe = rb_entry(node, struct nft_rbtree_elem, node); @@ -754,6 +743,21 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, return true; } +static void nft_rbtree_commit(struct nft_set *set) +{ + struct nft_rbtree *priv = nft_set_priv(set); + + if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) + nft_rbtree_gc(set); +} + +static void nft_rbtree_gc_init(const struct nft_set *set) +{ + struct nft_rbtree *priv = nft_set_priv(set); + + priv->last_gc = jiffies; +} + const struct nft_set_type nft_set_rbtree_type = { .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT, .ops = { @@ -767,6 +771,8 @@ const struct nft_set_type nft_set_rbtree_type = { .deactivate = nft_rbtree_deactivate, .flush = nft_rbtree_flush, .activate = nft_rbtree_activate, + .commit = nft_rbtree_commit, + .gc_init = nft_rbtree_gc_init, .lookup = nft_rbtree_lookup, .walk = nft_rbtree_walk, .get = nft_rbtree_get, -- 2.30.2