Re: [PATCH nf] netfilter: nf_tables: nft_set_rbtree: fix spurious insertion failure

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Thu, Sep 28, 2023 at 03:12:44PM +0200, Florian Westphal wrote:
> nft_rbtree_gc_elem() walks back and removes the end interval element that
> comes before the expired element.
> 
> There is a small chance that we've cached this element as 'rbe_ge'.
> If this happens, we hold and test a pointer that has been queued for
> freeing.
> 
> It also causes spurious insertion failures:
> 
> $ cat nft-test.20230921-143934.826.dMBwHt/test-testcases-sets-0044interval_overlap_0.1/testout.log
> Error: Could not process rule: File exists
> add element t s {  0 -  2 }
>                    ^^^^^^
> Failed to insert  0 -  2 given:
> table ip t {
>         set s {
>                 type inet_service
>                 flags interval,timeout
>                 timeout 2s
>                 gc-interval 2s
>         }
> }
> 
> The set (rbtree) is empty. The 'failure' doesn't happen on next attempt.
> 
> Reason is that when we try to insert, the tree may hold an expired
> element that collides with the range we're adding.
> While we do evict/erase this element, we can trip over this check:
> 
> if (rbe_ge && nft_rbtree_interval_end(rbe_ge) && nft_rbtree_interval_end(new))
>       return -ENOTEMPTY;
> 
> rbe_ge was erased by the synchronous gc, we should not have done this
> check.  Next attempt won't find it, so retry results in successful
> insertion.
> 
> Restart in-kernel to avoid such spurious errors.
> 
> Such restart are rare, unless userspace intentionally adds very large
> numbers of elements with very short timeouts while setting a huge
> gc interval.
> 
> Even in this case, this cannot loop forever, on each retry an existing
> element has been removed.
> 
> As the caller is holding the transaction mutex, its impossible
> for a second entity to add more expiring elements to the tree.
> 
> After this it also becomes feasible to remove the async gc worker
> and perform all garbage collection from the commit path.
> 
> Signed-off-by: Florian Westphal <fw@xxxxxxxxx>
> ---
>  Changes since v1:
>   - restart entire insertion process in case we remove
>   element that we held as lesser/greater overlap detection
>   edge candidate.

Thanks, I am still looking at finding a way to move this to .commit,
if no better solution, then let's get this in for the next round.

>  net/netfilter/nft_set_rbtree.c | 46 +++++++++++++++++++++-------------
>  1 file changed, 29 insertions(+), 17 deletions(-)
> 
> diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
> index 487572dcd614..2660ceab3759 100644
> --- a/net/netfilter/nft_set_rbtree.c
> +++ b/net/netfilter/nft_set_rbtree.c
> @@ -233,10 +233,9 @@ static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set,
>  	rb_erase(&rbe->node, &priv->root);
>  }
>  
> -static int nft_rbtree_gc_elem(const struct nft_set *__set,
> -			      struct nft_rbtree *priv,
> -			      struct nft_rbtree_elem *rbe,
> -			      u8 genmask)
> +static const struct nft_rbtree_elem *
> +nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv,
> +		   struct nft_rbtree_elem *rbe, u8 genmask)
>  {
>  	struct nft_set *set = (struct nft_set *)__set;
>  	struct rb_node *prev = rb_prev(&rbe->node);
> @@ -246,7 +245,7 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set,
>  
>  	gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC);
>  	if (!gc)
> -		return -ENOMEM;
> +		return ERR_PTR(-ENOMEM);
>  
>  	/* search for end interval coming before this element.
>  	 * end intervals don't carry a timeout extension, they
> @@ -261,6 +260,7 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set,
>  		prev = rb_prev(prev);
>  	}
>  
> +	rbe_prev = NULL;
>  	if (prev) {
>  		rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node);
>  		nft_rbtree_gc_remove(net, set, priv, rbe_prev);
> @@ -272,7 +272,7 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set,
>  		 */
>  		gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
>  		if (WARN_ON_ONCE(!gc))
> -			return -ENOMEM;
> +			return ERR_PTR(-ENOMEM);
>  
>  		nft_trans_gc_elem_add(gc, rbe_prev);
>  	}
> @@ -280,13 +280,13 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set,
>  	nft_rbtree_gc_remove(net, set, priv, rbe);
>  	gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
>  	if (WARN_ON_ONCE(!gc))
> -		return -ENOMEM;
> +		return ERR_PTR(-ENOMEM);
>  
>  	nft_trans_gc_elem_add(gc, rbe);
>  
>  	nft_trans_gc_queue_sync_done(gc);
>  
> -	return 0;
> +	return rbe_prev;
>  }
>  
>  static bool nft_rbtree_update_first(const struct nft_set *set,
> @@ -314,7 +314,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
>  	struct nft_rbtree *priv = nft_set_priv(set);
>  	u8 cur_genmask = nft_genmask_cur(net);
>  	u8 genmask = nft_genmask_next(net);
> -	int d, err;
> +	int d;
>  
>  	/* Descend the tree to search for an existing element greater than the
>  	 * key value to insert that is greater than the new element. This is the
> @@ -363,9 +363,14 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
>  		 */
>  		if (nft_set_elem_expired(&rbe->ext) &&
>  		    nft_set_elem_active(&rbe->ext, cur_genmask)) {
> -			err = nft_rbtree_gc_elem(set, priv, rbe, genmask);
> -			if (err < 0)
> -				return err;
> +			const struct nft_rbtree_elem *removed_end;
> +
> +			removed_end = nft_rbtree_gc_elem(set, priv, rbe, genmask);
> +			if (IS_ERR(removed_end))
> +				return PTR_ERR(removed_end);
> +
> +			if (removed_end == rbe_le || removed_end == rbe_ge)
> +				return -EAGAIN;
>  
>  			continue;
>  		}
> @@ -486,11 +491,18 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
>  	struct nft_rbtree_elem *rbe = elem->priv;
>  	int err;
>  
> -	write_lock_bh(&priv->lock);
> -	write_seqcount_begin(&priv->count);
> -	err = __nft_rbtree_insert(net, set, rbe, ext);
> -	write_seqcount_end(&priv->count);
> -	write_unlock_bh(&priv->lock);
> +	do {
> +		if (fatal_signal_pending(current))
> +			return -EINTR;
> +
> +		cond_resched();
> +
> +		write_lock_bh(&priv->lock);
> +		write_seqcount_begin(&priv->count);
> +		err = __nft_rbtree_insert(net, set, rbe, ext);
> +		write_seqcount_end(&priv->count);
> +		write_unlock_bh(&priv->lock);
> +	} while (err == -EAGAIN);
>  
>  	return err;
>  }
> -- 
> 2.41.0
> 



[Index of Archives]     [Netfitler Users]     [Berkeley Packet Filter]     [LARTC]     [Bugtraq]     [Yosemite Forum]

  Powered by Linux