On Wed, 18 Jan 2023 12:14:14 +0100 Pablo Neira Ayuso <pablo@xxxxxxxxxxxxx> wrote: > ...instead of a tree descent, which became overly complicated in an > attempt to cover cases where expired or inactive elements would affect > comparisons with the new element being inserted. > > Further, it turned out that it's probably impossible to cover all those > cases, as inactive nodes might entirely hide subtrees consisting of a > complete interval plus a node that makes the current insertion not > overlap. > > To speed up the overlap check, descent the tree to find a greater > element that is closer to the key value to insert. Then walk down the > node list for overlap detection. Starting the overlap check from > rb_first() unconditionally is slow, it takes 10 times longer due to the > full linear traversal of the list. > > Moreover, perform garbage collection of expired elements when walking > down the node list to avoid bogus overlap reports. ...I'm quite convinced it's fine to perform the garbage collection *after* we found the first element by descending the tree -- in the worst case we include "too many" elements in the tree walk, but never too little. > For the insertion operation itself, this essentially reverts back to the > implementation before commit 7c84d41416d8 ("netfilter: nft_set_rbtree: > Detect partial overlaps on insertion"), except that cases of complete > overlap are already handled in the overlap detection phase itself, which > slightly simplifies the loop to find the insertion point. > > Based on initial patch from Stefano Brivio, including text from the > original patch description too. > > Fixes: 7c84d41416d8 ("netfilter: nft_set_rbtree: Detect partial overlaps on insertion") > Signed-off-by: Pablo Neira Ayuso <pablo@xxxxxxxxxxxxx> Reviewed-by: Stefano Brivio <sbrivio@xxxxxxxxxx> Nits only in case you happen to re-spin this: > --- > v3: simplify selection of first node, I observer long list walk with previous > approach. > > net/netfilter/nft_set_rbtree.c | 297 +++++++++++++++++++-------------- > 1 file changed, 170 insertions(+), 127 deletions(-) > > diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c > index 7325bee7d144..c2d3c2959084 100644 > --- a/net/netfilter/nft_set_rbtree.c > +++ b/net/netfilter/nft_set_rbtree.c > @@ -38,10 +38,12 @@ static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe) > return !nft_rbtree_interval_end(rbe); > } > > -static bool nft_rbtree_equal(const struct nft_set *set, const void *this, > - const struct nft_rbtree_elem *interval) > +static int nft_rbtree_cmp(const struct nft_set *set, > + const struct nft_rbtree_elem *e1, > + const struct nft_rbtree_elem *e2) > { > - return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0; > + return memcmp(nft_set_ext_key(&e1->ext), nft_set_ext_key(&e2->ext), > + set->klen); > } > > static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, > @@ -52,7 +54,6 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set > const struct nft_rbtree_elem *rbe, *interval = NULL; > u8 genmask = nft_genmask_cur(net); > const struct rb_node *parent; > - const void *this; > int d; > > parent = rcu_dereference_raw(priv->root.rb_node); > @@ -62,12 +63,11 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set > > rbe = rb_entry(parent, struct nft_rbtree_elem, node); > > - this = nft_set_ext_key(&rbe->ext); > - d = memcmp(this, key, set->klen); > + d = memcmp(nft_set_ext_key(&rbe->ext), key, set->klen); > if (d < 0) { > parent = rcu_dereference_raw(parent->rb_left); > if (interval && > - nft_rbtree_equal(set, this, interval) && > + !nft_rbtree_cmp(set, rbe, interval) && > nft_rbtree_interval_end(rbe) && > nft_rbtree_interval_start(interval)) > continue; > @@ -215,154 +215,197 @@ static void *nft_rbtree_get(const struct net *net, const struct nft_set *set, > return rbe; > } > > +static int nft_rbtree_gc_elem(const struct nft_set *__set, > + struct nft_rbtree *priv, > + struct nft_rbtree_elem *rbe) > +{ > + struct nft_set *set = (struct nft_set *)__set; > + struct rb_node *prev = rb_prev(&rbe->node); > + struct nft_rbtree_elem *rbe_prev; > + struct nft_set_gc_batch *gcb; > + > + gcb = nft_set_gc_batch_check(set, NULL, GFP_ATOMIC); > + if (!gcb) > + return -ENOMEM; > + > + /* search for expired end interval coming before this element. */ > + do { > + rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); > + if (nft_rbtree_interval_end(rbe_prev)) > + break; > + > + prev = rb_prev(prev); > + } while (prev != NULL); > + > + rb_erase(&rbe_prev->node, &priv->root); > + rb_erase(&rbe->node, &priv->root); > + atomic_sub(2, &set->nelems); > + > + nft_set_gc_batch_add(gcb, rbe); > + nft_set_gc_batch_complete(gcb); > + > + return 0; > +} > + > static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, > struct nft_rbtree_elem *new, > struct nft_set_ext **ext) > { > - bool overlap = false, dup_end_left = false, dup_end_right = false; > + struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL; > + struct rb_node *node, *parent, **p, *first = NULL; > struct nft_rbtree *priv = nft_set_priv(set); > u8 genmask = nft_genmask_next(net); > - struct nft_rbtree_elem *rbe; > - struct rb_node *parent, **p; > - int d; > + int d, err; > > - /* Detect overlaps as we descend the tree. Set the flag in these cases: > - * > - * a1. _ _ __>| ?_ _ __| (insert end before existing end) > - * a2. _ _ ___| ?_ _ _>| (insert end after existing end) > - * a3. _ _ ___? >|_ _ __| (insert start before existing end) > - * > - * and clear it later on, as we eventually reach the points indicated by > - * '?' above, in the cases described below. We'll always meet these > - * later, locally, due to tree ordering, and overlaps for the intervals > - * that are the closest together are always evaluated last. > - * > - * b1. _ _ __>| !_ _ __| (insert end before existing start) > - * b2. _ _ ___| !_ _ _>| (insert end after existing start) > - * b3. _ _ ___! >|_ _ __| (insert start after existing end, as a leaf) > - * '--' no nodes falling in this range > - * b4. >|_ _ ! (insert start before existing start) > - * > - * Case a3. resolves to b3.: > - * - if the inserted start element is the leftmost, because the '0' > - * element in the tree serves as end element > - * - otherwise, if an existing end is found immediately to the left. If > - * there are existing nodes in between, we need to further descend the > - * tree before we can conclude the new start isn't causing an overlap > - * > - * or to b4., which, preceded by a3., means we already traversed one or > - * more existing intervals entirely, from the right. > - * > - * For a new, rightmost pair of elements, we'll hit cases b3. and b2., > - * in that order. > - * > - * The flag is also cleared in two special cases: > - * > - * b5. |__ _ _!|<_ _ _ (insert start right before existing end) > - * b6. |__ _ >|!__ _ _ (insert end right after existing start) > - * > - * which always happen as last step and imply that no further > - * overlapping is possible. > - * > - * Another special case comes from the fact that start elements matching > - * an already existing start element are allowed: insertion is not > - * performed but we return -EEXIST in that case, and the error will be > - * cleared by the caller if NLM_F_EXCL is not present in the request. > - * This way, request for insertion of an exact overlap isn't reported as > - * error to userspace if not desired. > - * > - * However, if the existing start matches a pre-existing start, but the > - * end element doesn't match the corresponding pre-existing end element, > - * we need to report a partial overlap. This is a local condition that > - * can be noticed without need for a tracking flag, by checking for a > - * local duplicated end for a corresponding start, from left and right, > - * separately. > + /* Descent the tree to search for an existing element greater than the s/Descent/Descend/ > + * key value to insert that is greater than the new element. This is the > + * first element to walk the ordered elements to find possible overlap. > */ > - > parent = NULL; > p = &priv->root.rb_node; > while (*p != NULL) { > parent = *p; > rbe = rb_entry(parent, struct nft_rbtree_elem, node); > - d = memcmp(nft_set_ext_key(&rbe->ext), > - nft_set_ext_key(&new->ext), > - set->klen); > + d = nft_rbtree_cmp(set, rbe, new); > + > if (d < 0) { > p = &parent->rb_left; > - > - if (nft_rbtree_interval_start(new)) { > - if (nft_rbtree_interval_end(rbe) && > - nft_set_elem_active(&rbe->ext, genmask) && > - !nft_set_elem_expired(&rbe->ext) && !*p) > - overlap = false; > - } else { > - if (dup_end_left && !*p) > - return -ENOTEMPTY; > - > - overlap = nft_rbtree_interval_end(rbe) && > - nft_set_elem_active(&rbe->ext, > - genmask) && > - !nft_set_elem_expired(&rbe->ext); > - > - if (overlap) { > - dup_end_right = true; > - continue; > - } > - } > } else if (d > 0) { > + first = &rbe->node; > p = &parent->rb_right; > - > - if (nft_rbtree_interval_end(new)) { > - if (dup_end_right && !*p) > - return -ENOTEMPTY; > - > - overlap = nft_rbtree_interval_end(rbe) && > - nft_set_elem_active(&rbe->ext, > - genmask) && > - !nft_set_elem_expired(&rbe->ext); > - > - if (overlap) { > - dup_end_left = true; > - continue; > - } > - } else if (nft_set_elem_active(&rbe->ext, genmask) && > - !nft_set_elem_expired(&rbe->ext)) { > - overlap = nft_rbtree_interval_end(rbe); > - } > } else { > - if (nft_rbtree_interval_end(rbe) && > - nft_rbtree_interval_start(new)) { > + if (nft_rbtree_interval_end(rbe)) > p = &parent->rb_left; > - > - if (nft_set_elem_active(&rbe->ext, genmask) && > - !nft_set_elem_expired(&rbe->ext)) > - overlap = false; > - } else if (nft_rbtree_interval_start(rbe) && > - nft_rbtree_interval_end(new)) { > + else > p = &parent->rb_right; > + } > + } > > - if (nft_set_elem_active(&rbe->ext, genmask) && > - !nft_set_elem_expired(&rbe->ext)) > - overlap = false; > - } else if (nft_set_elem_active(&rbe->ext, genmask) && > - !nft_set_elem_expired(&rbe->ext)) { > - *ext = &rbe->ext; > - return -EEXIST; > - } else { > - overlap = false; > - if (nft_rbtree_interval_end(rbe)) > - p = &parent->rb_left; > - else > - p = &parent->rb_right; > + if (!first) > + first = rb_first(&priv->root); > + > + /* Detect overlap by going through the list of valid tree nodes. > + * Values stored in the tree are in reversed order, starting from > + * highest to lowest value. > + */ > + for (node = first; node != NULL; node = rb_next(node)) { > + rbe = rb_entry(node, struct nft_rbtree_elem, node); > + > + if (!nft_set_elem_active(&rbe->ext, genmask)) > + continue; > + > + /* perform garbage collection to avoid bogus overlap reports. */ > + if (nft_set_elem_expired(&rbe->ext)) { > + err = nft_rbtree_gc_elem(set, priv, rbe); > + if (err < 0) > + return err; > + > + continue; > + } > + > + d = nft_rbtree_cmp(set, rbe, new); > + if (d == 0) { > + /* Matching end element: no need to look for an > + * overlapping greater or equal element. > + */ > + if (nft_rbtree_interval_end(rbe)) { > + rbe_le = rbe; > + break; > } > + > + /* first element that is greater or equal to key value. */ > + if (!rbe_ge) { > + rbe_ge = rbe; > + continue; > + } > + > + /* this is a closer more or equal element, update it. */ Perhaps "Another greater or equal element, update the pointer" > + if (nft_rbtree_cmp(set, rbe_ge, new) != 0) { > + rbe_ge = rbe; > + continue; > + } > + > + /* element is equal to key value, make sure flags are > + * the same, an existing more or equal start element "greater or equal" > + * must not be replaced by more or equal end element. Same here. > + */ > + if ((nft_rbtree_interval_start(new) && > + nft_rbtree_interval_start(rbe_ge)) || > + (nft_rbtree_interval_end(new) && > + nft_rbtree_interval_end(rbe_ge))) { > + rbe_ge = rbe; > + continue; > + } > + } else if (d > 0) { > + /* annotate element greater than the new element. */ > + rbe_ge = rbe; > + continue; > + } else if (d < 0) { > + /* annotate element less than the new element. */ > + rbe_le = rbe; > + break; > } > + } > > - dup_end_left = dup_end_right = false; > + /* - new start element matching existing start element: full overlap > + * reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given. > + */ > + if (rbe_ge && !nft_rbtree_cmp(set, new, rbe_ge) && > + nft_rbtree_interval_start(rbe_ge) == nft_rbtree_interval_start(new)) { > + *ext = &rbe_ge->ext; > + return -EEXIST; > } > > - if (overlap) > + /* - new end element matching existing end element: full overlap > + * reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given. > + */ > + if (rbe_le && !nft_rbtree_cmp(set, new, rbe_le) && > + nft_rbtree_interval_end(rbe_le) == nft_rbtree_interval_end(new)) { > + *ext = &rbe_le->ext; > + return -EEXIST; > + } > + > + /* - new start element with existing closest, less or equal key value > + * being a start element: partial overlap, reported as -ENOTEMPTY. > + * Anonymous sets allow for two consecutive start element since they > + * are constant, skip them to avoid bogus overlap reports. > + */ > + if (!nft_set_is_anonymous(set) && rbe_le && > + nft_rbtree_interval_start(rbe_le) && nft_rbtree_interval_start(new)) > + return -ENOTEMPTY; > + > + /* - new end element with existing closest, less or equal key value > + * being a end element: partial overlap, reported as -ENOTEMPTY. > + */ > + if (rbe_le && > + nft_rbtree_interval_end(rbe_le) && nft_rbtree_interval_end(new)) > + return -ENOTEMPTY; > + > + /* - new end element with existing closest, greater or equal key value > + * being an end element: partial overlap, reported as -ENOTEMPTY > + */ > + if (rbe_ge && > + nft_rbtree_interval_end(rbe_ge) && nft_rbtree_interval_end(new)) > return -ENOTEMPTY; > > + /* Accepted element: pick insertion point depending on key value */ > + parent = NULL; > + p = &priv->root.rb_node; > + while (*p != NULL) { > + parent = *p; > + rbe = rb_entry(parent, struct nft_rbtree_elem, node); > + d = nft_rbtree_cmp(set, rbe, new); > + > + if (d < 0) > + p = &parent->rb_left; > + else if (d > 0) > + p = &parent->rb_right; > + else if (nft_rbtree_interval_end(rbe)) > + p = &parent->rb_left; > + else > + p = &parent->rb_right; > + } > + > rb_link_node_rcu(&new->node, parent, p); > rb_insert_color(&new->node, &priv->root); > return 0; -- Stefano