On Mon, 12 Feb 2024 11:01:53 +0100 Florian Westphal <fw@xxxxxxxxx> wrote:

> Insertions into the set are slow when we try to add many elements.
> For 800k elements I get:
>
> time nft -f pipapo_800k
> real    19m34.849s
> user    0m2.390s
> sys     19m12.828s

Whoops.

> perf stats:
>  --95.39%--nft_pipapo_insert
>     |--76.60%--pipapo_insert
>     |           --76.37%--pipapo_resize
>     |                     |--72.87%--memcpy_orig
>     |                     |--1.88%--__free_pages_ok
>     |                     |          --0.89%--free_tail_page_prepare
>     |                      --1.38%--kvmalloc_node
>   ..
>      --18.56%--pipapo_get.isra.0
>                |--13.91%--__bitmap_and
>                |--3.01%--pipapo_refill
>                |--0.81%--__kmalloc
>                |           --0.74%--__kmalloc_large_node
>                |                     --0.66%--__alloc_pages
>   ..
>                 --0.52%--memset_orig
>
> So lots of time is spent in copying existing elements to make space for
> the next one.
>
> Instead of allocating to the exact size of the new rule count, allocate
> extra slack to reduce alloc/copy/free overhead.
>
> After:
> time nft -f pipapo_800k
> real    1m54.110s
> user    0m2.515s
> sys     1m51.377s

That's quite an improvement, thanks for fixing this!

>  --80.46%--nft_pipapo_insert
>     |--73.45%--pipapo_get.isra.0
>     |          |--57.63%--__bitmap_and
>     |          |          |--8.52%--pipapo_refill
>     |          |--3.45%--__kmalloc
>     |          |           --3.05%--__kmalloc_large_node
>     |          |                     --2.58%--__alloc_pages
>     |           --2.59%--memset_orig
>     |--6.51%--pipapo_insert
>                --5.96%--pipapo_resize
>                          |--3.63%--memcpy_orig
>                           --2.13%--kvmalloc_node
>
> The new @rules_alloc fills a hole, so struct size doesn't go up.
> Also make it so rule removal doesn't shrink unless the free/extra space
> exceeds two pages. This should be safe as well:
>
> When a rule gets removed, the attempt to lower the allocated size is
> already allowed to fail.
>
> Exception: do exact allocations as long as set is very small (less
> than one page needed).
>
> Signed-off-by: Florian Westphal <fw@xxxxxxxxx>
> ---
>  net/netfilter/nft_set_pipapo.c | 80 +++++++++++++++++++++++++++-------
>  net/netfilter/nft_set_pipapo.h |  2 +
>  2 files changed, 67 insertions(+), 15 deletions(-)
>
> diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
> index a0ddf24a8052..25cdf64a3139 100644
> --- a/net/netfilter/nft_set_pipapo.c
> +++ b/net/netfilter/nft_set_pipapo.c
> @@ -622,6 +622,62 @@ nft_pipapo_get(const struct net *net, const struct nft_set *set,
>  	return &e->priv;
>  }
>
> +static int pipapo_realloc_mt(struct nft_pipapo_field *f, unsigned int old_rules, unsigned int rules)

Nit:

/**
 * pipapo_realloc_mt() - Reallocate mapping table if needed upon resize
 * @f:		Field containing mapping table
 * @old_rules:	Amount of existing mapped rules
 * @rules:	Amount of new rules to map
 *
 * Return: 0 on success, negative error code on failure.
 */
static int pipapo_realloc_mt(struct nft_pipapo_field *f, unsigned int old_rules, unsigned int rules)

> +{
> +	union nft_pipapo_map_bucket *new_mt = NULL, *old_mt = f->mt;
> +	unsigned int extra = 4096 / sizeof(*new_mt);

Shouldn't we actually use PAGE_SIZE? I think the one-page limit is
somewhat arbitrary but makes sense, so it should be 64k on e.g.
CONFIG_PPC_64K_PAGES=y.

> +	unsigned int rules_alloc = rules;
> +
> +	might_sleep();
> +
> +	BUILD_BUG_ON(extra < 32);

I'm not entirely sure why this would be a problem. I mean, 'extra' at
this point is the number of extra rules, not the amount of extra
bytes, right?
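Just to check my own reading of that arithmetic, here's a quick
userspace sketch (nothing below is the kernel code; PAGE_SZ and
BUCKET_SZ are made-up names, and the 8-byte bucket size is only my
assumption for 64-bit builds):

  #include <stdio.h>

  #define PAGE_SZ   4096u /* stand-in for PAGE_SIZE; 65536 with CONFIG_PPC_64K_PAGES=y */
  #define BUCKET_SZ 8u    /* assumed sizeof(union nft_pipapo_map_bucket) */

  int main(void)
  {
          /* 'extra' counts rules, not bytes: one page worth of
           * mapping table entries per reallocation
           */
          unsigned int extra = PAGE_SZ / BUCKET_SZ;

          printf("extra slack: %u rules\n", extra);

          /* extra < 32 would need a bucket bigger than PAGE_SZ / 32,
           * i.e. more than 128 bytes with 4k pages
           */
          printf("extra < 32: %s\n",
                 extra < 32 ? "would trigger" : "would not trigger");

          return 0;
  }

So, unless I'm missing something, the assertion could only fire for
implausibly large buckets, which is what makes me wonder what it's
meant to guard against.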
> +
> +	if (unlikely(rules == 0))
> +		goto out_free;
> +
> +	/* growing and enough space left, no action needed */
> +	if (rules > old_rules && f->rules_alloc > rules)
> +		return 0;
> +
> +	/* downsize and extra slack has not grown too large */
> +	if (rules < old_rules) {
> +		unsigned int remove = f->rules_alloc - rules;
> +
> +		if (remove < (2u * extra))
> +			return 0;
> +	}
> +
> +	/* small sets get precise count, else add extra slack
> +	 * to avoid frequent reallocations. Extra slack is
> +	 * currently one 4k page worth of rules.
> +	 *
> +	 * Use no slack if the set only has a small number
> +	 * of rules.

This isn't always true: if we slightly decrease the size of a small
mapping table, we might leave some slack, because we might hit the
(remove < (2u * extra)) condition above. Is that intended? It doesn't
look problematic to me, by the way.

> +	 */
> +	if (rules > extra &&
> +	    check_add_overflow(rules, extra, &rules_alloc))
> +		return -EOVERFLOW;
> +
> +	new_mt = kvmalloc_array(rules_alloc, sizeof(*new_mt), GFP_KERNEL);
> +	if (!new_mt)
> +		return -ENOMEM;
> +
> +	if (old_mt)
> +		memcpy(new_mt, old_mt, min(old_rules, rules) * sizeof(*new_mt));
> +
> +	if (rules > old_rules)

Nit: curly braces around multi-line block (for consistency).

> +		memset(new_mt + old_rules, 0,
> +		       (rules - old_rules) * sizeof(*new_mt));
> +
> +out_free:
> +	f->rules_alloc = rules_alloc;
> +	f->mt = new_mt;
> +
> +	kvfree(old_mt);
> +
> +	return 0;
> +}
> +
>  /**
>   * pipapo_resize() - Resize lookup or mapping table, or both
>   * @f: Field containing lookup and mapping tables
> @@ -637,9 +693,8 @@ nft_pipapo_get(const struct net *net, const struct nft_set *set,
>  static int pipapo_resize(struct nft_pipapo_field *f, unsigned int old_rules, unsigned int rules)
>  {
>  	long *new_lt = NULL, *new_p, *old_lt = f->lt, *old_p;
> -	union nft_pipapo_map_bucket *new_mt, *old_mt = f->mt;
>  	unsigned int new_bucket_size, copy;
> -	int group, bucket;
> +	int group, bucket, err;
>
>  	if (rules >= NFT_PIPAPO_RULE0_MAX)
>  		return -ENOSPC;
> @@ -682,16 +737,10 @@ static int pipapo_resize(struct nft_pipapo_field *f, unsigned int old_rules, uns
>  	}
>
>  mt:
> -	new_mt = kvmalloc(rules * sizeof(*new_mt), GFP_KERNEL);
> -	if (!new_mt) {
> +	err = pipapo_realloc_mt(f, old_rules, rules);
> +	if (err) {
>  		kvfree(new_lt);
> -		return -ENOMEM;
> -	}
> -
> -	memcpy(new_mt, f->mt, min(old_rules, rules) * sizeof(*new_mt));
> -	if (rules > old_rules) {
> -		memset(new_mt + old_rules, 0,
> -		       (rules - old_rules) * sizeof(*new_mt));
> +		return err;
>  	}
>
>  	if (new_lt) {
> @@ -700,9 +749,6 @@ static int pipapo_resize(struct nft_pipapo_field *f, unsigned int old_rules, uns
>  		kvfree(old_lt);
>  	}
>
> -	f->mt = new_mt;
> -	kvfree(old_mt);
> -
>  	return 0;
>  }
>
> @@ -1382,13 +1428,16 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old)
>  		       src->groups * NFT_PIPAPO_BUCKETS(src->bb));
>
>  	if (src->rules > 0) {
> -		dst->mt = kvmalloc_array(src->rules, sizeof(*src->mt), GFP_KERNEL);
> +		dst->mt = kvmalloc_array(src->rules_alloc, sizeof(*src->mt), GFP_KERNEL);
>  		if (!dst->mt)
>  			goto out_mt;
>
>  		memcpy(dst->mt, src->mt, src->rules * sizeof(*src->mt));
> +		dst->rules_alloc = src->rules_alloc;
> +		dst->rules = src->rules;

These two, and setting rules_alloc below, shouldn't be needed, because
we already copy everything in the source field before the lookup
table, above.
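In case it helps, this is the idea I have in mind, as a standalone
sketch (the struct below is just a stand-in for nft_pipapo_field, not
the real thing):

  #include <stddef.h>
  #include <stdio.h>
  #include <string.h>

  struct field {
          unsigned int rules;
          unsigned int bsize;
          unsigned int rules_alloc;
          unsigned char groups;
          unsigned char bb;
          unsigned long *lt;      /* prefix copy stops here */
          void *mt;
  };

  int main(void)
  {
          struct field src = { .rules = 3, .bsize = 2, .rules_alloc = 512 };
          struct field dst = { 0 };

          /* copying up to the lookup table pointer already carries
           * rules, bsize and rules_alloc over to the copy
           */
          memcpy(&dst, &src, offsetof(struct field, lt));

          printf("dst.rules=%u dst.rules_alloc=%u\n",
                 dst.rules, dst.rules_alloc);

          return 0;
  }

That is, as long as the copy at the top of the clone loop covers
everything up to @lt, the two assignments here and the rules_alloc
clearing below look redundant to me.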
>  	} else {
>  		dst->mt = NULL;
> +		dst->rules_alloc = 0;
>  	}
>
>  	src++;
> @@ -2203,6 +2252,7 @@ static int nft_pipapo_init(const struct nft_set *set,
>
>  		f->bsize = 0;
>  		f->rules = 0;
> +		f->rules_alloc = 0;
>  		f->lt = NULL;
>  		f->mt = NULL;
>  	}
> diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h
> index 8d9486ae0c01..bbcac2b38167 100644
> --- a/net/netfilter/nft_set_pipapo.h
> +++ b/net/netfilter/nft_set_pipapo.h
> @@ -106,6 +106,7 @@ union nft_pipapo_map_bucket {
>   * struct nft_pipapo_field - Lookup, mapping tables and related data for a field
>   * @rules:	Number of inserted rules
>   * @bsize:	Size of each bucket in lookup table, in longs
> + * @rules_alloc	Number of allocated rules, always >= rules
>   * @groups:	Amount of bit groups
>   * @bb:		Number of bits grouped together in lookup table buckets
>   * @lt:		Lookup table: 'groups' rows of buckets
> @@ -114,6 +115,7 @@ union nft_pipapo_map_bucket {
>  struct nft_pipapo_field {
>  	unsigned int rules;
>  	unsigned int bsize;
> +	unsigned int rules_alloc;
>  	u8 groups;
>  	u8 bb;
>  	unsigned long *lt;

-- 
Stefano