On Sat, Apr 01, 2023 at 09:20:03AM -0700, Alexei Starovoitov wrote:
> On Sat, Apr 01, 2023 at 10:10:50AM +0000, Anton Protopopov wrote:
> > The BPF hashmap uses the jhash() hash function. There is an optimized version
> > of this hash function which may be used if the key size is a multiple of 4.
> > Apply this optimization to the hashmap in the same way as it is done in the
> > bloom filter map.
> >
> > In practice the optimization is only noticeable for smaller key sizes, which,
> > however, are sufficient for many applications. An example is listed in the
> > following table of measurements (a hashmap of 65536 elements was used):
> >
> > ---------------------------------------------------------------------
> > | key_size | fullness | lookups /sec | lookups (opt) /sec |    gain |
> > ---------------------------------------------------------------------
> > |        4 |      25% |      42.990M |            46.000M |    7.0% |
> > |        4 |      50% |      37.910M |            39.094M |    3.1% |
> > |        4 |      75% |      34.486M |            36.124M |    4.7% |
> > |        4 |     100% |      31.760M |            32.719M |    3.0% |
> > ---------------------------------------------------------------------
> > |        8 |      25% |      43.855M |            49.626M |   13.2% |
> > |        8 |      50% |      38.328M |            42.152M |   10.0% |
> > |        8 |      75% |      34.483M |            38.088M |   10.5% |
> > |        8 |     100% |      31.306M |            34.686M |   10.8% |
> > ---------------------------------------------------------------------
> > |       12 |      25% |      38.398M |            43.770M |   14.0% |
> > |       12 |      50% |      33.336M |            37.712M |   13.1% |
> > |       12 |      75% |      29.917M |            34.440M |   15.1% |
> > |       12 |     100% |      27.322M |            30.480M |   11.6% |
> > ---------------------------------------------------------------------
> > |       16 |      25% |      41.491M |            41.921M |    1.0% |
> > |       16 |      50% |      36.206M |            36.474M |    0.7% |
> > |       16 |      75% |      32.529M |            33.027M |    1.5% |
> > |       16 |     100% |      29.581M |            30.325M |    2.5% |
> > ---------------------------------------------------------------------
> > |       20 |      25% |      34.240M |            36.787M |    7.4% |
> > |       20 |      50% |      30.328M |            32.663M |    7.7% |
> > |       20 |      75% |      27.536M |            29.354M |    6.6% |
> > |       20 |     100% |      24.847M |            26.505M |    6.7% |
> > ---------------------------------------------------------------------
> > |       24 |      25% |      36.329M |            40.608M |   11.8% |
> > |       24 |      50% |      31.444M |            35.059M |   11.5% |
> > |       24 |      75% |      28.426M |            31.452M |   10.6% |
> > |       24 |     100% |      26.278M |            28.741M |    9.4% |
> > ---------------------------------------------------------------------
> > |       28 |      25% |      31.540M |            31.944M |    1.3% |
> > |       28 |      50% |      27.739M |            28.063M |    1.2% |
> > |       28 |      75% |      24.993M |            25.814M |    3.3% |
> > |       28 |     100% |      23.513M |            23.500M |   -0.1% |
> > ---------------------------------------------------------------------
> > |       32 |      25% |      32.116M |            33.953M |    5.7% |
> > |       32 |      50% |      28.879M |            29.859M |    3.4% |
> > |       32 |      75% |      26.227M |            26.948M |    2.7% |
> > |       32 |     100% |      23.829M |            24.613M |    3.3% |
> > ---------------------------------------------------------------------
> > |       64 |      25% |      22.535M |            22.554M |    0.1% |
> > |       64 |      50% |      20.471M |            20.675M |    1.0% |
> > |       64 |      75% |      19.077M |            19.146M |    0.4% |
> > |       64 |     100% |      17.710M |            18.131M |    2.4% |
> > ---------------------------------------------------------------------
> >
> > The following script was used to gather the results (SMT & frequency
> > scaling off):
> >
> >     cd tools/testing/selftests/bpf
> >     for key_size in 4 8 12 16 20 24 28 32 64; do
> >         for nr_entries in `seq 16384 16384 65536`; do
> >             fullness=$(printf '%3s' $((nr_entries*100/65536)))
> >             echo -n "key_size=$key_size: $fullness% full: "
> >             sudo ./bench -d2 -a bpf-hashmap-lookup \
> >                 --key_size=$key_size --nr_entries=$nr_entries \
> >                 --max_entries=65536 --nr_loops=2000000 \
> >                 --map_flags=0x40 | grep cpu
> >         done
> >         echo
> >     done
> >
> > Signed-off-by: Anton Protopopov <aspsk@xxxxxxxxxxxxx>
> > ---
> >  kernel/bpf/hashtab.c | 29 ++++++++++++++++++-----------
> >  1 file changed, 18 insertions(+), 11 deletions(-)
> >
> > diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> > index 96b645bba3a4..eb804815f7c3 100644
> > --- a/kernel/bpf/hashtab.c
> > +++ b/kernel/bpf/hashtab.c
> > @@ -103,6 +103,7 @@ struct bpf_htab {
> >  	u32 n_buckets;	/* number of hash buckets */
> >  	u32 elem_size;	/* size of each element in bytes */
> >  	u32 hashrnd;
> > +	u32 key_size_u32;
> >  	struct lock_class_key lockdep_key;
> >  	int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT];
> >  };
> > @@ -510,6 +511,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
> >  	else
> >  		htab->elem_size += round_up(htab->map.value_size, 8);
> >
> > +	/* optimize hash computations if key_size is divisible by 4 */
> > +	if ((attr->key_size & (sizeof(u32) - 1)) == 0)
> > +		htab->key_size_u32 = attr->key_size / sizeof(u32);
>
> Please use & 3 and / 4.
> sizeof(u32) is not going to change.

Yes, sure.

> > +
> >  	err = -E2BIG;
> >  	/* prevent zero size kmalloc and check for u32 overflow */
> >  	if (htab->n_buckets == 0 ||
> > @@ -605,9 +610,11 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
> >  	return ERR_PTR(err);
> >  }
> >
> > -static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
> > +static inline u32 htab_map_hash(const struct bpf_htab *htab, const void *key, u32 key_len)
> >  {
> > -	return jhash(key, key_len, hashrnd);
> > +	if (likely(htab->key_size_u32))
> > +		return jhash2(key, htab->key_size_u32, htab->hashrnd);
> > +	return jhash(key, key_len, htab->hashrnd);
>
> Could you measure the speed when &3 and /4 is done in the hot path?
> I would expect the performance to be the same or faster,
> since the extra load is gone.

I don't see any visible difference (I've tested both the "&3 and /4" and
the "%4==0 and /4" variants). Do you still prefer doing the division in
the hot path over using htab->key_size_u32?
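
P.S. For reference, the "&3 and /4" hot-path variant I benchmarked looks
roughly like the sketch below (reconstructed here for illustration; the
exact placement of the likely() hint is my assumption, it was not part
of a posted diff):

    static inline u32 htab_map_hash(const struct bpf_htab *htab,
                                    const void *key, u32 key_len)
    {
            /* Decide per lookup instead of loading a cached
             * htab->key_size_u32: jhash2() consumes the key as u32
             * words, so it can be used whenever key_len is a
             * multiple of 4.
             */
            if (likely((key_len & 3) == 0))
                    return jhash2(key, key_len / 4, htab->hashrnd);
            return jhash(key, key_len, htab->hashrnd);
    }

With this variant the key_size_u32 field (and its setup in
htab_map_alloc()) would go away entirely; the tradeoff is an AND plus a
shift per lookup instead of one extra load from struct bpf_htab.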