Re: [PATCH bpf-next] bpf: optimize hashmap lookups when key_size is divisible by 4

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Sat, Apr 01, 2023 at 10:10:50AM +0000, Anton Protopopov wrote:
> The BPF hashmap uses the jhash() hash function. There is an optimized version
> of this hash function which may be used if hash size is a multiple of 4. Apply
> this optimization to the hashmap in a similar way as it is done in the bloom
> filter map.
> 
> On practice the optimization is only noticeable for smaller key sizes, which,
> however, is sufficient for many applications. An example is listed in the
> following table of measurements (a hashmap of 65536 elements was used):
> 
>     --------------------------------------------------------------------
>     | key_size | fullness | lookups /sec | lookups (opt) /sec |   gain |
>     --------------------------------------------------------------------
>     |        4 |      25% |      42.990M |            46.000M |   7.0% |
>     |        4 |      50% |      37.910M |            39.094M |   3.1% |
>     |        4 |      75% |      34.486M |            36.124M |   4.7% |
>     |        4 |     100% |      31.760M |            32.719M |   3.0% |
>     --------------------------------------------------------------------
>     |        8 |      25% |      43.855M |            49.626M |  13.2% |
>     |        8 |      50% |      38.328M |            42.152M |  10.0% |
>     |        8 |      75% |      34.483M |            38.088M |  10.5% |
>     |        8 |     100% |      31.306M |            34.686M |  10.8% |
>     --------------------------------------------------------------------
>     |       12 |      25% |      38.398M |            43.770M |  14.0% |
>     |       12 |      50% |      33.336M |            37.712M |  13.1% |
>     |       12 |      75% |      29.917M |            34.440M |  15.1% |
>     |       12 |     100% |      27.322M |            30.480M |  11.6% |
>     --------------------------------------------------------------------
>     |       16 |      25% |      41.491M |            41.921M |   1.0% |
>     |       16 |      50% |      36.206M |            36.474M |   0.7% |
>     |       16 |      75% |      32.529M |            33.027M |   1.5% |
>     |       16 |     100% |      29.581M |            30.325M |   2.5% |
>     --------------------------------------------------------------------
>     |       20 |      25% |      34.240M |            36.787M |   7.4% |
>     |       20 |      50% |      30.328M |            32.663M |   7.7% |
>     |       20 |      75% |      27.536M |            29.354M |   6.6% |
>     |       20 |     100% |      24.847M |            26.505M |   6.7% |
>     --------------------------------------------------------------------
>     |       24 |      25% |      36.329M |            40.608M |  11.8% |
>     |       24 |      50% |      31.444M |            35.059M |  11.5% |
>     |       24 |      75% |      28.426M |            31.452M |  10.6% |
>     |       24 |     100% |      26.278M |            28.741M |   9.4% |
>     --------------------------------------------------------------------
>     |       28 |      25% |      31.540M |            31.944M |   1.3% |
>     |       28 |      50% |      27.739M |            28.063M |   1.2% |
>     |       28 |      75% |      24.993M |            25.814M |   3.3% |
>     |       28 |     100% |      23.513M |            23.500M |  -0.1% |
>     --------------------------------------------------------------------
>     |       32 |      25% |      32.116M |            33.953M |   5.7% |
>     |       32 |      50% |      28.879M |            29.859M |   3.4% |
>     |       32 |      75% |      26.227M |            26.948M |   2.7% |
>     |       32 |     100% |      23.829M |            24.613M |   3.3% |
>     --------------------------------------------------------------------
>     |       64 |      25% |      22.535M |            22.554M |   0.1% |
>     |       64 |      50% |      20.471M |            20.675M |   1.0% |
>     |       64 |      75% |      19.077M |            19.146M |   0.4% |
>     |       64 |     100% |      17.710M |            18.131M |   2.4% |
>     --------------------------------------------------------------------
> 
> The following script was used to gather the results (SMT & frequency off):
> 
>     cd tools/testing/selftests/bpf
>     for key_size in 4 8 12 16 20 24 28 32 64; do
>             for nr_entries in `seq 16384 16384 65536`; do
>                     fullness=$(printf '%3s' $((nr_entries*100/65536)))
>                     echo -n "key_size=$key_size: $fullness% full: "
>                     sudo ./bench -d2 -a bpf-hashmap-lookup --key_size=$key_size --nr_entries=$nr_entries --max_entries=65536 --nr_loops=2000000 --map_flags=0x40 | grep cpu
>             done
>             echo
>     done
> 
> Signed-off-by: Anton Protopopov <aspsk@xxxxxxxxxxxxx>
> ---
>  kernel/bpf/hashtab.c | 29 ++++++++++++++++++-----------
>  1 file changed, 18 insertions(+), 11 deletions(-)
> 
> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> index 96b645bba3a4..eb804815f7c3 100644
> --- a/kernel/bpf/hashtab.c
> +++ b/kernel/bpf/hashtab.c
> @@ -103,6 +103,7 @@ struct bpf_htab {
>  	u32 n_buckets;	/* number of hash buckets */
>  	u32 elem_size;	/* size of each element in bytes */
>  	u32 hashrnd;
> +	u32 key_size_u32;
>  	struct lock_class_key lockdep_key;
>  	int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT];
>  };
> @@ -510,6 +511,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
>  	else
>  		htab->elem_size += round_up(htab->map.value_size, 8);
>  
> +	/* optimize hash computations if key_size is divisible by 4 */
> +	if ((attr->key_size & (sizeof(u32) - 1)) == 0)
> +		htab->key_size_u32 = attr->key_size / sizeof(u32);

Please use & 3 and / 4.
sizeof(u32) is not going to change.

> +
>  	err = -E2BIG;
>  	/* prevent zero size kmalloc and check for u32 overflow */
>  	if (htab->n_buckets == 0 ||
> @@ -605,9 +610,11 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
>  	return ERR_PTR(err);
>  }
>  
> -static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
> +static inline u32 htab_map_hash(const struct bpf_htab *htab, const void *key, u32 key_len)
>  {
> -	return jhash(key, key_len, hashrnd);
> +	if (likely(htab->key_size_u32))
> +		return jhash2(key, htab->key_size_u32, htab->hashrnd);
> +	return jhash(key, key_len, htab->hashrnd);

Could you measure the speed when &3 and /4 is done in the hot path ?
I would expect the performance to be the same or faster,
since extra load is gone.



[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux