Re: [PATCH bpf-next 2/2] selftests/bpf: Add benchmark for bpf memory allocator

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Thu, Dec 21, 2023 at 6:14 AM Hou Tao <houtao@xxxxxxxxxxxxxxx> wrote:
>
> From: Hou Tao <houtao1@xxxxxxxxxx>
>
[...]
>
> The following is the test results conducted on a 8-CPU VM with 16GB memory:
>
> $ for i in 1 4 8; do ./bench -w3 -d10 bpf_ma -p${i} -a; done |grep Summary
> Summary: per-prod alloc 11.29 ± 0.14M/s free 33.76 ± 0.33M/s, total memory usage    0.01 ± 0.00MiB
> Summary: per-prod alloc  7.49 ± 0.12M/s free 34.42 ± 0.56M/s, total memory usage    0.03 ± 0.00MiB
> Summary: per-prod alloc  6.66 ± 0.08M/s free 34.27 ± 0.41M/s, total memory usage    0.06 ± 0.00MiB
>
> $ for i in 1 4 8; do ./bench -w3 -d10 bpf_ma -p${i} -a --percpu; done |grep Summary
> Summary: per-prod alloc 14.64 ± 0.60M/s free 36.94 ± 0.35M/s, total memory usage  188.02 ± 7.43MiB
> Summary: per-prod alloc 12.39 ± 1.32M/s free 36.40 ± 0.38M/s, total memory usage  808.90 ± 25.56MiB
> Summary: per-prod alloc 10.80 ± 0.17M/s free 35.45 ± 0.25M/s, total memory usage 2330.24 ± 480.56MiB

This is likely not related to this patch, but do we expect this much
memory usage?
I guess the 2.3GiB number is from bigger ALLOC_OBJ_SIZE and
ALLOC_BATCH_CNT? I am getting 0 MiB with this test on my VM.

>
> Signed-off-by: Hou Tao <houtao1@xxxxxxxxxx>
> ---
>  tools/testing/selftests/bpf/Makefile          |   2 +
>  tools/testing/selftests/bpf/bench.c           |   4 +
>  tools/testing/selftests/bpf/bench.h           |   7 +
>  .../selftests/bpf/benchs/bench_bpf_ma.c       | 273 ++++++++++++++++++
>  .../selftests/bpf/progs/bench_bpf_ma.c        | 222 ++++++++++++++

Maybe add a run_bench_bpf_ma.sh script in selftests/bpf/benchs?

>  5 files changed, 508 insertions(+)
>  create mode 100644 tools/testing/selftests/bpf/benchs/bench_bpf_ma.c
>  create mode 100644 tools/testing/selftests/bpf/progs/bench_bpf_ma.c
>
[...]
> diff --git a/tools/testing/selftests/bpf/bench.h b/tools/testing/selftests/bpf/bench.h
> index a6fcf111221f..206cf3de5df2 100644
> --- a/tools/testing/selftests/bpf/bench.h
> +++ b/tools/testing/selftests/bpf/bench.h
> @@ -53,6 +53,13 @@ struct bench_res {
>                         unsigned long gp_ct;
>                         unsigned int stime;
>                 } rcu;
> +               struct {
> +                       unsigned long alloc;
> +                       unsigned long free;

nit: maybe add a _ct or _cnt suffix to match "rcu" above or the skel?

> +                       unsigned long alloc_ns;
> +                       unsigned long free_ns;
> +                       unsigned long mem_bytes;
> +               } ma;
>         };
>  };
>
[...]
> +
> +static void bpf_ma_validate(void)
> +{
> +}

The empty validate() function seems unnecessary.

[...]

> +
> +static void bpf_ma_report_final(struct bench_res res[], int res_cnt)
> +{
> +       double mem_mean = 0.0, mem_stddev = 0.0;
> +       double alloc_mean = 0.0, alloc_stddev = 0.0;
> +       double free_mean = 0.0, free_stddev = 0.0;
> +       double alloc_ns = 0.0, free_ns = 0.0;
> +       int i;
> +
> +       for (i = 0; i < res_cnt; i++) {
> +               alloc_ns += res[i].ma.alloc_ns;
> +               free_ns += res[i].ma.free_ns;
> +       }
> +       for (i = 0; i < res_cnt; i++) {
> +               if (alloc_ns)
> +                       alloc_mean += res[i].ma.alloc * 1000.0 / alloc_ns;
> +               if (free_ns)
> +                       free_mean += res[i].ma.free * 1000.0 / free_ns;
> +               mem_mean += res[i].ma.mem_bytes / 1048576.0 / (0.0 + res_cnt);
> +       }
> +       if (res_cnt > 1) {
> +               for (i = 0; i < res_cnt; i++) {
> +                       double sample;
> +
> +                       sample = res[i].ma.alloc_ns ? res[i].ma.alloc * 1000.0 /
> +                                                     res[i].ma.alloc_ns : 0.0;
> +                       alloc_stddev += (alloc_mean - sample) * (alloc_mean - sample) /
> +                                       (res_cnt - 1.0);
> +
> +                       sample = res[i].ma.free_ns ? res[i].ma.free * 1000.0 /
> +                                                    res[i].ma.free_ns : 0.0;
> +                       free_stddev += (free_mean - sample) * (free_mean - sample) /
> +                                      (res_cnt - 1.0);
> +
> +                       sample = res[i].ma.mem_bytes / 1048576.0;
> +                       mem_stddev += (mem_mean - sample) * (mem_mean - sample) /
> +                                     (res_cnt - 1.0);
> +               }

nit: We can probably refactor the common stddev-calculation code into
a helper.

> +               alloc_stddev = sqrt(alloc_stddev);
> +               free_stddev = sqrt(free_stddev);
> +               mem_stddev = sqrt(mem_stddev);
> +       }
> +
> +       printf("Summary: per-prod alloc %7.2lf \u00B1 %3.2lfM/s free %7.2lf \u00B1 %3.2lfM/s, "
> +              "total memory usage %7.2lf \u00B1 %3.2lfMiB\n",
> +              alloc_mean, alloc_stddev, free_mean, free_stddev,
> +              mem_mean, mem_stddev);
> +}
> +
> +const struct bench bench_bpf_mem_alloc = {
> +       .name = "bpf_ma",
> +       .argp = &bench_bpf_mem_alloc_argp,
> +       .validate = bpf_ma_validate,
> +       .setup = bpf_ma_setup,
> +       .producer_thread = bpf_ma_producer,
> +       .measure = bpf_ma_measure,
> +       .report_progress = bpf_ma_report_progress,
> +       .report_final = bpf_ma_report_final,
> +};
> diff --git a/tools/testing/selftests/bpf/progs/bench_bpf_ma.c b/tools/testing/selftests/bpf/progs/bench_bpf_ma.c

[...]

> +
> +/* Return the number of allocated objects */
> +static __always_inline unsigned int batch_alloc(struct bpf_map *map)
> +{
> +       struct bin_data *old, *new;
> +       struct map_value *value;
> +       unsigned int i, key;
> +
> +       for (i = 0; i < ALLOC_BATCH_CNT; i++) {
> +               key = i;
> +               value = bpf_map_lookup_elem(map, &key);
> +               if (!value)
> +                       return i;
> +
> +               new = bpf_obj_new(typeof(*new));
> +               if (!new)
> +                       return i;
> +
> +               old = bpf_kptr_xchg(&value->data, new);
> +               if (old)
> +                       bpf_obj_drop(old);
> +       }
> +
> +       return ALLOC_BATCH_CNT;
> +}
> +
> +/* Return the number of freed objects */
> +static __always_inline unsigned int batch_free(struct bpf_map *map)
> +{
> +       struct map_value *value;
> +       unsigned int i, key;
> +       void *old;
> +
> +       for (i = 0; i < ALLOC_BATCH_CNT; i++) {
> +               key = i;
> +               value = bpf_map_lookup_elem(map, &key);
> +               if (!value)
> +                       return i;
> +
> +               old = bpf_kptr_xchg(&value->data, NULL);
> +               if (!old)
> +                       return i;
> +               bpf_obj_drop(old);
> +       }
> +
> +       return ALLOC_BATCH_CNT;
> +}
> +
> +/* Return the number of allocated objects */
> +static __always_inline unsigned int batch_percpu_alloc(struct bpf_map *map)
> +{
> +       struct percpu_bin_data *old, *new;
> +       struct percpu_map_value *value;
> +       unsigned int i, key;
> +
> +       for (i = 0; i < ALLOC_BATCH_CNT; i++) {
> +               key = i;
> +               value = bpf_map_lookup_elem(map, &key);
> +               if (!value)
> +                       return i;
> +
> +               new = bpf_percpu_obj_new(typeof(*new));
> +               if (!new)
> +                       return i;
> +
> +               old = bpf_kptr_xchg(&value->data, new);
> +               if (old)
> +                       bpf_percpu_obj_drop(old);
> +       }
> +
> +       return ALLOC_BATCH_CNT;
> +}
> +
> +/* Return the number of freed objects */
> +static __always_inline unsigned int batch_percpu_free(struct bpf_map *map)
> +{
> +       struct percpu_map_value *value;
> +       unsigned int i, key;
> +       void *old;
> +
> +       for (i = 0; i < ALLOC_BATCH_CNT; i++) {
> +               key = i;
> +               value = bpf_map_lookup_elem(map, &key);
> +               if (!value)
> +                       return i;
> +
> +               old = bpf_kptr_xchg(&value->data, NULL);
> +               if (!old)
> +                       return i;
> +               bpf_percpu_obj_drop(old);
> +       }
> +
> +       return ALLOC_BATCH_CNT;
> +}

nit: These four functions contain a lot of duplicated code. We can
probably refactor them a bit.

> +
> +SEC("?fentry/" SYS_PREFIX "sys_getpgid")
> +int bench_batch_alloc_free(void *ctx)
> +{
> +       u64 start, delta;
> +       unsigned int cnt;
> +       void *map;

s/void */struct bpf_map */?

> +       int key;
> +
> +       key = bpf_get_smp_processor_id();
> +       map = bpf_map_lookup_elem((void *)&outer_array, &key);
> +       if (!map)
> +               return 0;
> +
> +       start = bpf_ktime_get_boot_ns();
> +       cnt = batch_alloc(map);
> +       delta = bpf_ktime_get_boot_ns() - start;
> +       __sync_fetch_and_add(&alloc_cnt, cnt);
> +       __sync_fetch_and_add(&alloc_ns, delta);
> +
> +       start = bpf_ktime_get_boot_ns();
> +       cnt = batch_free(map);
> +       delta = bpf_ktime_get_boot_ns() - start;
> +       __sync_fetch_and_add(&free_cnt, cnt);
> +       __sync_fetch_and_add(&free_ns, delta);
> +
> +       return 0;
> +}
> +
> +SEC("?fentry/" SYS_PREFIX "sys_getpgid")
> +int bench_batch_percpu_alloc_free(void *ctx)
> +{
> +       u64 start, delta;
> +       unsigned int cnt;
> +       void *map;

ditto

> +       int key;
> +
> +       key = bpf_get_smp_processor_id();
> +       map = bpf_map_lookup_elem((void *)&percpu_outer_array, &key);
> +       if (!map)
> +               return 0;
> +
> +       start = bpf_ktime_get_boot_ns();
> +       cnt = batch_percpu_alloc(map);
> +       delta = bpf_ktime_get_boot_ns() - start;
> +       __sync_fetch_and_add(&alloc_cnt, cnt);
> +       __sync_fetch_and_add(&alloc_ns, delta);
> +
> +       start = bpf_ktime_get_boot_ns();
> +       cnt = batch_percpu_free(map);
> +       delta = bpf_ktime_get_boot_ns() - start;
> +       __sync_fetch_and_add(&free_cnt, cnt);
> +       __sync_fetch_and_add(&free_ns, delta);
> +
> +       return 0;
> +}

nit: ditto duplicated code.





[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux