On Thu, Dec 21, 2023 at 6:14 AM Hou Tao <houtao@xxxxxxxxxxxxxxx> wrote:
>
> From: Hou Tao <houtao1@xxxxxxxxxx>
> [...]
>
> The following are the test results, conducted on an 8-CPU VM with 16GB of memory:
>
> $ for i in 1 4 8; do ./bench -w3 -d10 bpf_ma -p${i} -a; done |grep Summary
> Summary: per-prod alloc 11.29 ± 0.14M/s free 33.76 ± 0.33M/s, total memory usage 0.01 ± 0.00MiB
> Summary: per-prod alloc 7.49 ± 0.12M/s free 34.42 ± 0.56M/s, total memory usage 0.03 ± 0.00MiB
> Summary: per-prod alloc 6.66 ± 0.08M/s free 34.27 ± 0.41M/s, total memory usage 0.06 ± 0.00MiB
>
> $ for i in 1 4 8; do ./bench -w3 -d10 bpf_ma -p${i} -a --percpu; done |grep Summary
> Summary: per-prod alloc 14.64 ± 0.60M/s free 36.94 ± 0.35M/s, total memory usage 188.02 ± 7.43MiB
> Summary: per-prod alloc 12.39 ± 1.32M/s free 36.40 ± 0.38M/s, total memory usage 808.90 ± 25.56MiB
> Summary: per-prod alloc 10.80 ± 0.17M/s free 35.45 ± 0.25M/s, total memory usage 2330.24 ± 480.56MiB

This is likely not related to this patch, but do we expect this much
memory usage? I guess the 2.3GiB number comes from the bigger
ALLOC_OBJ_SIZE and ALLOC_BATCH_CNT? I am getting 0 MiB with this test
on my VM.

>
> Signed-off-by: Hou Tao <houtao1@xxxxxxxxxx>
> ---
>  tools/testing/selftests/bpf/Makefile          |   2 +
>  tools/testing/selftests/bpf/bench.c           |   4 +
>  tools/testing/selftests/bpf/bench.h           |   7 +
>  .../selftests/bpf/benchs/bench_bpf_ma.c       | 273 ++++++++++++++++++
>  .../selftests/bpf/progs/bench_bpf_ma.c        | 222 ++++++++++++++

Maybe also add a run_bench_bpf_ma.sh script under selftests/bpf/benchs/,
like the other benchmarks have?

>  5 files changed, 508 insertions(+)
>  create mode 100644 tools/testing/selftests/bpf/benchs/bench_bpf_ma.c
>  create mode 100644 tools/testing/selftests/bpf/progs/bench_bpf_ma.c
>
[...]

> diff --git a/tools/testing/selftests/bpf/bench.h b/tools/testing/selftests/bpf/bench.h
> index a6fcf111221f..206cf3de5df2 100644
> --- a/tools/testing/selftests/bpf/bench.h
> +++ b/tools/testing/selftests/bpf/bench.h
> @@ -53,6 +53,13 @@ struct bench_res {
>                 unsigned long gp_ct;
>                 unsigned int stime;
>         } rcu;
> +       struct {
> +               unsigned long alloc;
> +               unsigned long free;

nit: maybe add a _ct or _cnt suffix to these two, to match "rcu" above
or the skel?

> +               unsigned long alloc_ns;
> +               unsigned long free_ns;
> +               unsigned long mem_bytes;
> +       } ma;
>         };
>  };
>
[...]

> +
> +static void bpf_ma_validate(void)
> +{
> +}

An empty validate() function seems unnecessary.

[...]

> +
> +static void bpf_ma_report_final(struct bench_res res[], int res_cnt)
> +{
> +        double mem_mean = 0.0, mem_stddev = 0.0;
> +        double alloc_mean = 0.0, alloc_stddev = 0.0;
> +        double free_mean = 0.0, free_stddev = 0.0;
> +        double alloc_ns = 0.0, free_ns = 0.0;
> +        int i;
> +
> +        for (i = 0; i < res_cnt; i++) {
> +                alloc_ns += res[i].ma.alloc_ns;
> +                free_ns += res[i].ma.free_ns;
> +        }
> +        for (i = 0; i < res_cnt; i++) {
> +                if (alloc_ns)
> +                        alloc_mean += res[i].ma.alloc * 1000.0 / alloc_ns;
> +                if (free_ns)
> +                        free_mean += res[i].ma.free * 1000.0 / free_ns;
> +                mem_mean += res[i].ma.mem_bytes / 1048576.0 / (0.0 + res_cnt);
> +        }
> +        if (res_cnt > 1) {
> +                for (i = 0; i < res_cnt; i++) {
> +                        double sample;
> +
> +                        sample = res[i].ma.alloc_ns ? res[i].ma.alloc * 1000.0 /
> +                                 res[i].ma.alloc_ns : 0.0;
> +                        alloc_stddev += (alloc_mean - sample) * (alloc_mean - sample) /
> +                                        (res_cnt - 1.0);
> +
> +                        sample = res[i].ma.free_ns ? res[i].ma.free * 1000.0 /
> +                                 res[i].ma.free_ns : 0.0;
> +                        free_stddev += (free_mean - sample) * (free_mean - sample) /
> +                                       (res_cnt - 1.0);
> +
> +                        sample = res[i].ma.mem_bytes / 1048576.0;
> +                        mem_stddev += (mem_mean - sample) * (mem_mean - sample) /
> +                                      (res_cnt - 1.0);
> +                }

nit: we can probably refactor the common code for the mean/stddev
calculation into a couple of helpers.
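Completely untested, and rate_mps()/variance_term() are names I just
made up, but something along these lines would remove most of the
repetition:

static double rate_mps(double cnt, double ns)
{
        /* cnt events in ns nanoseconds -> millions of events per second */
        return ns > 0 ? cnt * 1000.0 / ns : 0.0;
}

static double variance_term(double mean, double sample, int n)
{
        return (mean - sample) * (mean - sample) / (n - 1.0);
}

Each stddev accumulation then becomes a one-liner, e.g.:

        alloc_stddev += variance_term(alloc_mean,
                                      rate_mps(res[i].ma.alloc, res[i].ma.alloc_ns),
                                      res_cnt);

and the mean loop could reuse rate_mps() as well, since it already
handles the ns == 0 case.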
> +                alloc_stddev = sqrt(alloc_stddev);
> +                free_stddev = sqrt(free_stddev);
> +                mem_stddev = sqrt(mem_stddev);
> +        }
> +
> +        printf("Summary: per-prod alloc %7.2lf \u00B1 %3.2lfM/s free %7.2lf \u00B1 %3.2lfM/s, "
> +               "total memory usage %7.2lf \u00B1 %3.2lfMiB\n",
> +               alloc_mean, alloc_stddev, free_mean, free_stddev,
> +               mem_mean, mem_stddev);
> +}
> +
> +const struct bench bench_bpf_mem_alloc = {
> +        .name = "bpf_ma",
> +        .argp = &bench_bpf_mem_alloc_argp,
> +        .validate = bpf_ma_validate,
> +        .setup = bpf_ma_setup,
> +        .producer_thread = bpf_ma_producer,
> +        .measure = bpf_ma_measure,
> +        .report_progress = bpf_ma_report_progress,
> +        .report_final = bpf_ma_report_final,
> +};
> diff --git a/tools/testing/selftests/bpf/progs/bench_bpf_ma.c b/tools/testing/selftests/bpf/progs/bench_bpf_ma.c

[...]

> +
> +/* Return the number of allocated objects */
> +static __always_inline unsigned int batch_alloc(struct bpf_map *map)
> +{
> +        struct bin_data *old, *new;
> +        struct map_value *value;
> +        unsigned int i, key;
> +
> +        for (i = 0; i < ALLOC_BATCH_CNT; i++) {
> +                key = i;
> +                value = bpf_map_lookup_elem(map, &key);
> +                if (!value)
> +                        return i;
> +
> +                new = bpf_obj_new(typeof(*new));
> +                if (!new)
> +                        return i;
> +
> +                old = bpf_kptr_xchg(&value->data, new);
> +                if (old)
> +                        bpf_obj_drop(old);
> +        }
> +
> +        return ALLOC_BATCH_CNT;
> +}
> +
> +/* Return the number of freed objects */
> +static __always_inline unsigned int batch_free(struct bpf_map *map)
> +{
> +        struct map_value *value;
> +        unsigned int i, key;
> +        void *old;
> +
> +        for (i = 0; i < ALLOC_BATCH_CNT; i++) {
> +                key = i;
> +                value = bpf_map_lookup_elem(map, &key);
> +                if (!value)
> +                        return i;
> +
> +                old = bpf_kptr_xchg(&value->data, NULL);
> +                if (!old)
> +                        return i;
> +                bpf_obj_drop(old);
> +        }
> +
> +        return ALLOC_BATCH_CNT;
> +}
> +
> +/* Return the number of allocated objects */
> +static __always_inline unsigned int batch_percpu_alloc(struct bpf_map *map)
> +{
> +        struct percpu_bin_data *old, *new;
> +        struct percpu_map_value *value;
> +        unsigned int i, key;
> +
> +        for (i = 0; i < ALLOC_BATCH_CNT; i++) {
> +                key = i;
> +                value = bpf_map_lookup_elem(map, &key);
> +                if (!value)
> +                        return i;
> +
> +                new = bpf_percpu_obj_new(typeof(*new));
> +                if (!new)
> +                        return i;
> +
> +                old = bpf_kptr_xchg(&value->data, new);
> +                if (old)
> +                        bpf_percpu_obj_drop(old);
> +        }
> +
> +        return ALLOC_BATCH_CNT;
> +}
> +
> +/* Return the number of freed objects */
> +static __always_inline unsigned int batch_percpu_free(struct bpf_map *map)
> +{
> +        struct percpu_map_value *value;
> +        unsigned int i, key;
> +        void *old;
> +
> +        for (i = 0; i < ALLOC_BATCH_CNT; i++) {
> +                key = i;
> +                value = bpf_map_lookup_elem(map, &key);
> +                if (!value)
> +                        return i;
> +
> +                old = bpf_kptr_xchg(&value->data, NULL);
> +                if (!old)
> +                        return i;
> +                bpf_percpu_obj_drop(old);
> +        }
> +
> +        return ALLOC_BATCH_CNT;
> +}

nit: these four functions share a lot of duplicated code. We can
probably refactor them a bit.
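Since bpf_obj_new()/bpf_percpu_obj_new() need the object type at
compile time, a generator macro is probably the easiest way to share
the body. A completely untested sketch (the macro name is made up):

#define DEFINE_BATCH_ALLOC(name, data_t, value_t, obj_new, obj_drop)    \
static __always_inline unsigned int name(struct bpf_map *map)           \
{                                                                       \
        struct data_t *old, *new;                                       \
        struct value_t *value;                                          \
        unsigned int i, key;                                            \
                                                                        \
        for (i = 0; i < ALLOC_BATCH_CNT; i++) {                         \
                key = i;                                                \
                value = bpf_map_lookup_elem(map, &key);                 \
                if (!value)                                             \
                        return i;                                       \
                new = obj_new(typeof(*new));                            \
                if (!new)                                               \
                        return i;                                       \
                old = bpf_kptr_xchg(&value->data, new);                 \
                if (old)                                                \
                        obj_drop(old);                                  \
        }                                                               \
        return ALLOC_BATCH_CNT;                                         \
}

/* Return the number of allocated objects */
DEFINE_BATCH_ALLOC(batch_alloc, bin_data, map_value, bpf_obj_new, bpf_obj_drop)
DEFINE_BATCH_ALLOC(batch_percpu_alloc, percpu_bin_data, percpu_map_value,
                   bpf_percpu_obj_new, bpf_percpu_obj_drop)

A similar DEFINE_BATCH_FREE() would fold the two free variants
together.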
> +
> +SEC("?fentry/" SYS_PREFIX "sys_getpgid")
> +int bench_batch_alloc_free(void *ctx)
> +{
> +        u64 start, delta;
> +        unsigned int cnt;
> +        void *map;

s/void */struct bpf_map */?

> +        int key;
> +
> +        key = bpf_get_smp_processor_id();
> +        map = bpf_map_lookup_elem((void *)&outer_array, &key);
> +        if (!map)
> +                return 0;
> +
> +        start = bpf_ktime_get_boot_ns();
> +        cnt = batch_alloc(map);
> +        delta = bpf_ktime_get_boot_ns() - start;
> +        __sync_fetch_and_add(&alloc_cnt, cnt);
> +        __sync_fetch_and_add(&alloc_ns, delta);
> +
> +        start = bpf_ktime_get_boot_ns();
> +        cnt = batch_free(map);
> +        delta = bpf_ktime_get_boot_ns() - start;
> +        __sync_fetch_and_add(&free_cnt, cnt);
> +        __sync_fetch_and_add(&free_ns, delta);
> +
> +        return 0;
> +}
> +
> +SEC("?fentry/" SYS_PREFIX "sys_getpgid")
> +int bench_batch_percpu_alloc_free(void *ctx)
> +{
> +        u64 start, delta;
> +        unsigned int cnt;
> +        void *map;

ditto

> +        int key;
> +
> +        key = bpf_get_smp_processor_id();
> +        map = bpf_map_lookup_elem((void *)&percpu_outer_array, &key);
> +        if (!map)
> +                return 0;
> +
> +        start = bpf_ktime_get_boot_ns();
> +        cnt = batch_percpu_alloc(map);
> +        delta = bpf_ktime_get_boot_ns() - start;
> +        __sync_fetch_and_add(&alloc_cnt, cnt);
> +        __sync_fetch_and_add(&alloc_ns, delta);
> +
> +        start = bpf_ktime_get_boot_ns();
> +        cnt = batch_percpu_free(map);
> +        delta = bpf_ktime_get_boot_ns() - start;
> +        __sync_fetch_and_add(&free_cnt, cnt);
> +        __sync_fetch_and_add(&free_ns, delta);
> +
> +        return 0;
> +}

nit: ditto on the duplicated code.
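Same idea here: the two programs differ only in the outer map and the
batch helpers, so the timing boilerplate could live in a shared macro.
Untested sketch (BENCH_ALLOC_FREE is an invented name; it assumes the
existing alloc_cnt/alloc_ns/free_cnt/free_ns globals and uses struct
bpf_map * as suggested above):

#define BENCH_ALLOC_FREE(outer, alloc_fn, free_fn)                      \
        do {                                                            \
                u64 start, delta;                                       \
                unsigned int cnt;                                       \
                struct bpf_map *map;                                    \
                int key;                                                \
                                                                        \
                key = bpf_get_smp_processor_id();                       \
                map = bpf_map_lookup_elem((void *)&outer, &key);        \
                if (!map)                                               \
                        break;                                          \
                                                                        \
                /* time one batch of allocations */                     \
                start = bpf_ktime_get_boot_ns();                        \
                cnt = alloc_fn(map);                                    \
                delta = bpf_ktime_get_boot_ns() - start;                \
                __sync_fetch_and_add(&alloc_cnt, cnt);                  \
                __sync_fetch_and_add(&alloc_ns, delta);                 \
                                                                        \
                /* then one batch of frees */                           \
                start = bpf_ktime_get_boot_ns();                        \
                cnt = free_fn(map);                                     \
                delta = bpf_ktime_get_boot_ns() - start;                \
                __sync_fetch_and_add(&free_cnt, cnt);                   \
                __sync_fetch_and_add(&free_ns, delta);                  \
        } while (0)

SEC("?fentry/" SYS_PREFIX "sys_getpgid")
int bench_batch_alloc_free(void *ctx)
{
        BENCH_ALLOC_FREE(outer_array, batch_alloc, batch_free);
        return 0;
}

with bench_batch_percpu_alloc_free() reduced the same way.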