[PATCH bpf-next 3/3] selftest/bpf/benchs: add bpf_for_each benchmark

Joanne Koong <joannekoong@xxxxxx> · Wed, 17 Nov 2021 17:04:04 -0800

Add benchmark to measure the overhead of the bpf_for_each call
for a specified number of iterations.

Testing this on qemu on my dev machine on 1 thread, the data is
as follows:

        nr_iterations: 1
bpf_for_each helper - total callbacks called:  42.949 ± 1.404M/s

        nr_iterations: 10
bpf_for_each helper - total callbacks called:  73.645 ± 2.077M/s

        nr_iterations: 100
bpf_for_each helper - total callbacks called:  73.058 ± 1.256M/s

        nr_iterations: 500
bpf_for_each helper - total callbacks called:  78.255 ± 2.845M/s

        nr_iterations: 1000
bpf_for_each helper - total callbacks called:  79.439 ± 1.805M/s

        nr_iterations: 5000
bpf_for_each helper - total callbacks called:  81.639 ± 2.053M/s

        nr_iterations: 10000
bpf_for_each helper - total callbacks called:  80.577 ± 1.824M/s

        nr_iterations: 50000
bpf_for_each helper - total callbacks called:  76.773 ± 1.578M/s

        nr_iterations: 100000
bpf_for_each helper - total callbacks called:  77.073 ± 2.200M/s

        nr_iterations: 500000
bpf_for_each helper - total callbacks called:  75.136 ± 0.552M/s

        nr_iterations: 1000000
bpf_for_each helper - total callbacks called:  76.364 ± 1.690M/s

>From this data, we can see that we are able to run the loop at
least 40 million times per second on an empty callback function.

>From this data, we can also see that as the number of iterations
increases, the overhead per iteration decreases and steadies towards
a constant value.

Signed-off-by: Joanne Koong <joannekoong@xxxxxx>
---
 tools/testing/selftests/bpf/Makefile          |   3 +-
 tools/testing/selftests/bpf/bench.c           |   4 +
 .../selftests/bpf/benchs/bench_for_each.c     | 105 ++++++++++++++++++
 .../bpf/benchs/run_bench_for_each.sh          |  16 +++
 .../selftests/bpf/progs/for_each_helper.c     |  13 +++
 5 files changed, 140 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/benchs/bench_for_each.c
 create mode 100755 tools/testing/selftests/bpf/benchs/run_bench_for_each.sh

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index f49cb5fc85af..b55fc72b8ef0 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -537,7 +537,8 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o $(OUTPUT)/testing_helpers.o \
 		 $(OUTPUT)/bench_rename.o \
 		 $(OUTPUT)/bench_trigger.o \
 		 $(OUTPUT)/bench_ringbufs.o \
-		 $(OUTPUT)/bench_bloom_filter_map.o
+		 $(OUTPUT)/bench_bloom_filter_map.o \
+		 $(OUTPUT)/bench_for_each.o
 	$(call msg,BINARY,,$@)
 	$(Q)$(CC) $(LDFLAGS) -o $@ $(filter %.a %.o,$^) $(LDLIBS)
 
diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c
index cc4722f693e9..d8b3d537a700 100644
--- a/tools/testing/selftests/bpf/bench.c
+++ b/tools/testing/selftests/bpf/bench.c
@@ -171,10 +171,12 @@ static const struct argp_option opts[] = {
 
 extern struct argp bench_ringbufs_argp;
 extern struct argp bench_bloom_map_argp;
+extern struct argp bench_for_each_argp;
 
 static const struct argp_child bench_parsers[] = {
 	{ &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 },
 	{ &bench_bloom_map_argp, 0, "Bloom filter map benchmark", 0 },
+	{ &bench_for_each_argp, 0, "bpf_for_each helper benchmark", 0 },
 	{},
 };
 
@@ -368,6 +370,7 @@ extern const struct bench bench_bloom_update;
 extern const struct bench bench_bloom_false_positive;
 extern const struct bench bench_hashmap_without_bloom;
 extern const struct bench bench_hashmap_with_bloom;
+extern const struct bench bench_for_each_helper;
 
 static const struct bench *benchs[] = {
 	&bench_count_global,
@@ -394,6 +397,7 @@ static const struct bench *benchs[] = {
 	&bench_bloom_false_positive,
 	&bench_hashmap_without_bloom,
 	&bench_hashmap_with_bloom,
+	&bench_for_each_helper,
 };
 
 static void setup_benchmark()
diff --git a/tools/testing/selftests/bpf/benchs/bench_for_each.c b/tools/testing/selftests/bpf/benchs/bench_for_each.c
new file mode 100644
index 000000000000..3372d5b7d67b
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/bench_for_each.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+
+#include <argp.h>
+#include "bench.h"
+#include "for_each_helper.skel.h"
+
+/* BPF triggering benchmarks */
+static struct ctx {
+	struct for_each_helper *skel;
+} ctx;
+
+static struct {
+	__u32 nr_iters;
+} args = {
+	.nr_iters = 10,
+};
+
+enum {
+	ARG_NR_ITERS = 4000,
+};
+
+static const struct argp_option opts[] = {
+	{ "nr_iters", ARG_NR_ITERS, "nr_iters", 0,
+		"Set number of iterations for the bpf_for_each helper"},
+	{},
+};
+
+static error_t parse_arg(int key, char *arg, struct argp_state *state)
+{
+	switch (key) {
+	case ARG_NR_ITERS:
+		args.nr_iters = strtol(arg, NULL, 10);
+		break;
+	default:
+		return ARGP_ERR_UNKNOWN;
+	}
+
+	return 0;
+}
+
+/* exported into benchmark runner */
+const struct argp bench_for_each_argp = {
+	.options = opts,
+	.parser = parse_arg,
+};
+
+static void validate(void)
+{
+	if (env.consumer_cnt != 1) {
+		fprintf(stderr, "benchmark doesn't support multi-consumer!\n");
+		exit(1);
+	}
+}
+
+static void *producer(void *input)
+{
+	while (true)
+		/* trigger the bpf program */
+		syscall(__NR_getpgid);
+
+	return NULL;
+}
+
+static void *consumer(void *input)
+{
+	return NULL;
+}
+
+static void measure(struct bench_res *res)
+{
+	res->hits = atomic_swap(&ctx.skel->bss->hits, 0);
+}
+
+static void setup(void)
+{
+	struct bpf_link *link;
+
+	setup_libbpf();
+
+	ctx.skel = for_each_helper__open_and_load();
+	if (!ctx.skel) {
+		fprintf(stderr, "failed to open skeleton\n");
+		exit(1);
+	}
+
+	link = bpf_program__attach(ctx.skel->progs.benchmark);
+	if (!link) {
+		fprintf(stderr, "failed to attach program!\n");
+		exit(1);
+	}
+
+	ctx.skel->bss->nr_iterations = args.nr_iters;
+}
+
+const struct bench bench_for_each_helper = {
+	.name = "for-each-helper",
+	.validate = validate,
+	.setup = setup,
+	.producer_thread = producer,
+	.consumer_thread = consumer,
+	.measure = measure,
+	.report_progress = hits_drops_report_progress,
+	.report_final = hits_drops_report_final,
+};
diff --git a/tools/testing/selftests/bpf/benchs/run_bench_for_each.sh b/tools/testing/selftests/bpf/benchs/run_bench_for_each.sh
new file mode 100755
index 000000000000..5f11a1ad66d3
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/run_bench_for_each.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source ./benchs/run_common.sh
+
+set -eufo pipefail
+
+for t in 1 4 8 12 16; do
+printf "\n"
+for i in 1 10 100 500 1000 5000 10000 50000 100000 500000 1000000; do
+subtitle "nr_iterations: $i, nr_threads: $t"
+	summarize "bpf_for_each helper - total callbacks called: " \
+	    "$($RUN_BENCH -p $t --nr_iters $i for-each-helper)"
+	printf "\n"
+done
+done
diff --git a/tools/testing/selftests/bpf/progs/for_each_helper.c b/tools/testing/selftests/bpf/progs/for_each_helper.c
index 4404d0cb32a6..b95551d99f75 100644
--- a/tools/testing/selftests/bpf/progs/for_each_helper.c
+++ b/tools/testing/selftests/bpf/progs/for_each_helper.c
@@ -14,6 +14,8 @@ struct callback_ctx {
 u32 nr_iterations;
 u32 stop_index = -1;
 
+long hits;
+
 /* Making these global variables so that the userspace program
  * can verify the output through the skeleton
  */
@@ -67,3 +69,14 @@ int prog_invalid_flags(struct __sk_buff *skb)
 
 	return 0;
 }
+
+SEC("fentry/__x64_sys_getpgid")
+int benchmark(void *ctx)
+{
+	for (int i = 0; i < 1000; i++) {
+		bpf_for_each(nr_iterations, empty_callback_fn, NULL, 0);
+
+		__sync_add_and_fetch(&hits, nr_iterations);
+	}
+	return 0;
+}
-- 
2.30.2