> On Aug 2, 2020, at 10:10 PM, Andrii Nakryiko <andrii.nakryiko@xxxxxxxxx> wrote: > > On Sun, Aug 2, 2020 at 9:47 PM Song Liu <songliubraving@xxxxxx> wrote: >> >> >>> On Aug 2, 2020, at 6:51 PM, Andrii Nakryiko <andrii.nakryiko@xxxxxxxxx> wrote: >>> >>> On Sat, Aug 1, 2020 at 1:50 AM Song Liu <songliubraving@xxxxxx> wrote: >>>> >>>> Add a benchmark to compare performance of >>>> 1) uprobe; >>>> 2) user program w/o args; >>>> 3) user program w/ args; >>>> 4) user program w/ args on random cpu. >>>> >>> >>> Can you please add it to the existing benchmark runner instead, e.g., >>> along the other bench_trigger benchmarks? No need to re-implement >>> benchmark setup. And also that would also allow to compare existing >>> ways of cheaply triggering a program vs this new _USER program? >> >> Will try. >> >>> >>> If the performance is not significantly better than other ways, do you >>> think it still makes sense to add a new BPF program type? I think >>> triggering KPROBE/TRACEPOINT from bpf_prog_test_run() would be very >>> nice, maybe it's possible to add that instead of a new program type? >>> Either way, let's see comparison with other program triggering >>> mechanisms first. >> >> Triggering KPROBE and TRACEPOINT from bpf_prog_test_run() will be useful. >> But I don't think they can be used instead of user program, for a couple >> reasons. First, KPROBE/TRACEPOINT may be triggered by other programs >> running in the system, so user will have to filter those noise out in >> each program. Second, it is not easy to specify CPU for KPROBE/TRACEPOINT, >> while this feature could be useful in many cases, e.g. get stack trace >> on a given CPU. >> > > Right, it's not as convenient with KPROBE/TRACEPOINT as with the USER > program you've added specifically with that feature in mind. But if > you pin user-space thread on the needed CPU and trigger kprobe/tp, > then you'll get what you want. 
As for the "noise", see how > bench_trigger() deals with that: it records thread ID and filters > everything not matching. You can do the same with CPU ID. It's not as > automatic as with a special BPF program type, but still pretty simple, > which is why I'm still deciding (for myself) whether USER program type > is necessary :) Here are some bench_trigger numbers: base : 1.698 ± 0.001M/s tp : 1.477 ± 0.001M/s rawtp : 1.567 ± 0.001M/s kprobe : 1.431 ± 0.000M/s fentry : 1.691 ± 0.000M/s fmodret : 1.654 ± 0.000M/s user : 1.253 ± 0.000M/s fentry-on-cpu: 0.022 ± 0.011M/s user-on-cpu: 0.315 ± 0.001M/s The two "on-cpu" tests run the program on a different CPU on each iteration (see the patch at the end). "user" is about 25% slower than "fentry". I think this is mostly because getpgid() is a faster syscall than bpf(BPF_TEST_RUN). "user-on-cpu" is more than 10x faster than "fentry-on-cpu", because an IPI is way faster than moving the process (via sched_setaffinity()). For use cases where we would like to run a BPF program on a specific CPU, triggering it via IPI is a lot faster. 
Thanks, Song ========================== 8< ========================== diff --git c/tools/testing/selftests/bpf/bench.c w/tools/testing/selftests/bpf/bench.c index 944ad4721c83c..5394a1d2dfd21 100644 --- c/tools/testing/selftests/bpf/bench.c +++ w/tools/testing/selftests/bpf/bench.c @@ -317,7 +317,10 @@ extern const struct bench bench_trig_tp; extern const struct bench bench_trig_rawtp; extern const struct bench bench_trig_kprobe; extern const struct bench bench_trig_fentry; +extern const struct bench bench_trig_fentry_on_cpu; extern const struct bench bench_trig_fmodret; +extern const struct bench bench_trig_user; +extern const struct bench bench_trig_user_on_cpu; extern const struct bench bench_rb_libbpf; extern const struct bench bench_rb_custom; extern const struct bench bench_pb_libbpf; @@ -338,7 +341,10 @@ static const struct bench *benchs[] = { &bench_trig_rawtp, &bench_trig_kprobe, &bench_trig_fentry, + &bench_trig_fentry_on_cpu, &bench_trig_fmodret, + &bench_trig_user, + &bench_trig_user_on_cpu, &bench_rb_libbpf, &bench_rb_custom, &bench_pb_libbpf, @@ -462,4 +468,3 @@ int main(int argc, char **argv) return 0; } - diff --git c/tools/testing/selftests/bpf/benchs/bench_trigger.c w/tools/testing/selftests/bpf/benchs/bench_trigger.c index 49c22832f2169..a1ebaebf6070c 100644 --- c/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ w/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ +#define _GNU_SOURCE +#include <sched.h> #include "bench.h" #include "trigger_bench.skel.h" @@ -39,6 +41,22 @@ static void *trigger_producer(void *input) return NULL; } +static void *trigger_on_cpu_producer(void *input) +{ + cpu_set_t set; + int i = 0, nr_cpu; + + nr_cpu = libbpf_num_possible_cpus(); + while (true) { + CPU_ZERO(&set); + CPU_SET(i, &set); + sched_setaffinity(0, sizeof(set), &set); + (void)syscall(__NR_getpgid); + i = (i + 1) % nr_cpu; + } + return NULL; +} + static void 
trigger_measure(struct bench_res *res) { res->hits = atomic_swap(&ctx.skel->bss->hits, 0); @@ -96,6 +114,39 @@ static void trigger_fmodret_setup() attach_bpf(ctx.skel->progs.bench_trigger_fmodret); } +static void trigger_user_setup() +{ + setup_ctx(); +} + +static void *trigger_producer_user(void *input) +{ + struct bpf_prog_test_run_attr attr = {}; + + attr.prog_fd = bpf_program__fd(ctx.skel->progs.bench_trigger_user); + + while (true) + (void)bpf_prog_test_run_xattr(&attr); + return NULL; +} + +static void *trigger_producer_user_on_cpu(void *input) +{ + struct bpf_prog_test_run_attr attr = {}; + int i = 0, nr_cpu; + + nr_cpu = libbpf_num_possible_cpus(); + + attr.prog_fd = bpf_program__fd(ctx.skel->progs.bench_trigger_user); + + while (true) { + attr.cpu_plus = i + 1; + (void)bpf_prog_test_run_xattr(&attr); + i = (i + 1) % nr_cpu; + } + return NULL; +} + static void *trigger_consumer(void *input) { return NULL; @@ -155,6 +206,17 @@ const struct bench bench_trig_fentry = { .report_final = hits_drops_report_final, }; +const struct bench bench_trig_fentry_on_cpu = { + .name = "trig-fentry-on-cpu", + .validate = trigger_validate, + .setup = trigger_fentry_setup, + .producer_thread = trigger_on_cpu_producer, + .consumer_thread = trigger_consumer, + .measure = trigger_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + const struct bench bench_trig_fmodret = { .name = "trig-fmodret", .validate = trigger_validate, @@ -165,3 +227,25 @@ const struct bench bench_trig_fmodret = { .report_progress = hits_drops_report_progress, .report_final = hits_drops_report_final, }; + +const struct bench bench_trig_user = { + .name = "trig-user", + .validate = trigger_validate, + .setup = trigger_user_setup, + .producer_thread = trigger_producer_user, + .consumer_thread = trigger_consumer, + .measure = trigger_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct 
bench bench_trig_user_on_cpu = { + .name = "trig-user-on-cpu", + .validate = trigger_validate, + .setup = trigger_user_setup, + .producer_thread = trigger_producer_user_on_cpu, + .consumer_thread = trigger_consumer, + .measure = trigger_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; diff --git c/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh w/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh index 78e83f2432946..f10b7aea76aa3 100755 --- c/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh +++ w/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh @@ -2,7 +2,7 @@ set -eufo pipefail -for i in base tp rawtp kprobe fentry fmodret +for i in base tp rawtp kprobe fentry fmodret user fentry-on-cpu user-on-cpu do summary=$(sudo ./bench -w2 -d5 -a trig-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-) printf "%-10s: %s\n" $i "$summary" diff --git c/tools/testing/selftests/bpf/progs/trigger_bench.c w/tools/testing/selftests/bpf/progs/trigger_bench.c index 8b36b6640e7e9..a6ac11e68d287 100644 --- c/tools/testing/selftests/bpf/progs/trigger_bench.c +++ w/tools/testing/selftests/bpf/progs/trigger_bench.c @@ -45,3 +45,10 @@ int bench_trigger_fmodret(void *ctx) __sync_add_and_fetch(&hits, 1); return -22; } + +SEC("user") +int BPF_PROG(bench_trigger_user) +{ + __sync_add_and_fetch(&hits, 1); + return 0; +} ~