Alexei Starovoitov <alexei.starovoitov@xxxxxxxxx> writes:

> On Mon, Oct 07, 2019 at 07:20:36PM +0200, Toke Høiland-Jørgensen wrote:
>> From: Toke Høiland-Jørgensen <toke@xxxxxxxxxx>
>>
>> This adds support for wrapping eBPF program dispatch in chain calling
>> logic. The code injection is controlled by a flag at program load time;
>> if the flag is set, the BPF program will carry a flag bit that changes
>> the program dispatch logic to wrap it in a chain call loop.
>>
>> Ideally, it shouldn't be necessary to set the flag at program load time,
>> but rather inject the calls when a chain call program is first loaded.
>> The allocation logic sets the whole of struct bpf_prog to be read-only
>> memory, so it can't immediately be modified, but conceivably we could
>> just unlock the first page of the struct and flip the bit when a chain
>> call program is first attached.
>>
>> Signed-off-by: Toke Høiland-Jørgensen <toke@xxxxxxxxxx>
>> ---
>>  include/linux/bpf.h      |  3 +++
>>  include/linux/filter.h   | 34 ++++++++++++++++++++++++++++++++--
>>  include/uapi/linux/bpf.h |  6 ++++++
>>  kernel/bpf/core.c        |  6 ++++++
>>  kernel/bpf/syscall.c     |  4 +++-
>>  5 files changed, 50 insertions(+), 3 deletions(-)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index 5b9d22338606..13e5f38cf5c6 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -365,6 +365,8 @@ struct bpf_prog_stats {
>>  	struct u64_stats_sync syncp;
>>  };
>>
>> +#define BPF_NUM_CHAIN_SLOTS 8
>> +
>>  struct bpf_prog_aux {
>>  	atomic_t refcnt;
>>  	u32 used_map_cnt;
>> @@ -383,6 +385,7 @@ struct bpf_prog_aux {
>>  	struct list_head ksym_lnode;
>>  	const struct bpf_prog_ops *ops;
>>  	struct bpf_map **used_maps;
>> +	struct bpf_prog *chain_progs[BPF_NUM_CHAIN_SLOTS];
>>  	struct bpf_prog *prog;
>>  	struct user_struct *user;
>>  	u64 load_time; /* ns since boottime */
>> diff --git a/include/linux/filter.h b/include/linux/filter.h
>> index 2ce57645f3cd..3d1e4991e61d 100644
>> --- a/include/linux/filter.h
>> +++ b/include/linux/filter.h
>> @@ -21,6 +21,7 @@
>>  #include <linux/kallsyms.h>
>>  #include <linux/if_vlan.h>
>>  #include <linux/vmalloc.h>
>> +#include <linux/nospec.h>
>>
>>  #include <net/sch_generic.h>
>>
>> @@ -528,6 +529,7 @@ struct bpf_prog {
>>  			is_func:1,	/* program is a bpf function */
>>  			kprobe_override:1, /* Do we override a kprobe? */
>>  			has_callchain_buf:1, /* callchain buffer allocated? */
>> +			chain_calls:1, /* should this use the chain_call wrapper */
>>  			enforce_expected_attach_type:1; /* Enforce expected_attach_type checking at attach time */
>>  	enum bpf_prog_type	type;		/* Type of BPF program */
>>  	enum bpf_attach_type	expected_attach_type; /* For some prog types */
>> @@ -551,6 +553,30 @@ struct sk_filter {
>>  	struct bpf_prog	*prog;
>>  };
>>
>> +#define BPF_MAX_CHAIN_CALLS 32
>> +static __always_inline unsigned int do_chain_calls(const struct bpf_prog *prog,
>> +						   const void *ctx)
>> +{
>> +	int i = BPF_MAX_CHAIN_CALLS;
>> +	int idx;
>> +	u32 ret;
>> +
>> +	do {
>> +		ret = (*(prog)->bpf_func)(ctx, prog->insnsi);
>
> This breaks program stats.

Oh, right, silly me. Will fix.
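The fix is presumably just to duplicate the stats accounting from
BPF_PROG_RUN around each program invocation in the loop. A minimal,
untested sketch of what I have in mind (bpf_prog_run_stats() is a new
helper name made up for this example; do_chain_calls() would call it
instead of invoking bpf_func directly):

/* Untested sketch, not part of this patch: run a single program with
 * the same stats accounting as BPF_PROG_RUN, so the chain call loop
 * no longer bypasses the program stats.
 */
static __always_inline u32 bpf_prog_run_stats(const struct bpf_prog *prog,
					      const void *ctx)
{
	u32 ret;

	if (static_branch_unlikely(&bpf_stats_enabled_key)) {
		struct bpf_prog_stats *stats;
		u64 start = sched_clock();

		ret = (*(prog)->bpf_func)(ctx, prog->insnsi);
		stats = this_cpu_ptr(prog->aux->stats);
		u64_stats_update_begin(&stats->syncp);
		stats->cnt++;
		stats->nsecs += sched_clock() - start;
		u64_stats_update_end(&stats->syncp);
	} else {
		ret = (*(prog)->bpf_func)(ctx, prog->insnsi);
	}
	return ret;
}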
>> +
>> +		if (ret + 1 >= BPF_NUM_CHAIN_SLOTS) {
>> +			prog = prog->aux->chain_progs[0];
>> +			continue;
>> +		}
>> +		idx = ret + 1;
>> +		idx = array_index_nospec(idx, BPF_NUM_CHAIN_SLOTS);
>> +
>> +		prog = prog->aux->chain_progs[idx] ?: prog->aux->chain_progs[0];
>> +	} while (prog && --i);
>> +
>> +	return ret;
>> +}
>> +
>>  DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
>>
>>  #define BPF_PROG_RUN(prog, ctx)	({				\
>> @@ -559,14 +585,18 @@ DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
>>  	if (static_branch_unlikely(&bpf_stats_enabled_key)) {	\
>>  		struct bpf_prog_stats *stats;			\
>>  		u64 start = sched_clock();			\
>> -		ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
>> +		ret = prog->chain_calls ?			\
>> +			do_chain_calls(prog, ctx) :		\
>> +			(*(prog)->bpf_func)(ctx, (prog)->insnsi); \
>
> I thought you agreed on 'no performance regressions' rule?

As I wrote in the cover letter, I could not measure a performance impact
from this, even with the simplest possible XDP program (where program
setup time has the largest impact).

This was the performance before/after the patch (also in the cover
letter):

Before patch (XDP DROP program):  31.5 Mpps
After patch (XDP DROP program):   32.0 Mpps

So actually this *increases* performance ;)
(Or rather, the difference is within the measurement uncertainty on my
system.)

-Toke
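P.S. To spell out the dispatch logic in do_chain_calls() for anyone
skimming: a program's return value R selects slot R+1 in its chain_progs
array, and slot 0 acts as the fallback, used both when R+1 doesn't fit in
BPF_NUM_CHAIN_SLOTS and when the specific slot is empty; the whole chain
is capped at BPF_MAX_CHAIN_CALLS (32) programs. A purely illustrative XDP
example (the direct assignments below are hypothetical, just to show the
slot mapping, not the actual attach API):

/* Hypothetical setup, for illustration only: run prog_b after prog_a
 * whenever prog_a returns XDP_PASS (== 2, so slot 2 + 1 = 3), and run
 * prog_c after prog_a on any other verdict.
 */
prog_a->aux->chain_progs[3] = prog_b;	/* XDP_PASS + 1 */
prog_a->aux->chain_progs[0] = prog_c;	/* fallback / wildcard slot */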