On Wed, Nov 13, 2019 at 12:48 PM Björn Töpel <bjorn.topel@xxxxxxxxx> wrote: > > From: Björn Töpel <bjorn.topel@xxxxxxxxx> > > The BPF dispatcher builds on top of the BPF trampoline ideas; > Introduce bpf_arch_text_poke() and (re-)use the BPF JIT generate > code. The dispatcher builds a dispatch table for XDP programs, for > retpoline avoidance. The table is a simple binary search model, so > lookup is O(log n). Here, the dispatch table is limited to four > entries (for laziness reason -- only 1B relative jumps :-P). If the > dispatch table is full, it will fallback to the retpoline path. > > An example: A module/driver allocates a dispatcher. The dispatcher is > shared for all netdevs. Each netdev allocate a slot in the dispatcher > and a BPF program. The netdev then uses the dispatcher to call the > correct program with a direct call (actually a tail-call). > > Signed-off-by: Björn Töpel <bjorn.topel@xxxxxxxxx> > --- > arch/x86/net/bpf_jit_comp.c | 96 ++++++++++++++++++ > kernel/bpf/Makefile | 1 + > kernel/bpf/dispatcher.c | 197 ++++++++++++++++++++++++++++++++++++ > 3 files changed, 294 insertions(+) > create mode 100644 kernel/bpf/dispatcher.c > > diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c > index 28782a1c386e..d75aebf508b8 100644 > --- a/arch/x86/net/bpf_jit_comp.c > +++ b/arch/x86/net/bpf_jit_comp.c > @@ -10,10 +10,12 @@ > #include <linux/if_vlan.h> > #include <linux/bpf.h> > #include <linux/memory.h> > +#include <linux/sort.h> > #include <asm/extable.h> > #include <asm/set_memory.h> > #include <asm/nospec-branch.h> > #include <asm/text-patching.h> > +#include <asm/asm-prototypes.h> > [...] > + > +int arch_prepare_bpf_dispatcher(void *image, struct bpf_prog **progs, > + int num_progs) > +{ > + u64 ips[BPF_DISPATCHER_MAX] = {}; > + u8 *fallback, *prog = image; > + int i, err, cnt = 0; > + > + if (!num_progs || num_progs > BPF_DISPATCHER_MAX) > + return -EINVAL; > + > + for (i = 0; i < num_progs; i++) > + ips[i] = (u64)progs[i]->bpf_func; > + > + EMIT2(0xEB, 5); /* jmp rip+5 (skip retpoline) */ > + fallback = prog; > + err = emit_jmp(&prog, /* jmp retpoline */ > + __x86_indirect_thunk_rdx, prog); > + if (err) > + return err; > + > + sort(&ips[0], num_progs, sizeof(ips[i]), cmp_ips, NULL); nit: sizeof(ips[i]) looks weird... > + return emit_bpf_dispatcher(&prog, 0, num_progs - 1, &ips[0], fallback); > +} > + > struct x64_jit_data { > struct bpf_binary_header *header; > int *addrs; [...] > + > +static int bpf_dispatcher_add_prog(struct bpf_dispatcher *d, > + struct bpf_prog *prog) > +{ > + struct bpf_prog **entry = NULL; > + int i, err = 0; > + > + if (d->num_progs == BPF_DISPATCHER_MAX) > + return err; err == 0, not what you want, probably > + > + for (i = 0; i < BPF_DISPATCHER_MAX; i++) { > + if (!entry && !d->progs[i]) > + entry = &d->progs[i]; > + if (d->progs[i] == prog) > + return err; > + } > + > + prog = bpf_prog_inc(prog); > + if (IS_ERR(prog)) > + return err; > + > + *entry = prog; > + d->num_progs++; > + return err; > +} > + > +static void bpf_dispatcher_remove_prog(struct bpf_dispatcher *d, > + struct bpf_prog *prog) > +{ > + int i; > + > + for (i = 0; i < BPF_DISPATCHER_MAX; i++) { > + if (d->progs[i] == prog) { > + bpf_prog_put(prog); > + d->progs[i] = NULL; > + d->num_progs--; instead of allowing holes, why not swap removed prog with the last on in d->progs? > + break; > + } > + } > +} > + > +int __weak arch_prepare_bpf_dispatcher(void *image, struct bpf_prog **progs, > + int num_ids) > +{ > + return -ENOTSUPP; > +} > + > +/* NB! bpf_dispatcher_update() might free the dispatcher! */ > +static int bpf_dispatcher_update(struct bpf_dispatcher *d) > +{ > + void *old_image = d->image + ((d->selector + 1) & 1) * PAGE_SIZE / 2; > + void *new_image = d->image + (d->selector & 1) * PAGE_SIZE / 2; > + int err; > + > + if (d->num_progs == 0) { > + err = bpf_arch_text_poke(d->func, BPF_MOD_JMP_TO_NOP, > + old_image, NULL); > + bpf_dispatcher_free(d); > + goto out; > + } > + > + err = arch_prepare_bpf_dispatcher(new_image, &d->progs[0], > + d->num_progs); > + if (err) > + goto out; > + > + if (d->selector) > + /* progs already running at this address */ > + err = bpf_arch_text_poke(d->func, BPF_MOD_JMP_TO_JMP, > + old_image, new_image); > + else > + /* first time registering */ > + err = bpf_arch_text_poke(d->func, BPF_MOD_NOP_TO_JMP, > + NULL, new_image); > + > + if (err) > + goto out; > + d->selector++; > + > +out: > + return err; > +} > + > +void bpf_dispatcher_change_prog(void *func, struct bpf_prog *from, > + struct bpf_prog *to) > +{ > + struct bpf_dispatcher *d; > + > + if (!from && !to) > + return; > + > + mutex_lock(&dispatcher_mutex); > + d = bpf_dispatcher_lookup(func); > + if (!d) > + goto out; > + > + if (from) > + bpf_dispatcher_remove_prog(d, from); > + > + if (to) > + bpf_dispatcher_add_prog(d, to); this can fail > + > + WARN_ON(bpf_dispatcher_update(d)); shouldn't dispatcher be removed from the list before freed? It seems like handling dispatches freeing is better done here explicitly (and you won't need to leave a NB remark) > + > +out: > + mutex_unlock(&dispatcher_mutex); > +} > + > +static int __init init_dispatchers(void) > +{ > + int i; > + > + for (i = 0; i < DISPATCHER_TABLE_SIZE; i++) > + INIT_HLIST_HEAD(&dispatcher_table[i]); > + return 0; > +} > +late_initcall(init_dispatchers); > + > +#endif > -- > 2.20.1 >