On Fri, Nov 22, 2019 at 12:08 PM Daniel Borkmann <daniel@xxxxxxxxxxxxx> wrote:
>
> Add initial code emission for *direct* jumps for tail call maps in
> order to avoid the retpoline overhead from a493a87f38cf ("bpf, x64:
> implement retpoline for tail call") for situations that allow for
> it, meaning, for known constant keys at verification time which are
> used as index into the tail call map. In case of Cilium which makes
> heavy use of tail calls, constant keys are used in the vast majority;
> only for a single occurrence do we use a dynamic key.
>
> High level outline is that if the target prog is NULL in the map, we
> emit a 5-byte nop for the fall-through case and if not, we emit a
> 5-byte direct relative jmp to the target bpf_func + skipped prologue
> offset. Later during runtime, we patch these 5-byte nop/jmps upon
> tail call map update or deletions dynamically. Note that on x86-64
> the direct jmp works as we reuse the same stack frame and skip
> prologue (as opposed to some other JIT implementations).
>
> One of the issues is that the tail call map slots can change at any
> given time even during JITing. Therefore, we have two passes: i) emit
> nops for all patchable locations during the main JITing phase until we
> declare prog->jited = 1 eventually. At this point the image is stable,
> not public yet and with all jmps disabled. While JITing, we collect
> additional info like poke->ip in order to remember the patch location
> for later modifications. In ii) bpf_tail_call_direct_fixup() walks
> over the prog's poke_tab, locks the tail call map's poke_mutex to
> prevent parallel updates, and patches in the right locations via
> __bpf_arch_text_poke(). Note, the main bpf_arch_text_poke() cannot
> be used at this point since we're not yet exposed to kallsyms. For
> the update we use plain memcpy() since the image is not public and
> still in read-write mode. After patching, we activate that poke entry
> through poke->ip_stable. Meaning, at this point any tail call map
> updates/deletions are not going to ignore that poke entry anymore.
> Then, bpf_arch_text_poke() might still occur on the read-write image
> until we finally lock it as read-only. Both modifications on the
> given image are under text_mutex to avoid interference with each
> other when update requests come in in parallel for different tail
> call maps (the current one we have locked in the JIT and a different
> one where poke->ip_stable was already set).
>
> Example prog:
>
>   # ./bpftool p d x i 1655
>    0: (b7) r3 = 0
>    1: (18) r2 = map[id:526]
>    3: (85) call bpf_tail_call#12
>    4: (b7) r0 = 1
>    5: (95) exit
>
> Before:
>
>   # ./bpftool p d j i 1655
>   0xffffffffc076e55c:
>    0: nopl   0x0(%rax,%rax,1)
>    5: push   %rbp
>    6: mov    %rsp,%rbp
>    9: sub    $0x200,%rsp
>   10: push   %rbx
>   11: push   %r13
>   13: push   %r14
>   15: push   %r15
>   17: pushq  $0x0                       _
>   19: xor    %edx,%edx                 |_ index (arg 3)
>   1b: movabs $0xffff88d95cc82600,%rsi  |_ map (arg 2)
>   25: mov    %edx,%edx                 |  index >= array->map.max_entries
>   27: cmp    %edx,0x24(%rsi)           |
>   2a: jbe    0x0000000000000066        |_
>   2c: mov    -0x224(%rbp),%eax         |  tail call limit check
>   32: cmp    $0x20,%eax                |
>   35: ja     0x0000000000000066        |
>   37: add    $0x1,%eax                 |
>   3a: mov    %eax,-0x224(%rbp)         |_
>   40: mov    0xd0(%rsi,%rdx,8),%rax    |_ prog = array->ptrs[index]
>   48: test   %rax,%rax                 |  prog == NULL check
>   4b: je     0x0000000000000066        |_
>   4d: mov    0x30(%rax),%rax           |  goto *(prog->bpf_func + prologue_size)
>   51: add    $0x19,%rax                |
>   55: callq  0x0000000000000061        |  retpoline for indirect jump
>   5a: pause                            |
>   5c: lfence                           |
>   5f: jmp    0x000000000000005a        |
>   61: mov    %rax,(%rsp)               |
>   65: retq                             |_
>   66: mov    $0x1,%eax
>   6b: pop    %rbx
>   6c: pop    %r15
>   6e: pop    %r14
>   70: pop    %r13
>   72: pop    %rbx
>   73: leaveq
>   74: retq
>
> After; state after JIT:
>
>   # ./bpftool p d j i 1655
>   0xffffffffc08e8930:
>    0: nopl   0x0(%rax,%rax,1)
>    5: push   %rbp
>    6: mov    %rsp,%rbp
>    9: sub    $0x200,%rsp
>   10: push   %rbx
>   11: push   %r13
>   13: push   %r14
>   15: push   %r15
>   17: pushq  $0x0                       _
>   19: xor    %edx,%edx                 |_ index (arg 3)
>   1b: movabs $0xffff9d8afd74c000,%rsi  |_ map (arg 2)
>   25: mov    -0x224(%rbp),%eax         |  tail call limit check
>   2b: cmp    $0x20,%eax                |
>   2e: ja     0x000000000000003e        |
>   30: add    $0x1,%eax                 |
>   33: mov    %eax,-0x224(%rbp)         |_
>   39: jmpq   0xfffffffffffd1785        |_ [direct] goto *(prog->bpf_func + prologue_size)
>   3e: mov    $0x1,%eax
>   43: pop    %rbx
>   44: pop    %r15
>   46: pop    %r14
>   48: pop    %r13
>   4a: pop    %rbx
>   4b: leaveq
>   4c: retq
>
> After; state after map update (target prog):
>
>   # ./bpftool p d j i 1655
>   0xffffffffc08e8930:
>    0: nopl   0x0(%rax,%rax,1)
>    5: push   %rbp
>    6: mov    %rsp,%rbp
>    9: sub    $0x200,%rsp
>   10: push   %rbx
>   11: push   %r13
>   13: push   %r14
>   15: push   %r15
>   17: pushq  $0x0
>   19: xor    %edx,%edx
>   1b: movabs $0xffff9d8afd74c000,%rsi
>   25: mov    -0x224(%rbp),%eax
>   2b: cmp    $0x20,%eax                .
>   2e: ja     0x000000000000003e        .
>   30: add    $0x1,%eax                 .
>   33: mov    %eax,-0x224(%rbp)         |_
>   39: jmpq   0xffffffffffb09f55        |_ goto *(prog->bpf_func + prologue_size)
>   3e: mov    $0x1,%eax
>   43: pop    %rbx
>   44: pop    %r15
>   46: pop    %r14
>   48: pop    %r13
>   4a: pop    %rbx
>   4b: leaveq
>   4c: retq
>
> After; state after map update (no prog):
>
>   # ./bpftool p d j i 1655
>   0xffffffffc08e8930:
>    0: nopl   0x0(%rax,%rax,1)
>    5: push   %rbp
>    6: mov    %rsp,%rbp
>    9: sub    $0x200,%rsp
>   10: push   %rbx
>   11: push   %r13
>   13: push   %r14
>   15: push   %r15
>   17: pushq  $0x0
>   19: xor    %edx,%edx
>   1b: movabs $0xffff9d8afd74c000,%rsi
>   25: mov    -0x224(%rbp),%eax
>   2b: cmp    $0x20,%eax                .
>   2e: ja     0x000000000000003e        .
>   30: add    $0x1,%eax                 .
>   33: mov    %eax,-0x224(%rbp)         |_
>   39: nopl   0x0(%rax,%rax,1)          |_ fall-through nop
>   3e: mov    $0x1,%eax
>   43: pop    %rbx
>   44: pop    %r15
>   46: pop    %r14
>   48: pop    %r13
>   4a: pop    %rbx
>   4b: leaveq
>   4c: retq
>
> Nice bonus is that this also shrinks the code emission quite a bit
> for every tail call invocation.
>
> Signed-off-by: Daniel Borkmann <daniel@xxxxxxxxxxxxx>
> ---
>  arch/x86/net/bpf_jit_comp.c | 282 ++++++++++++++++++++++++------------
>  1 file changed, 187 insertions(+), 95 deletions(-)
> [...]
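
A small aside for anyone following the dumps: the patchable site is
always X86_PATCH_SIZE (5) bytes, either the atomic 5-byte nop
0f 1f 44 00 00 (the "nopl 0x0(%rax,%rax,1)" above) or e9 <rel32>,
where the displacement is taken relative to the end of the 5-byte
instruction. Conceptually, emitting the direct jmp boils down to
something like this (my illustration only, not code from the patch;
emit_direct_jump is a made-up name, presumably close to what
emit_jump() does under the hood):

/* Illustration: build a 5-byte direct jmp at @site targeting
 * @target, i.e. prog->bpf_func plus the skipped prologue offset.
 */
static int emit_direct_jump(u8 *site, void *target)
{
	s64 rel = (s64)target - ((s64)site + X86_PATCH_SIZE);
	s32 rel32 = rel;

	/* The displacement must fit into a signed 32-bit immediate. */
	if (rel != rel32)
		return -ERANGE;
	site[0] = 0xE9;		/* jmp rel32 */
	memcpy(&site[1], &rel32, sizeof(rel32));
	return 0;
}

That is also why the jmpq targets in the dumps decode to what look
like huge wrapped-around addresses: the rel32 displacement to the
target prog is negative.
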
> +static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
> +				void *old_addr, void *new_addr,
> +				const bool text_live)
> +{
> +	int (*emit_patch_fn)(u8 **pprog, void *func, void *ip);
> +	const u8 *nop_insn = ideal_nops[NOP_ATOMIC5];
> +	u8 old_insn[X86_PATCH_SIZE] = {};
> +	u8 new_insn[X86_PATCH_SIZE] = {};
> +	u8 *prog;
> +	int ret;
> +
> +	switch (t) {
> +	case BPF_MOD_NOP_TO_CALL ... BPF_MOD_CALL_TO_NOP:
> +		emit_patch_fn = emit_call;
> +		break;
> +	case BPF_MOD_NOP_TO_JUMP ... BPF_MOD_JUMP_TO_NOP:
> +		emit_patch_fn = emit_jump;
> +		break;
> +	default:
> +		return -ENOTSUPP;
> +	}
> +
> +	switch (t) {
> +	case BPF_MOD_NOP_TO_CALL:
> +	case BPF_MOD_NOP_TO_JUMP:
> +		if (!old_addr && new_addr) {
> +			memcpy(old_insn, nop_insn, X86_PATCH_SIZE);
> +
> +			prog = new_insn;
> +			ret = emit_patch_fn(&prog, new_addr, ip);
> +			if (ret)
> +				return ret;
> +			break;
> +		}
> +		return -ENXIO;
> +	case BPF_MOD_CALL_TO_CALL:
> +	case BPF_MOD_JUMP_TO_JUMP:
> +		if (old_addr && new_addr) {
> +			prog = old_insn;
> +			ret = emit_patch_fn(&prog, old_addr, ip);
> +			if (ret)
> +				return ret;
> +
> +			prog = new_insn;
> +			ret = emit_patch_fn(&prog, new_addr, ip);
> +			if (ret)
> +				return ret;
> +			break;
> +		}
> +		return -ENXIO;
> +	case BPF_MOD_CALL_TO_NOP:
> +	case BPF_MOD_JUMP_TO_NOP:
> +		if (old_addr && !new_addr) {
> +			memcpy(new_insn, nop_insn, X86_PATCH_SIZE);
> +
> +			prog = old_insn;
> +			ret = emit_patch_fn(&prog, old_addr, ip);
> +			if (ret)
> +				return ret;
> +			break;
> +		}
> +		return -ENXIO;
> +	default:

There is some redundancy between the BPF_MOD_xxx enums and
old_addr/new_addr here (both encode what kind of transition it is),
which leads to this cumbersome logic. Would it be simpler to have
old_addr/new_addr determine whether it's an X-to-NOP, NOP-to-Y, or
X-to-Y transition, with a separate bool or a simple
BPF_MOD_CALL/BPF_MOD_JUMP enum determining whether it's a call or a
jump that we want to update? That seems like a simpler interface
overall and a cleaner implementation (rough sketch at the bottom).

> +		return -ENOTSUPP;
> +	}
> +
> +	ret = -EBUSY;
> +	mutex_lock(&text_mutex);
> +	if (memcmp(ip, old_insn, X86_PATCH_SIZE))
> +		goto out;
> +	if (text_live)
> +		text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL);
> +	else
> +		memcpy(ip, new_insn, X86_PATCH_SIZE);
> +	ret = 0;
> +out:
> +	mutex_unlock(&text_mutex);
> +	return ret;
> +}
> +

[...]
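
Something along these lines is what I have in mind. Completely
untested sketch, reusing the helpers from your patch (emit_call,
emit_jump, ideal_nops, text_poke_bp); the only change is collapsing
the enum down to the instruction kind and letting the NULL-ness of
old_addr/new_addr encode the nop transitions:

/* Sketch only: @t says what kind of instruction lives at @ip, while
 * old_addr/new_addr being NULL stands in for "that slot is a nop".
 */
enum bpf_text_poke_type {
	BPF_MOD_CALL,
	BPF_MOD_JUMP,
};

static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
				void *old_addr, void *new_addr,
				const bool text_live)
{
	int (*emit_patch_fn)(u8 **pprog, void *func, void *ip);
	const u8 *nop_insn = ideal_nops[NOP_ATOMIC5];
	u8 old_insn[X86_PATCH_SIZE] = {};
	u8 new_insn[X86_PATCH_SIZE] = {};
	u8 *prog;
	int ret;

	switch (t) {
	case BPF_MOD_CALL:
		emit_patch_fn = emit_call;
		break;
	case BPF_MOD_JUMP:
		emit_patch_fn = emit_jump;
		break;
	default:
		return -ENOTSUPP;
	}

	/* NULL old_addr means we come from a nop, NULL new_addr means
	 * we go back to a nop; anything else is an address-to-address
	 * update of the same instruction kind.
	 */
	if (old_addr) {
		prog = old_insn;
		ret = emit_patch_fn(&prog, old_addr, ip);
		if (ret)
			return ret;
	} else {
		memcpy(old_insn, nop_insn, X86_PATCH_SIZE);
	}
	if (new_addr) {
		prog = new_insn;
		ret = emit_patch_fn(&prog, new_addr, ip);
		if (ret)
			return ret;
	} else {
		memcpy(new_insn, nop_insn, X86_PATCH_SIZE);
	}

	/* Same tail as in your version: verify the expected old bytes,
	 * then patch live or via plain memcpy() on the RW image.
	 */
	ret = -EBUSY;
	mutex_lock(&text_mutex);
	if (memcmp(ip, old_insn, X86_PATCH_SIZE))
		goto out;
	if (text_live)
		text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL);
	else
		memcpy(ip, new_insn, X86_PATCH_SIZE);
	ret = 0;
out:
	mutex_unlock(&text_mutex);
	return ret;
}

The NULL/NULL case degenerates into a nop-to-nop update, which is
harmless, and the memcmp() against the expected old bytes still
catches any mismatch between expectation and reality.
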