On Fri, Jun 3, 2022 at 7:11 AM Eduard Zingerman <eddyz87@xxxxxxxxx> wrote:
> [...]
>
> Measurements using `benchs/run_bench_bpf_loop.sh` inside QEMU / KVM on
> i7-4710HQ CPU show a drop in latency from 14 ns/op to 2 ns/op.
>
> Signed-off-by: Eduard Zingerman <eddyz87@xxxxxxxxx>
> ---

[...]

> -/* maximum number of loops */
> -#define MAX_LOOPS	BIT(23)
> -
>  BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
>  	   u64, flags)
>  {
> @@ -733,9 +731,12 @@ BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
>  	u64 ret;
>  	u32 i;
>
> +	/* note: these safety checks are also verified when bpf_loop is inlined,
> +	 * be careful to modify this code in sync

Let's call out inline_bpf_loop() here to be more clear.

[...]

>
Let's also call out here that this is the inlined version of bpf_iter.c:bpf_loop.
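To make that connection obvious to future readers, maybe also spell out the
intended runtime behaviour. The pseudo-C below is just my sketch (not code
from the patch) of what the emitted sequence does; it mirrors
bpf_iter.c:bpf_loop, minus the flags check, which IIUC is unnecessary here
because inlining only happens when flags are known to be zero:

static long inlined_bpf_loop_semantics(u32 nr_loops,
				       bpf_callback_t callback,
				       void *callback_ctx)
{
	u32 i;

	/* same bound check as in the bpf_loop helper */
	if (nr_loops > BPF_MAX_LOOPS)
		return -E2BIG;

	for (i = 0; i < nr_loops; i++) {
		/* a non-zero callback return value stops the loop */
		if (callback((u64)i, (u64)(long)callback_ctx, 0, 0, 0))
			return i + 1;
	}

	/* R0 ends up holding the number of completed iterations */
	return i;
}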
> +static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
> +					 int position, u32 *cnt)
> +{
> +	struct bpf_insn_aux_data *aux = &env->insn_aux_data[position];
> +	s32 stack_base = aux->loop_inline_state.stack_base;
> +	s32 r6_offset = stack_base + 0 * BPF_REG_SIZE;
> +	s32 r7_offset = stack_base + 1 * BPF_REG_SIZE;
> +	s32 r8_offset = stack_base + 2 * BPF_REG_SIZE;
> +	int reg_loop_max = BPF_REG_6;
> +	int reg_loop_cnt = BPF_REG_7;
> +	int reg_loop_ctx = BPF_REG_8;
> +
> +	struct bpf_prog *new_prog;
> +	u32 callback_subprogno = aux->loop_inline_state.callback_subprogno;
> +	u32 callback_start;
> +	u32 call_insn_offset;
> +	s32 callback_offset;
> +	struct bpf_insn insn_buf[19];
> +	struct bpf_insn *next = insn_buf;
> +	struct bpf_insn *call, *jump_to_end, *loop_header;
> +	struct bpf_insn *jump_to_header, *loop_exit;
> +
> +	/* Return error and jump to the end of the patch if
> +	 * expected number of iterations is too big. This
> +	 * repeats the check done in bpf_loop helper function,
> +	 * be careful to modify this code in sync.
> +	 */
> +	(*next++) = BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2);
> +	(*next++) = BPF_MOV32_IMM(BPF_REG_0, -E2BIG);
> +	jump_to_end = next;
> +	(*next++) = BPF_JMP_IMM(BPF_JA, 0, 0, 0 /* set below */);
> +	/* spill R6, R7, R8 to use these as loop vars */
> +	(*next++) = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset);
> +	(*next++) = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset);
> +	(*next++) = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset);
> +	/* initialize loop vars */
> +	(*next++) = BPF_MOV64_REG(reg_loop_max, BPF_REG_1);
> +	(*next++) = BPF_MOV32_IMM(reg_loop_cnt, 0);
> +	(*next++) = BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3);
> +	/* loop header;
> +	 * if reg_loop_cnt >= reg_loop_max skip the loop body
> +	 */
> +	loop_header = next;
> +	(*next++) = BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max,
> +				0 /* set below */);
> +	/* callback call */
> +	(*next++) = BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt);
> +	(*next++) = BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx);
> +	call = next;
> +	(*next++) = BPF_CALL_REL(0 /* set below after patching */);
> +	/* increment loop counter */
> +	(*next++) = BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1);
> +	/* jump to loop header if callback returned 0 */
> +	jump_to_header = next;
> +	(*next++) = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0 /* set below */);
> +	/* return value of bpf_loop;
> +	 * set R0 to the number of iterations
> +	 */
> +	loop_exit = next;
> +	(*next++) = BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt);
> +	/* restore original values of R6, R7, R8 */
> +	(*next++) = BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset);
> +	(*next++) = BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset);
> +	(*next++) = BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset);
> +
> +	*cnt = next - insn_buf;
> +	if (*cnt > ARRAY_SIZE(insn_buf)) {
> +		WARN_ONCE(1, "BUG %s: 'next' exceeds bounds for 'insn_buf'\n",
> +			  __func__);
> +		return NULL;
> +	}
> +	jump_to_end->off = next - jump_to_end - 1;
> +	loop_header->off = loop_exit - loop_header - 1;
> +	jump_to_header->off = loop_header - jump_to_header - 1;

I know these changes are made based on my feedback on v1. But it seems v1
is actually cleaner. How about we use v1, but add comments on the offsets
that need to be updated when we make changes to insn_buf[]? (A rough sketch
of what I mean follows the quoted function below.)

> +
> +	new_prog = bpf_patch_insn_data(env, position, insn_buf, *cnt);
> +	if (!new_prog)
> +		return new_prog;
> +
> +	/* callback start is known only after patching */
> +	callback_start = env->subprog_info[callback_subprogno].start;
> +	call_insn_offset = position + (call - insn_buf);
> +	callback_offset = callback_start - call_insn_offset - 1;
> +	env->prog->insnsi[call_insn_offset].imm = callback_offset;
> +
> +	return new_prog;
> +}
> +
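To be concrete about the suggestion above, something like the following is
what I have in mind. Treat it as a rough sketch only: the names (reg_loop_*,
r*_offset) are reused from the quoted code, and the jump offsets are
hand-computed for this 19-insn sequence, so they would need re-checking:

	struct bpf_insn insn_buf[] = {
		/* [0..2] bound check; the JA offset 16 jumps over the rest
		 * of the patch (16 == number of instructions after it).
		 */
		BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2),
		BPF_MOV32_IMM(BPF_REG_0, -E2BIG),
		BPF_JMP_IMM(BPF_JA, 0, 0, 16),
		/* [3..5] spill R6, R7, R8 to use these as loop vars */
		BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset),
		BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset),
		BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset),
		/* [6..8] initialize loop vars */
		BPF_MOV64_REG(reg_loop_max, BPF_REG_1),
		BPF_MOV32_IMM(reg_loop_cnt, 0),
		BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3),
		/* [9] loop header: offset 5 skips the body to insn 15
		 * (loop exit) once reg_loop_cnt >= reg_loop_max.
		 */
		BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max, 5),
		/* [10..12] callback call; imm is still patched after
		 * bpf_patch_insn_data(), as in this version.
		 */
		BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt),
		BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx),
		BPF_CALL_REL(0),
		/* [13..14] increment the counter and loop back to insn 9
		 * (offset -6) while the callback keeps returning 0.
		 */
		BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1),
		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -6),
		/* [15..18] set R0 to the iteration count and restore
		 * R6, R7, R8.
		 */
		BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt),
		BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset),
		BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset),
		BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset),
	};

With that, the jump_to_end/loop_header/jump_to_header fixups and the
ARRAY_SIZE() check go away; only the BPF_CALL_REL() imm still needs patching
after bpf_patch_insn_data().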
> */ > @@ -14258,6 +14432,18 @@ static int do_misc_fixups(struct bpf_verifier_env *env) > continue; > } > > + if (insn->imm == BPF_FUNC_loop && > + fit_for_bpf_loop_inline(&env->insn_aux_data[i + delta])) { > + new_prog = inline_bpf_loop(env, i + delta, &cnt); > + if (!new_prog) > + return -ENOMEM; > + > + delta += cnt - 1; > + env->prog = prog = new_prog; > + insn = new_prog->insnsi + i + delta; > + continue; > + } > + > patch_call_imm: > fn = env->ops->get_func_proto(insn->imm, env->prog); > /* all functions that have prototype and verifier allowed > @@ -15030,6 +15216,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr) > if (ret == 0) > ret = check_max_stack_depth(env); > > + if (ret == 0) > + adjust_stack_depth_for_loop_inlining(env); > + > /* instruction rewrites happen after this point */ > if (is_priv) { > if (ret == 0) > -- > 2.25.1 >