On Fri, Jun 3, 2022 at 11:51 AM Eduard Zingerman <eddyz87@xxxxxxxxx> wrote:
>
> Calls to `bpf_loop` are replaced with direct loops to avoid
> indirection. E.g. the following:
>
>   bpf_loop(10, foo, NULL, 0);
>
> Is replaced by equivalent of the following:
>
>   for (int i = 0; i < 10; ++i)
>     foo(i, NULL);
>
> This transformation could be applied when:
> - callback is known and does not change during program execution;
> - flags passed to `bpf_loop` are always zero.
>
> Inlining logic works as follows:
>
> - During execution simulation function `update_loop_inline_state`
>   tracks the following information for each `bpf_loop` call
>   instruction:
>   - is callback known and constant?
>   - are flags constant and zero?
> - Function `adjust_stack_depth_for_loop_inlining` increases stack
>   depth for functions where `bpf_loop` calls could be inlined. This is
>   needed to spill registers R6, R7 and R8. These registers are used as
>   loop counter, loop maximal bound and callback context parameter;
> - Function `inline_bpf_loop` called from `do_misc_fixups` replaces
>   `bpf_loop` calls fit for inlining with corresponding loop
>   instructions.
>
> Measurements using `benchs/run_bench_bpf_loop.sh` inside QEMU / KVM on
> i7-4710HQ CPU show a drop in latency from 14 ns/op to 2 ns/op.
>
> Signed-off-by: Eduard Zingerman <eddyz87@xxxxxxxxx>
> ---
>  include/linux/bpf_verifier.h |  16 +++
>  kernel/bpf/bpf_iter.c        |   9 +-
>  kernel/bpf/verifier.c        | 199 ++++++++++++++++++++++++++++++++++-
>  3 files changed, 215 insertions(+), 9 deletions(-)
>
> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
> index e8439f6cbe57..80279616a76b 100644
> --- a/include/linux/bpf_verifier.h
> +++ b/include/linux/bpf_verifier.h
> @@ -20,6 +20,8 @@
>  #define BPF_MAX_VAR_SIZ	(1 << 29)
>  /* size of type_str_buf in bpf_verifier. */
>  #define TYPE_STR_BUF_LEN 64
> +/* Maximum number of loops for bpf_loop */
> +#define BPF_MAX_LOOPS	BIT(23)

Should this be moved to include/linux/bpf.h since kernel/bpf/bpf_iter.c
also uses this? Then we don't need the "#include <linux/bpf_verifier.h>"
in bpf_iter.c.

>
>  /* Liveness marks, used for registers and spilled-regs (in stack slots).
>   * Read marks propagate upwards until they find a write mark; they record that
> @@ -344,6 +346,16 @@ struct bpf_verifier_state_list {
>  	int miss_cnt, hit_cnt;
>  };
>
> +struct bpf_loop_inline_state {
> +	u32 initialized:1; /* set to true upon first entry */
> +	u32 callback_is_constant:1; /* true if callback function
> +				     * is the same at each call
> +				     */
> +	u32 flags_is_zero:1; /* true if flags register is zero at each call */

I think the more straightforward

	bool initialized;
	bool callback_is_constant;
	bool flags_is_zero;

ends up being the same size as using bit fields. If your intention was
to use bit fields to try to minimize the size of the struct, I think we
could do something like:

	bool initialized:1;
	bool callback_is_constant:1;
	bool flags_is_zero:1;
	/* u16 since bpf_subprog_info.stack_depth is u16; we take the
	 * negative of it whenever we use it since the stack grows
	 * downwards
	 */
	u16 stack_depth;
	u32 callback_subprogno;

to make the struct more compact.

Also, as a side-note, I think if we have an "is_valid" field, then we
don't need both the "callback_is_constant" and "flags_is_zero" fields.
If the flags are not zero or the callback subprogno is not the same on
each invocation of the instruction, then we could represent that by
just setting is_valid to false.
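Putting the bit-field and "is_valid" ideas together, this is roughly
what I have in mind (just a sketch, the "is_valid" name and comments
are mine):

	struct bpf_loop_inline_state {
		bool initialized:1; /* set to true upon first entry */
		bool is_valid:1; /* cleared once flags != 0 or the callback
				  * subprogno differs between call sites
				  */
		/* u16 since bpf_subprog_info.stack_depth is u16; negated
		 * at use sites since the stack grows downwards
		 */
		u16 stack_depth;
		u32 callback_subprogno; /* meaningful only if is_valid */
	};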
> +	u32 callback_subprogno; /* valid when callback_is_constant == 1 */
> +	s32 stack_base; /* stack offset for loop vars */
> +};
> +
>  /* Possible states for alu_state member. */
>  #define BPF_ALU_SANITIZE_SRC		(1U << 0)
>  #define BPF_ALU_SANITIZE_DST		(1U << 1)
> @@ -380,6 +392,10 @@ struct bpf_insn_aux_data {
>  	bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */
>  	bool zext_dst; /* this insn zero extends dst reg */
>  	u8 alu_state; /* used in combination with alu_limit */
> +	/* if instruction is a call to bpf_loop this field tracks
> +	 * the state of the relevant registers to take decision about inlining
> +	 */
> +	struct bpf_loop_inline_state loop_inline_state;

Is placing this inside the union in "struct bpf_insn_aux_data" an option?
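I.e. something like this (untested sketch, assuming none of the
existing union members needs to stay live for a bpf_loop call
instruction):

	struct bpf_insn_aux_data {
		union {
			/* ... existing members ... */
			/* for BPF_FUNC_loop calls */
			struct bpf_loop_inline_state loop_inline_state;
		};
		/* ... */
	};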
>
>  	/* below fields are initialized once */
>  	unsigned int orig_idx; /* original instruction index */
> diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
> index d5d96ceca105..cdb898fce118 100644
> --- a/kernel/bpf/bpf_iter.c
> +++ b/kernel/bpf/bpf_iter.c
> @@ -6,6 +6,7 @@
>  #include <linux/filter.h>
>  #include <linux/bpf.h>
>  #include <linux/rcupdate_trace.h>
> +#include <linux/bpf_verifier.h>
>
>  struct bpf_iter_target_info {
>  	struct list_head list;
> @@ -723,9 +724,6 @@ const struct bpf_func_proto bpf_for_each_map_elem_proto = {
>  	.arg4_type	= ARG_ANYTHING,
>  };
>
> -/* maximum number of loops */
> -#define MAX_LOOPS	BIT(23)
> -
>  BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
>  	   u64, flags)
>  {
> @@ -733,9 +731,12 @@ BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
>  	u64 ret;
>  	u32 i;
>
> +	/* note: these safety checks are also verified when bpf_loop is inlined,
> +	 * be careful to modify this code in sync
> +	 */
>  	if (flags)
>  		return -EINVAL;
> -	if (nr_loops > MAX_LOOPS)
> +	if (nr_loops > BPF_MAX_LOOPS)
>  		return -E2BIG;
>
>  	for (i = 0; i < nr_loops; i++) {
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index aedac2ac02b9..27d78fe6c3f9 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -7103,6 +7103,78 @@ static int check_get_func_ip(struct bpf_verifier_env *env)
>  	return -ENOTSUPP;
>  }
>
> +static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env)
> +{
> +	return &env->insn_aux_data[env->insn_idx];
> +}
> +
> +static bool fit_for_bpf_loop_inline(struct bpf_insn_aux_data *insn_aux)
> +{
> +	return insn_aux->loop_inline_state.initialized &&
> +		insn_aux->loop_inline_state.flags_is_zero &&
> +		insn_aux->loop_inline_state.callback_is_constant;
> +}
> +
> +/* For all sub-programs in the program (including main) checks

nit: "check" without the extra s :)

> + * insn_aux_data to see if there are bpf_loop calls that require
> + * inlining. If such calls are found subprog stack_depth is increased
> + * by the size of 3 registers. Reserved space would be used in the
> + * do_misc_fixups to spill values of the R6, R7, R8 to use these
> + * registers for loop iteration.
> + */
> +static void adjust_stack_depth_for_loop_inlining(struct bpf_verifier_env *env)
> +{
> +	int i, subprog_end, cur_subprog = 0;
> +	struct bpf_subprog_info *subprogs = env->subprog_info;

nit: I think this needs to be above the "int i, subprog_end..." line
since it's longer.

> +	int insn_cnt = env->prog->len;
> +	bool subprog_updated = false;
> +	s32 stack_base;
> +
> +	subprog_end = (env->subprog_cnt > 1
> +		       ? subprogs[cur_subprog + 1].start
> +		       : insn_cnt);
> +	for (i = 0; i < insn_cnt; i++) {
> +		struct bpf_insn_aux_data *aux = &env->insn_aux_data[i];

Maybe this should be "struct bpf_loop_inline_state *inline_state =
&env->insn_aux_data[i].loop_inline_state" since we only use aux for
aux.loop_inline_state.

> +
> +		if (fit_for_bpf_loop_inline(aux)) {
> +			if (!subprog_updated) {
> +				subprog_updated = true;
> +				subprogs[cur_subprog].stack_depth += BPF_REG_SIZE * 3;
> +				stack_base = -subprogs[cur_subprog].stack_depth;
> +			}
> +			aux->loop_inline_state.stack_base = stack_base;
> +		}
> +		if (i == subprog_end - 1) {
> +			subprog_updated = false;
> +			cur_subprog++;
> +			if (cur_subprog < env->subprog_cnt)
> +				subprog_end = subprogs[cur_subprog + 1].start;
> +		}
> +	}
> +
> +	env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;

In the case where a subprogram that is not subprogram 0 is a fit for
the bpf loop inline and thus increases its stack depth, won't
env->prog->aux->stack_depth need to also be updated?

> +}
> +
> +static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno)
> +{
> +	struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state;
> +	struct bpf_reg_state *regs = cur_regs(env);
> +	struct bpf_reg_state *flags_reg = &regs[BPF_REG_4];
> +
> +	int flags_is_zero =
> +		register_is_const(flags_reg) && flags_reg->var_off.value == 0;
> +
> +	if (state->initialized) {
> +		state->flags_is_zero &= flags_is_zero;
> +		state->callback_is_constant &= state->callback_subprogno == subprogno;
> +	} else {
> +		state->initialized = 1;
> +		state->callback_is_constant = 1;
> +		state->flags_is_zero = flags_is_zero;
> +		state->callback_subprogno = subprogno;
> +	}
> +}
> +
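Following up on the "is_valid" suggestion above: with a single validity
bit, I think this merge logic could collapse to something like the
following (untested sketch, assuming the struct layout I sketched
earlier):

	static void update_loop_inline_state(struct bpf_verifier_env *env,
					     u32 subprogno)
	{
		struct bpf_loop_inline_state *state =
			&cur_aux(env)->loop_inline_state;
		struct bpf_reg_state *flags_reg = &cur_regs(env)[BPF_REG_4];
		/* flags must be a known constant zero at every call site */
		bool flags_is_zero = register_is_const(flags_reg) &&
				     flags_reg->var_off.value == 0;

		if (!state->initialized) {
			state->initialized = 1;
			state->is_valid = flags_is_zero;
			state->callback_subprogno = subprogno;
			return;
		}
		/* invalidate once any call site disagrees */
		if (!flags_is_zero || state->callback_subprogno != subprogno)
			state->is_valid = 0;
	}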
>  static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
>  			     int *insn_idx_p)
>  {
> @@ -7255,6 +7327,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
>  		err = check_bpf_snprintf_call(env, regs);
>  		break;
>  	case BPF_FUNC_loop:
> +		update_loop_inline_state(env, meta.subprogno);
>  		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
>  					set_loop_callback_state);
>  		break;
> @@ -7661,11 +7734,6 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env,
>  	return true;
>  }
>
> -static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env)
> -{
> -	return &env->insn_aux_data[env->insn_idx];
> -}
> -
>  enum {
>  	REASON_BOUNDS	= -1,
>  	REASON_TYPE	= -2,
> @@ -12920,6 +12988,22 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
>  	return new_prog;
>  }
>
> +static void adjust_loop_inline_subprogno(struct bpf_verifier_env *env,
> +					 u32 first_removed,
> +					 u32 first_remaining)
> +{
> +	int delta = first_remaining - first_removed;
> +
> +	for (int i = 0; i < env->prog->len; ++i) {
> +		struct bpf_loop_inline_state *state =
> +			&env->insn_aux_data[i].loop_inline_state;
> +
> +		if (state->initialized &&
> +		    state->callback_subprogno >= first_remaining)
> +			state->callback_subprogno -= delta;
> +	}
> +}
> +
>  static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env,
>  					      u32 off, u32 cnt)
>  {
> @@ -12963,6 +13047,8 @@ static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env,
>  			 * in adjust_btf_func() - no need to adjust
>  			 */
>  		}
> +
> +		adjust_loop_inline_subprogno(env, i, j);
>  	} else {
>  		/* convert i from "first prog to remove" to "first to adjust" */
>  		if (env->subprog_info[i].start == off)
> @@ -13773,6 +13859,94 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env,
>  	return 0;
>  }
>
> +static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
> +					int position, u32 *cnt)
> +{
> +	struct bpf_insn_aux_data *aux = &env->insn_aux_data[position];
> +	s32 stack_base = aux->loop_inline_state.stack_base;
> +	s32 r6_offset = stack_base + 0 * BPF_REG_SIZE;
> +	s32 r7_offset = stack_base + 1 * BPF_REG_SIZE;
> +	s32 r8_offset = stack_base + 2 * BPF_REG_SIZE;
> +	int reg_loop_max = BPF_REG_6;
> +	int reg_loop_cnt = BPF_REG_7;
> +	int reg_loop_ctx = BPF_REG_8;
> +
> +	struct bpf_prog *new_prog;
> +	u32 callback_subprogno = aux->loop_inline_state.callback_subprogno;
> +	u32 callback_start;
> +	u32 call_insn_offset;
> +	s32 callback_offset;
> +	struct bpf_insn insn_buf[19];
> +	struct bpf_insn *next = insn_buf;
> +	struct bpf_insn *call, *jump_to_end, *loop_header;
> +	struct bpf_insn *jump_to_header, *loop_exit;
> +
> +	/* Return error and jump to the end of the patch if
> +	 * expected number of iterations is too big.  This
> +	 * repeats the check done in bpf_loop helper function,
> +	 * be careful to modify this code in sync.
> +	 */
> +	(*next++) = BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2);
> +	(*next++) = BPF_MOV32_IMM(BPF_REG_0, -E2BIG);
> +	jump_to_end = next;
> +	(*next++) = BPF_JMP_IMM(BPF_JA, 0, 0, 0 /* set below */);
> +	/* spill R6, R7, R8 to use these as loop vars */
> +	(*next++) = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset);
> +	(*next++) = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset);
> +	(*next++) = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset);
> +	/* initialize loop vars */
> +	(*next++) = BPF_MOV64_REG(reg_loop_max, BPF_REG_1);
> +	(*next++) = BPF_MOV32_IMM(reg_loop_cnt, 0);
> +	(*next++) = BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3);
> +	/* loop header;
> +	 * if reg_loop_cnt >= reg_loop_max skip the loop body
> +	 */
> +	loop_header = next;
> +	(*next++) = BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max,
> +				0 /* set below */);
> +	/* callback call */
> +	(*next++) = BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt);
> +	(*next++) = BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx);
> +	call = next;
> +	(*next++) = BPF_CALL_REL(0 /* set below after patching */);
> +	/* increment loop counter */
> +	(*next++) = BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1);
> +	/* jump to loop header if callback returned 0 */
> +	jump_to_header = next;
> +	(*next++) = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0 /* set below */);
> +	/* return value of bpf_loop;
> +	 * set R0 to the number of iterations
> +	 */
> +	loop_exit = next;
> +	(*next++) = BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt);
> +	/* restore original values of R6, R7, R8 */
> +	(*next++) = BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset);
> +	(*next++) = BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset);
> +	(*next++) = BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset);
> +
> +	*cnt = next - insn_buf;
> +	if (*cnt > ARRAY_SIZE(insn_buf)) {
> +		WARN_ONCE(1, "BUG %s: 'next' exceeds bounds for 'insn_buf'\n",
> +			  __func__);
> +		return NULL;
> +	}
> +	jump_to_end->off = next - jump_to_end - 1;
> +	loop_header->off = loop_exit - loop_header - 1;
> +	jump_to_header->off = loop_header - jump_to_header - 1;
> +
> +	new_prog = bpf_patch_insn_data(env, position, insn_buf, *cnt);
> +	if (!new_prog)
> +		return new_prog;
> +
> +	/* callback start is known only after patching */
> +	callback_start = env->subprog_info[callback_subprogno].start;
> +	call_insn_offset = position + (call - insn_buf);
> +	callback_offset = callback_start - call_insn_offset - 1;
> +	env->prog->insnsi[call_insn_offset].imm = callback_offset;
> +
> +	return new_prog;
> +}
> +
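To double-check my reading of the generated instructions: the inlined
sequence behaves roughly like the following C (a sketch only; the
function name is made up and the R6-R8 spill/fill is noted in comments
rather than modeled):

	static long inlined_bpf_loop(u64 nr_loops,
				     long (*cb)(u64 idx, void *ctx),
				     void *ctx)
	{
		u64 i = 0; /* reg_loop_cnt (R7); nr_loops is reg_loop_max (R6) */

		if (nr_loops > BPF_MAX_LOOPS)
			return -E2BIG; /* via BPF_MOV32_IMM, so R0 is zero-extended */
		/* original R6, R7, R8 are spilled to the reserved stack slots here */
		while (i < nr_loops) {
			long ret = cb(i, ctx); /* R1 = i, R2 = ctx */

			i++;
			if (ret) /* non-zero callback return exits the loop */
				break;
		}
		/* R6, R7, R8 are restored here */
		return i; /* R0 = number of iterations performed */
	}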
>  /* Do various post-verification rewrites in a single program pass.
>   * These rewrites simplify JIT and interpreter implementations.
>   */
> @@ -14258,6 +14432,18 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
>  			continue;
>  		}
>
> +		if (insn->imm == BPF_FUNC_loop &&
> +		    fit_for_bpf_loop_inline(&env->insn_aux_data[i + delta])) {
> +			new_prog = inline_bpf_loop(env, i + delta, &cnt);
> +			if (!new_prog)
> +				return -ENOMEM;
> +
> +			delta += cnt - 1;
> +			env->prog = prog = new_prog;
> +			insn = new_prog->insnsi + i + delta;
> +			continue;
> +		}
> +
>  patch_call_imm:
>  		fn = env->ops->get_func_proto(insn->imm, env->prog);
>  		/* all functions that have prototype and verifier allowed
> @@ -15030,6 +15216,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
>  	if (ret == 0)
>  		ret = check_max_stack_depth(env);
>
> +	if (ret == 0)
> +		adjust_stack_depth_for_loop_inlining(env);

Do we need to do this before the check_max_stack_depth() call above,
since adjust_stack_depth_for_loop_inlining() adjusts the stack depth?

> +
>  	/* instruction rewrites happen after this point */
>  	if (is_priv) {
>  		if (ret == 0)
> --
> 2.25.1
>