On Sun, Jun 9, 2024 at 10:18 PM Yonghong Song <yonghong.song@xxxxxxxxx> wrote:
>

I think "shadow stack" already has at least two different meanings in the
kernel. Let's avoid adding a 3rd. How about "divided stack"?

> +static void emit_percpu_shadow_frame_ptr(u8 **pprog, void *shadow_frame_ptr)
> +{
> +        u8 *prog = *pprog;
> +
> +        /* movabs r9, shadow_frame_ptr */
> +        emit_mov_imm32(&prog, false, X86_REG_R9, (u32) (long) shadow_frame_ptr);
> +
> +        /* add <r9>, gs:[<off>] */
> +        EMIT2(0x65, 0x4c);
> +        EMIT3(0x03, 0x0c, 0x25);
> +        EMIT((u32)(unsigned long)&this_cpu_off, 4);

I think this can be one insn:
  lea r9, gs:[(u32)shadow_frame_ptr]

> +        if (stack_depth && enable_shadow_stack) {

I think enabling it for progs with small stack usage is unnecessary,
though the definition of "small" is complicated. I feel stack_depth <= 64
can stay as-is, and all networking progs don't have to use it either,
since they're called from known places. Tracing progs, on the other hand,
can be anywhere, so I'd enable the divided stack for stack_depth > 64 &&
prog_type == kprobe, tp, raw_tp, tracing, perf_event (rough sketch of
that check further down in this mail).

> +                if (bpf_prog->percpu_shadow_stack_ptr) {
> +                        percpu_shadow_stack_ptr = bpf_prog->percpu_shadow_stack_ptr;
> +                } else {
> +                        percpu_shadow_stack_ptr = __alloc_percpu_gfp(stack_depth, 8, GFP_KERNEL);
> +                        if (!percpu_shadow_stack_ptr)
> +                                return -ENOMEM;
> +                        bpf_prog->percpu_shadow_stack_ptr = percpu_shadow_stack_ptr;
> +                }
> +                shadow_frame_ptr = percpu_shadow_stack_ptr + round_up(stack_depth, 8);
> +                stack_depth = 0;
> +        } else {
> +                enable_shadow_stack = 0;
> +        }
> +
>         arena_vm_start = bpf_arena_get_kern_vm_start(bpf_prog->aux->arena);
>         user_vm_start = bpf_arena_get_user_vm_start(bpf_prog->aux->arena);
>
> @@ -1342,7 +1377,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
>         /* tail call's presence in current prog implies it is reachable */
>         tail_call_reachable |= tail_call_seen;
>
> -        emit_prologue(&prog, bpf_prog->aux->stack_depth,
> +        emit_prologue(&prog, stack_depth,
>                       bpf_prog_was_classic(bpf_prog), tail_call_reachable,
>                       bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb);
>         /* Exception callback will clobber callee regs for its own use, and
> @@ -1364,6 +1399,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
>                 emit_mov_imm64(&prog, X86_REG_R12,
>                                arena_vm_start >> 32, (u32) arena_vm_start);
>
> +        if (enable_shadow_stack)
> +                emit_percpu_shadow_frame_ptr(&prog, shadow_frame_ptr);
> +
>         ilen = prog - temp;
>         if (rw_image)
>                 memcpy(rw_image + proglen, temp, ilen);
> @@ -1383,6 +1421,14 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
>                 u8 *func;
>                 int nops;
>
> +                if (enable_shadow_stack) {
> +                        if (src_reg == BPF_REG_FP)
> +                                src_reg = X86_REG_R9;
> +
> +                        if (dst_reg == BPF_REG_FP)
> +                                dst_reg = X86_REG_R9;

The verifier will reject a prog that attempts to write into R10,
so the above shouldn't be necessary.
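
Back to the enable condition further up: something along these lines is
what I have in mind. Untested sketch only; bpf_enable_divided_stack() is
a made-up name and the exact prog-type list is open for debate:

/* Hypothetical helper: decide whether do_jit() should switch this prog
 * to the per-cpu divided stack.  Small-stack progs and networking progs
 * keep using the normal kernel stack; only large-stack tracing progs,
 * which can fire from nearly arbitrary contexts, opt in.
 */
static bool bpf_enable_divided_stack(struct bpf_prog *prog)
{
        if (prog->aux->stack_depth <= 64)
                return false;

        switch (prog->type) {
        case BPF_PROG_TYPE_KPROBE:
        case BPF_PROG_TYPE_TRACEPOINT:
        case BPF_PROG_TYPE_RAW_TRACEPOINT:
        case BPF_PROG_TYPE_TRACING:
        case BPF_PROG_TYPE_PERF_EVENT:
                return true;
        default:
                return false;
        }
}

do_jit() could then derive enable_shadow_stack from that single check
instead of keying off stack_depth alone.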
> +                }
>
>                 switch (insn->code) {
>                         /* ALU */
>                 case BPF_ALU | BPF_ADD | BPF_X:
> @@ -2014,6 +2060,7 @@ st:                     if (is_imm8(insn->off))
>                         emit_mov_reg(&prog, is64, real_src_reg, BPF_REG_0);
>                         /* Restore R0 after clobbering RAX */
>                         emit_mov_reg(&prog, true, BPF_REG_0, BPF_REG_AX);
> +
>                         break;
>                 }
>
> @@ -2038,14 +2085,20 @@ st:                    if (is_imm8(insn->off))
>
>                         func = (u8 *) __bpf_call_base + imm32;
>                         if (tail_call_reachable) {
> -                                RESTORE_TAIL_CALL_CNT(bpf_prog->aux->stack_depth);
> +                                RESTORE_TAIL_CALL_CNT(stack_depth);
>                                 ip += 7;
>                         }
>                         if (!imm32)
>                                 return -EINVAL;
> +                        if (enable_shadow_stack) {
> +                                EMIT2(0x41, 0x51);
> +                                ip += 2;
> +                        }
>                         ip += x86_call_depth_emit_accounting(&prog, func, ip);
>                         if (emit_call(&prog, func, ip))
>                                 return -EINVAL;
> +                        if (enable_shadow_stack)
> +                                EMIT2(0x41, 0x59);

Push/pop around calls are a load/store plus math on %rsp. I think it's
cheaper to reload r9 after the call with a single insn. The reload of r9
is effectively gs + const; there is no memory access, so it should be
faster.

Technically we can replace all uses of R10 == rbp with 'gs:'-based
instructions. Like:
  r1 = r10
can be jitted into
  lea rdi, gs + (u32)shadow_frame_ptr
and
  r0 = *(u32 *)(r10 - 64)
can be jitted into
  mov rax, dword ptr gs:[(u32)shadow_frame_ptr - 64]
but that is probably a bunch of JIT changes, so I'd start with a simple
reload of r9 after each call. We need to micro-benchmark it to make sure
there is no perf overhead.
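
Concretely, the call path in the hunk above could drop the
EMIT2(0x41, 0x51)/EMIT2(0x41, 0x59) push/pop (and the ip += 2 adjustment)
and re-derive r9 right after the call instead. Untested sketch, reusing
emit_percpu_shadow_frame_ptr() from this patch (or whatever single-insn
form it ends up being):

                        ip += x86_call_depth_emit_accounting(&prog, func, ip);
                        if (emit_call(&prog, func, ip))
                                return -EINVAL;
                        /* The callee may clobber r9, so recompute the
                         * per-cpu frame pointer here instead of
                         * saving/restoring it across the call.
                         */
                        if (enable_shadow_stack)
                                emit_percpu_shadow_frame_ptr(&prog, shadow_frame_ptr);

Whether that's actually cheaper than push/pop is exactly what the
micro-benchmark should tell us.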