Re: [PATCH bpf-next v2 1/4] bpf, x64: Fix tailcall infinite loop caused by freplace

Leon Hwang <leon.hwang@xxxxxxxxx> · Sun, 15 Sep 2024 21:00:31 +0800

On 2024/9/14 03:28, Alexei Starovoitov wrote:
> On Sun, Sep 1, 2024 at 6:41 AM Leon Hwang <leon.hwang@xxxxxxxxx> wrote:
>>
>> @@ -573,10 +575,13 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
>>
>>         /*
>>          * See emit_prologue(), for IBT builds the trampoline hook is preceded
>> -        * with an ENDBR instruction.
>> +        * with an ENDBR instruction and 3 bytes tail_call_cnt initialization
>> +        * instruction.
>>          */
>>         if (is_endbr(*(u32 *)ip))
>>                 ip += ENDBR_INSN_SIZE;
>> +       if (is_bpf_text_address((long)ip))
>> +               ip += X86_POKE_EXTRA;
> 
> This is a foot gun.
> bpf_arch_text_poke() is used not only at the beginning of the function.
> So unconditional ip += 3 is not just puzzling with 'what is this for',
> but dangerous and wasteful...
> 
>> @@ -2923,6 +2930,8 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
>>                  */
>>                 if (is_endbr(*(u32 *)orig_call))
>>                         orig_call += ENDBR_INSN_SIZE;
>> +               if (is_bpf_text_address((long)orig_call))
>> +                       orig_call += X86_POKE_EXTRA;
>>                 orig_call += X86_PATCH_SIZE;
>>         }
> 
> ..this bit needs to be hacked too...
> 
>> @@ -3025,6 +3034,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
>>                 /* remember return value in a stack for bpf prog to access */
>>                 emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
>>                 im->ip_after_call = image + (prog - (u8 *)rw_image);
>> +               emit_nops(&prog, X86_POKE_EXTRA);
>>                 emit_nops(&prog, X86_PATCH_SIZE);
> 
> And this is just pure waste of kernel code and cpu run-time.
> 
> You're adding 3 byte nop for no reason at all.
> 
> See commit e21aa341785c ("bpf: Fix fexit trampoline.")
> that added:
>                 int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP,
>                                              NULL, im->ip_epilogue);
> logic that is patching bpf trampoline in the middle of it.
> (not at the start).
> 
> Because of unconditional +=3 in bpf_arch_text_poke() every trampoline
> will have to waste nop3 ?
> No.
> 
> Please fix freplace and tail call combination without
> this kind of unacceptable shortcuts.
> 
> I very much prefer to stop hacking into JITs and trampolines because
> tailcalls and freplace don't work well together.
> 
> We cannot completely disable that combination because libxdp
> is using freplace to populate chain of progs the main prog is calling
> and these freplace progs might be doing tailcall,
> but we can still prevent such infinite loop that you describe in commit log:
> entry_tc -> subprog_tc -> entry_freplace -> subprog_tail --tailcall-> entry_tc
> in the verifier without resorting to JIT hacks.
> 
> pw-bot: cr

IIUC, it's going to prevent this niche case at update/attach time, that
a prog cannot be updated to prog_array map and be extended by freplace
prog at the same time.

More specific explanation:

1. If a prog has been updated to prog_array, it and its subprog cannot
   be extended by freplace prog.
2. If a prog or its subprog has been extended by freplace prog, it
   cannot be updated to prog_array map.

Then, I do a POC[0] to prevent this niche case. And the POC does not
break any selftest.

[0] https://github.com/kernel-patches/bpf/pull/7732

Here's the diff of the POC.

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0c3893c471711..b864b37e67c17 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1483,6 +1483,8 @@ struct bpf_prog_aux {
 	bool xdp_has_frags;
 	bool exception_cb;
 	bool exception_boundary;
+	bool is_extended; /* true if extended by freplace program */
+	atomic_t tail_callee_cnt;
 	struct bpf_arena *arena;
 	/* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
 	const struct btf_type *attach_func_proto;
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 79660e3fca4c1..be41240d4fb3a 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -951,18 +951,27 @@ static void *prog_fd_array_get_ptr(struct bpf_map
*map,
 	if (IS_ERR(prog))
 		return prog;

-	if (!bpf_prog_map_compatible(map, prog)) {
+	if (!bpf_prog_map_compatible(map, prog) || prog->aux->is_extended) {
+		/* Extended prog can not be tail callee. It's to prevent a
+		 * potential infinite loop like:
+		 * tail callee prog entry -> tail callee prog subprog ->
+		 * freplace prog entry --tailcall-> tail callee prog entry.
+		 */
 		bpf_prog_put(prog);
 		return ERR_PTR(-EINVAL);
 	}

+	atomic_inc(&prog->aux->tail_callee_cnt);
 	return prog;
 }

 static void prog_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool
need_defer)
 {
+	struct bpf_prog *prog = ptr;
+
 	/* bpf_prog is freed after one RCU or tasks trace grace period */
-	bpf_prog_put(ptr);
+	atomic_dec(&prog->aux->tail_callee_cnt);
+	bpf_prog_put(prog);
 }

 static u32 prog_fd_array_sys_lookup_elem(void *ptr)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 8a4117f6d7610..be829016d8182 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3292,8 +3292,11 @@ static void bpf_tracing_link_release(struct
bpf_link *link)
 	bpf_trampoline_put(tr_link->trampoline);

 	/* tgt_prog is NULL if target is a kernel function */
-	if (tr_link->tgt_prog)
+	if (tr_link->tgt_prog) {
+		if (link->prog->type == BPF_PROG_TYPE_EXT)
+			tr_link->tgt_prog->aux->is_extended = false;
 		bpf_prog_put(tr_link->tgt_prog);
+	}
 }

 static void bpf_tracing_link_dealloc(struct bpf_link *link)
@@ -3498,6 +3501,18 @@ static int bpf_tracing_prog_attach(struct
bpf_prog *prog,
 		tgt_prog = prog->aux->dst_prog;
 	}

+	if (prog->type == BPF_PROG_TYPE_EXT &&
+	    atomic_read(&tgt_prog->aux->tail_callee_cnt)) {
+		/* Program extensions can not extend target prog when the target
+		 * prog has been updated to any prog_array map as tail callee.
+		 * It's to prevent a potential infinite loop like:
+		 * tgt prog entry -> tgt prog subprog -> freplace prog entry
+		 * --tailcall-> tgt prog entry.
+		 */
+		err = -EINVAL;
+		goto out_unlock;
+	}
+
 	err = bpf_link_prime(&link->link.link, &link_primer);
 	if (err)
 		goto out_unlock;
@@ -3523,6 +3538,8 @@ static int bpf_tracing_prog_attach(struct bpf_prog
*prog,
 	if (prog->aux->dst_trampoline && tr != prog->aux->dst_trampoline)
 		/* we allocated a new trampoline, so free the old one */
 		bpf_trampoline_put(prog->aux->dst_trampoline);
+	if (prog->type == BPF_PROG_TYPE_EXT)
+		tgt_prog->aux->is_extended = true;

 	prog->aux->dst_prog = NULL;
 	prog->aux->dst_trampoline = NULL;

Thanks,
Leon