Re: [PATCH bpf-next 2/4] bpf, x64: Fix tailcall hierarchy

On 2024/2/15 07:16, Alexei Starovoitov wrote:
> On Tue, Feb 13, 2024 at 9:47 PM Leon Hwang <hffilwlqm@xxxxxxxxx> wrote:
>>
>>
>>
>> On 2024/1/5 12:15, Alexei Starovoitov wrote:
>>> On Thu, Jan 4, 2024 at 6:23 AM Leon Hwang <hffilwlqm@xxxxxxxxx> wrote:
>>>>
>>>>
>>>
>>> Other alternatives?
>>
>> I've finished the POC of an alternative, which passed all tailcall
>> selftests, including the tailcall hierarchy ones.
>>
>> In this alternative, I use a new bpf_prog_run_ctx to wrap the original
>> ctx and the tcc_ptr; the JIT then gets the tcc_ptr and recovers the
>> original ctx.
>>
>> Then, to avoid breaking the tailcall runtime on other architectures, I
>> add an arch-specific check, bpf_jit_supports_tail_call_cnt_ptr(), to
>> determine whether to use bpf_prog_run_ctx.
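
(Roughly, the idea looks like this -- a sketch only, the field names are
illustrative and not the exact code from the POC:

struct bpf_prog_run_ctx {
	void *ctx;	/* original ctx passed in by the caller */
	u32 *tcc_ptr;	/* tail_call_cnt pointer for the JITed prog */
};

The JITed prologue then loads tcc_ptr from the wrapper and restores the
original ctx as the first argument before the prog body runs.)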
>>

[SNIP]

>> +
>> +       if (bpf_jit_supports_tail_call_cnt_ptr() && prog->jited)
>> +               ret = dfunc(&run_ctx, prog->insnsi, prog->bpf_func);
>> +       else
>> +               ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
> 
> This is no good either.
> We cannot introduce two extra run-time checks before calling every bpf prog.
> The solution must be overhead free for common cases.
> 
> Can we switch to percpu tail_call_cnt instead of on stack and %rax tricks ?
> 

Good idea to use percpu tail_call_cnt.

I did another POC that uses a percpu tail_call_cnt, and it passed all
tailcall selftests too.

In this POC, in order to prepare tcc_ptr in the prologue emitted by the
x86 JIT, the prologue calls bpf_tail_call_cnt_prepare() to get the
pointer to the percpu tail_call_cnt and stores that pointer in %rax.
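
So, for the entry prog of a tailcall context, the prologue after the
fixup looks roughly like this (a sketch only; endbr64/CFI elided, byte
counts matching the new X86_TAIL_CALL_OFFSET of 14 + ENDBR_INSN_SIZE):

	nop5				/* fentry patch site, X86_PATCH_SIZE */
	call bpf_tail_call_cnt_prepare	/* poked in later; %rax = percpu tcc_ptr, *tcc_ptr = 0 */
	push rbp			/* 1 byte */
	mov rbp, rsp			/* 3 bytes */
	...				/* tailcalls land right after these 14(+ENDBR) bytes */

For subprogs the call site stays a 5-byte nop, and for cBPF-converted
progs it is not emitted at all.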

Here's the diff:

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 4065bdcc5b2a4..fc1df6a7d87c9 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -241,6 +241,8 @@ int bpf_arch_text_invalidate(void *dst, size_t len)
 }

 struct jit_context {
+	int prologue_tail_call_offset;
+
 	int cleanup_addr; /* Epilogue code offset */

 	/*
@@ -250,6 +252,8 @@ struct jit_context {
 	 */
 	int tail_call_direct_label;
 	int tail_call_indirect_label;
+
+	bool tail_call_reachable;
 };

 /* Maximum number of bytes emitted while JITing one eBPF insn */
@@ -259,7 +263,7 @@ struct jit_context {
 /* Number of bytes emit_patch() needs to generate instructions */
 #define X86_PATCH_SIZE		5
 /* Number of bytes that will be skipped on tailcall */
-#define X86_TAIL_CALL_OFFSET	(22 + ENDBR_INSN_SIZE)
+#define X86_TAIL_CALL_OFFSET	(14 + ENDBR_INSN_SIZE)

 static void push_r12(u8 **pprog)
 {
@@ -389,6 +393,19 @@ static void emit_cfi(u8 **pprog, u32 hash)
 	*pprog = prog;
 }

+DEFINE_PER_CPU(u32, bpf_tail_call_cnt);
+
+__attribute__((used))
+static u32 *bpf_tail_call_cnt_prepare(void)
+{
+	u32 *tcc_ptr = this_cpu_ptr(&bpf_tail_call_cnt);
+
+	/* Initialise tail_call_cnt. */
+	*tcc_ptr = 0;
+
+	return tcc_ptr;
+}
+
 /*
  * Emit x86-64 prologue code for BPF program.
  * bpf_tail_call helper will skip the first X86_TAIL_CALL_OFFSET bytes
@@ -396,7 +413,7 @@ static void emit_cfi(u8 **pprog, u32 hash)
  */
 static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
 			  bool tail_call_reachable, bool is_subprog,
-			  bool is_exception_cb)
+			  bool is_exception_cb, struct jit_context *ctx)
 {
 	u8 *prog = *pprog;

@@ -406,21 +423,15 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
 	 */
 	emit_nops(&prog, X86_PATCH_SIZE);
 	if (!ebpf_from_cbpf) {
-		if (tail_call_reachable && !is_subprog) {
-			/* When it's the entry of the whole tailcall context,
-			 * zeroing rax means initialising tail_call_cnt.
-			 */
-			EMIT2(0x31, 0xC0);       /* xor eax, eax */
-			EMIT1(0x50);             /* push rax */
-			/* Make rax as ptr that points to tail_call_cnt. */
-			EMIT3(0x48, 0x89, 0xE0); /* mov rax, rsp */
-			EMIT1_off32(0xE8, 2);    /* call main prog */
-			EMIT1(0x59);             /* pop rcx, get rid of tail_call_cnt */
-			EMIT1(0xC3);             /* ret */
-		} else {
-			/* Keep the same instruction size. */
-			emit_nops(&prog, 13);
-		}
+		/* These 5-byte nops are a placeholder for emit_call(); they
+		 * will be patched later to call bpf_tail_call_cnt_prepare().
+		 *
+		 * After calling bpf_tail_call_cnt_prepare(), %rax will hold
+		 * the tail_call_cnt pointer that points to an initialised
+		 * PER-CPU tail_call_cnt.
+		 */
+		ctx->prologue_tail_call_offset = prog - *pprog;
+		emit_nops(&prog, X86_PATCH_SIZE);
 	}
 	/* Exception callback receives FP as third parameter */
 	if (is_exception_cb) {
@@ -583,6 +594,17 @@ static void emit_return(u8 **pprog, u8 *ip)
 	*pprog = prog;
 }

+static void bpf_tail_call_prologue_fixup(u8 *image, struct bpf_prog *prog,
+					 struct jit_context *ctx)
+{
+	bool ebpf_from_cbpf = bpf_prog_was_classic(prog);
+	u8 *ip = image + ctx->prologue_tail_call_offset;
+
+	if (!ebpf_from_cbpf && ctx->tail_call_reachable && !bpf_is_subprog(prog))
+		__bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL,
+				     bpf_tail_call_cnt_prepare);
+}
+
 /*
  * Generate the following code:
  *
@@ -1165,10 +1187,12 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image

 	/* tail call's presence in current prog implies it is reachable */
 	tail_call_reachable |= tail_call_seen;
+	ctx->tail_call_reachable = tail_call_reachable;

 	emit_prologue(&prog, bpf_prog->aux->stack_depth,
 		      bpf_prog_was_classic(bpf_prog), tail_call_reachable,
-		      bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb);
+		      bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb,
+		      ctx);
 	/* Exception callback will clobber callee regs for its own use, and
 	 * restore the original callee regs from main prog's stack frame.
 	 */
@@ -3097,6 +3121,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 			}

 			bpf_tail_call_direct_fixup(prog);
+			bpf_tail_call_prologue_fixup(image, prog, &ctx);
 		} else {
 			jit_data->addrs = addrs;
 			jit_data->ctx = ctx;
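
The tailcall emitters themselves are not in this diff; with %rax holding
tcc_ptr, the limit check they perform is, in C terms, roughly (a sketch,
not part of the diff):

	if (*tcc_ptr >= MAX_TAIL_CALL_CNT)
		goto out;	/* limit hit: skip the tailcall */
	(*tcc_ptr)++;
	/* ... then jump to target prog + X86_TAIL_CALL_OFFSET */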

Thanks,
Leon



