On Fri, Apr 5, 2024 at 2:17 AM Puranjay Mohan <puranjay12@xxxxxxxxx> wrote: > > Support an instruction for resolving absolute addresses of per-CPU > data from their per-CPU offsets. This instruction is internal-only and > users are not allowed to use them directly. They will only be used for > internal inlining optimizations for now between BPF verifier and BPF > JITs. > > Since commit 7158627686f0 ("arm64: percpu: implement optimised pcpu > access using tpidr_el1"), the per-cpu offset for the CPU is stored in > the tpidr_el1/2 register of that CPU. > > To support this BPF instruction in the ARM64 JIT, the following ARM64 > instructions are emitted: > > mov dst, src // Move src to dst, if src != dst > mrs tmp, tpidr_el1/2 // Move per-cpu offset of the current cpu in tmp. > add dst, dst, tmp // Add the per cpu offset to the dst. > > If CONFIG_SMP is not defined, then nothing is emitted if src == dst, and > mov dst, src is emitted if dst != src. > > To measure the performance improvement provided by this change, the > benchmark in [1] was used: > > Before: > glob-arr-inc : 23.597 ± 0.012M/s > arr-inc : 23.173 ± 0.019M/s > hash-inc : 12.186 ± 0.028M/s > > After: > glob-arr-inc : 23.819 ± 0.034M/s > arr-inc : 23.285 ± 0.017M/s > hash-inc : 12.419 ± 0.011M/s > > [1] https://github.com/anakryiko/linux/commit/8dec900975ef You don't see as big a gain, because bpf_get_smp_processor_id() is not inlined yet on arm64. But even without it I expected bigger gains. Could you do 'perf report' before/after? Just want to see what's on top. 
> > Signed-off-by: Puranjay Mohan <puranjay12@xxxxxxxxx> > --- > arch/arm64/include/asm/insn.h | 7 +++++++ > arch/arm64/lib/insn.c | 11 +++++++++++ > arch/arm64/net/bpf_jit.h | 6 ++++++ > arch/arm64/net/bpf_jit_comp.c | 16 ++++++++++++++++ > 4 files changed, 40 insertions(+) > > diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h > index db1aeacd4cd9..d16d68550c22 100644 > --- a/arch/arm64/include/asm/insn.h > +++ b/arch/arm64/include/asm/insn.h > @@ -135,6 +135,11 @@ enum aarch64_insn_special_register { > AARCH64_INSN_SPCLREG_SP_EL2 = 0xF210 > }; > > +enum aarch64_insn_system_register { > + AARCH64_INSN_SYSREG_TPIDR_EL1 = 0xC684, > + AARCH64_INSN_SYSREG_TPIDR_EL2 = 0xE682, > +}; > + > enum aarch64_insn_variant { > AARCH64_INSN_VARIANT_32BIT, > AARCH64_INSN_VARIANT_64BIT > @@ -686,6 +691,8 @@ u32 aarch64_insn_gen_cas(enum aarch64_insn_register result, > } > #endif > u32 aarch64_insn_gen_dmb(enum aarch64_insn_mb_type type); > +u32 aarch64_insn_gen_mrs(enum aarch64_insn_register result, > + enum aarch64_insn_system_register sysreg); > > s32 aarch64_get_branch_offset(u32 insn); > u32 aarch64_set_branch_offset(u32 insn, s32 offset); > diff --git a/arch/arm64/lib/insn.c b/arch/arm64/lib/insn.c > index a635ab83fee3..b008a9b46a7f 100644 > --- a/arch/arm64/lib/insn.c > +++ b/arch/arm64/lib/insn.c > @@ -1515,3 +1515,14 @@ u32 aarch64_insn_gen_dmb(enum aarch64_insn_mb_type type) > > return insn; > } > + > +u32 aarch64_insn_gen_mrs(enum aarch64_insn_register result, > + enum aarch64_insn_system_register sysreg) > +{ > + u32 insn = aarch64_insn_get_mrs_value(); > + > + insn &= ~GENMASK(19, 0); > + insn |= sysreg << 5; > + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, > + insn, result); > +} > diff --git a/arch/arm64/net/bpf_jit.h b/arch/arm64/net/bpf_jit.h > index 23b1b34db088..b627ef7188c7 100644 > --- a/arch/arm64/net/bpf_jit.h > +++ b/arch/arm64/net/bpf_jit.h > @@ -297,4 +297,10 @@ > #define A64_ADR(Rd, offset) \ > 
aarch64_insn_gen_adr(0, offset, Rd, AARCH64_INSN_ADR_TYPE_ADR) > > +/* MRS */ > +#define A64_MRS_TPIDR_EL1(Rt) \ > + aarch64_insn_gen_mrs(Rt, AARCH64_INSN_SYSREG_TPIDR_EL1) > +#define A64_MRS_TPIDR_EL2(Rt) \ > + aarch64_insn_gen_mrs(Rt, AARCH64_INSN_SYSREG_TPIDR_EL2) > + > #endif /* _BPF_JIT_H */ > diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c > index 76b91f36c729..e9ad9f257a18 100644 > --- a/arch/arm64/net/bpf_jit_comp.c > +++ b/arch/arm64/net/bpf_jit_comp.c > @@ -877,6 +877,17 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, > emit(A64_ORR(1, tmp, dst, tmp), ctx); > emit(A64_MOV(1, dst, tmp), ctx); > break; > + } else if (insn_is_mov_percpu_addr(insn)) { > + if (dst != src) > + emit(A64_MOV(1, dst, src), ctx); > +#ifdef CONFIG_SMP > + if (cpus_have_cap(ARM64_HAS_VIRT_HOST_EXTN)) > + emit(A64_MRS_TPIDR_EL2(tmp), ctx); > + else > + emit(A64_MRS_TPIDR_EL1(tmp), ctx); > + emit(A64_ADD(1, dst, dst, tmp), ctx); > +#endif > + break; > } > switch (insn->off) { > case 0: > @@ -2527,6 +2538,11 @@ bool bpf_jit_supports_arena(void) > return true; > } > > +bool bpf_jit_supports_percpu_insn(void) > +{ > + return true; > +} > + > void bpf_jit_free(struct bpf_prog *prog) > { > if (prog->jited) { > -- > 2.40.1 >