[PATCH bpf-next 1/2] bpf, arm: Optimize ALU64 ARSH X using orrpl conditional instruction

Luke Nelson <lukenels@xxxxxxxxxxxxxxxxx> · Thu, 30 Apr 2020 19:02:09 -0700

This patch optimizes the code generated by emit_a32_arsh_r64, which
handles the BPF_ALU64 BPF_ARSH BPF_X instruction.

The original code uses a conditional B followed by an unconditional ORR.
The optimization saves one instruction by removing the B instruction
and using a conditional ORR (with an inverted condition).

Example of the code generated for BPF_ALU64_REG(BPF_ARSH, BPF_REG_0,
BPF_REG_1), before optimization:

  34:  rsb    ip, r2, #32
  38:  subs   r9, r2, #32
  3c:  lsr    lr, r0, r2
  40:  orr    lr, lr, r1, lsl ip
  44:  bmi    0x4c
  48:  orr    lr, lr, r1, asr r9
  4c:  asr    ip, r1, r2
  50:  mov    r0, lr
  54:  mov    r1, ip

and after optimization:

  34:  rsb    ip, r2, #32
  38:  subs   r9, r2, #32
  3c:  lsr    lr, r0, r2
  40:  orr    lr, lr, r1, lsl ip
  44:  orrpl  lr, lr, r1, asr r9
  48:  asr    ip, r1, r2
  4c:  mov    r0, lr
  50:  mov    r1, ip

Tested on QEMU using lib/test_bpf and test_verifier.

Co-developed-by: Xi Wang <xi.wang@xxxxxxxxx>
Signed-off-by: Xi Wang <xi.wang@xxxxxxxxx>
Signed-off-by: Luke Nelson <luke.r.nels@xxxxxxxxx>
---
 arch/arm/net/bpf_jit_32.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
index bf85d6db4931..48b89211ee5c 100644
--- a/arch/arm/net/bpf_jit_32.c
+++ b/arch/arm/net/bpf_jit_32.c
@@ -860,8 +860,8 @@ static inline void emit_a32_arsh_r64(const s8 dst[], const s8 src[],
 	emit(ARM_SUBS_I(tmp2[0], rt, 32), ctx);
 	emit(ARM_MOV_SR(ARM_LR, rd[1], SRTYPE_LSR, rt), ctx);
 	emit(ARM_ORR_SR(ARM_LR, ARM_LR, rd[0], SRTYPE_ASL, ARM_IP), ctx);
-	_emit(ARM_COND_MI, ARM_B(0), ctx);
-	emit(ARM_ORR_SR(ARM_LR, ARM_LR, rd[0], SRTYPE_ASR, tmp2[0]), ctx);
+	_emit(ARM_COND_PL,
+	      ARM_ORR_SR(ARM_LR, ARM_LR, rd[0], SRTYPE_ASR, tmp2[0]), ctx);
 	emit(ARM_MOV_SR(ARM_IP, rd[0], SRTYPE_ASR, rt), ctx);
 
 	arm_bpf_put_reg32(dst_lo, ARM_LR, ctx);
-- 
2.17.1