Prefer RIP-relative addressing where possible, which removes the need for boot time relocation fixups. Signed-off-by: Ard Biesheuvel <ardb@xxxxxxxxxx> --- arch/x86/crypto/cast6-avx-x86_64-asm_64.S | 44 +++++++++++--------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S index 82b716fd5dbac65a..180fb9c78de2d315 100644 --- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S @@ -83,16 +83,20 @@ #define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \ - movzbl src ## bh, RID1d; \ - movzbl src ## bl, RID2d; \ - shrq $16, src; \ - movl s1(, RID1, 4), dst ## d; \ - op1 s2(, RID2, 4), dst ## d; \ - movzbl src ## bh, RID1d; \ - movzbl src ## bl, RID2d; \ - interleave_op(il_reg); \ - op2 s3(, RID1, 4), dst ## d; \ - op3 s4(, RID2, 4), dst ## d; + movzbl src ## bh, RID1d; \ + leaq s1(%rip), RID2; \ + movl (RID2, RID1, 4), dst ## d; \ + movzbl src ## bl, RID2d; \ + leaq s2(%rip), RID1; \ + op1 (RID1, RID2, 4), dst ## d; \ + shrq $16, src; \ + movzbl src ## bh, RID1d; \ + leaq s3(%rip), RID2; \ + op2 (RID2, RID1, 4), dst ## d; \ + movzbl src ## bl, RID2d; \ + leaq s4(%rip), RID1; \ + op3 (RID1, RID2, 4), dst ## d; \ + interleave_op(il_reg); #define dummy(d) /* do nothing */ @@ -175,10 +179,10 @@ qop(RD, RC, 1); #define shuffle(mask) \ - vpshufb mask, RKR, RKR; + vpshufb mask(%rip), RKR, RKR; #define preload_rkr(n, do_mask, mask) \ - vbroadcastss .L16_mask, RKR; \ + vbroadcastss .L16_mask(%rip), RKR; \ /* add 16-bit rotation to key rotations (mod 32) */ \ vpxor (kr+n*16)(CTX), RKR, RKR; \ do_mask(mask); @@ -258,9 +262,9 @@ SYM_FUNC_START_LOCAL(__cast6_enc_blk8) movq %rdi, CTX; - vmovdqa .Lbswap_mask, RKM; - vmovd .Lfirst_mask, R1ST; - vmovd .L32_mask, R32; + vmovdqa .Lbswap_mask(%rip), RKM; + vmovd .Lfirst_mask(%rip), R1ST; + vmovd .L32_mask(%rip), R32; inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); @@ -284,7 +288,7 @@ SYM_FUNC_START_LOCAL(__cast6_enc_blk8) popq %rbx; popq %r15; - vmovdqa .Lbswap_mask, RKM; + vmovdqa .Lbswap_mask(%rip), RKM; outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); @@ -306,9 +310,9 @@ SYM_FUNC_START_LOCAL(__cast6_dec_blk8) movq %rdi, CTX; - vmovdqa .Lbswap_mask, RKM; - vmovd .Lfirst_mask, R1ST; - vmovd .L32_mask, R32; + vmovdqa .Lbswap_mask(%rip), RKM; + vmovd .Lfirst_mask(%rip), R1ST; + vmovd .L32_mask(%rip), R32; inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); @@ -332,7 +336,7 @@ SYM_FUNC_START_LOCAL(__cast6_dec_blk8) popq %rbx; popq %r15; - vmovdqa .Lbswap_mask, RKM; + vmovdqa .Lbswap_mask(%rip), RKM; outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); -- 2.39.2