Using RBP as a temporary register breaks frame pointer convention and
breaks stack traces when unwinding from an interrupt in the crypto code.

Use R12 instead of RBP for the TBL register.  Since R12 is also used as
another temporary register (T1), it gets clobbered in each round of
computation, so the TBL value needs to be freshly reloaded into R12 each
time it's used.  Because the live copy in R12 can change, store the
permanent value of TBL on the stack at the frame_TBL offset.

Also remove the unused y4 variable.

Reported-by: Eric Biggers <ebiggers3@xxxxxxxxx>
Reported-by: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Signed-off-by: Josh Poimboeuf <jpoimboe@xxxxxxxxxx>
---
 arch/x86/crypto/sha512-avx2-asm.S | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S
index 7f5f6c6ec72e..37cfc2004abd 100644
--- a/arch/x86/crypto/sha512-avx2-asm.S
+++ b/arch/x86/crypto/sha512-avx2-asm.S
@@ -81,7 +81,7 @@ d = %r8
 e = %rdx
 y3 = %rsi
 
-TBL = %rbp
+TBL = %r12 # clobbered by T1
 
 a = %rax
 b = %rbx
@@ -96,11 +96,10 @@ y0 = %r13
 y1 = %r14
 y2 = %r15
 
-y4 = %r12
-
 # Local variables (stack frame)
 XFER_SIZE = 4*8
 SRND_SIZE = 1*8
+TBL_SIZE = 1*8
 INP_SIZE = 1*8
 INPEND_SIZE = 1*8
 RSPSAVE_SIZE = 1*8
@@ -108,7 +107,8 @@ GPRSAVE_SIZE = 6*8
 
 frame_XFER = 0
 frame_SRND = frame_XFER + XFER_SIZE
-frame_INP = frame_SRND + SRND_SIZE
+frame_TBL = frame_SRND + SRND_SIZE
+frame_INP = frame_TBL + TBL_SIZE
 frame_INPEND = frame_INP + INP_SIZE
 frame_RSPSAVE = frame_INPEND + INPEND_SIZE
 frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
@@ -601,7 +601,7 @@ ENTRY(sha512_transform_rorx)
	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
 
 loop0:
-	lea	K512(%rip), TBL
+	movq	$K512, frame_TBL(%rsp)
 
	## byte swap first 16 dwords
	COPY_YMM_AND_BSWAP	Y_0, (INP), BYTE_FLIP_MASK
@@ -616,39 +616,46 @@
 
 .align 16
 loop1:
+	mov	frame_TBL(%rsp), TBL
	vpaddq	(TBL), Y_0, XFER
	vmovdqa	XFER, frame_XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED
 
+	mov	frame_TBL(%rsp), TBL
	vpaddq	1*32(TBL), Y_0, XFER
	vmovdqa	XFER, frame_XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED
 
+	mov	frame_TBL(%rsp), TBL
	vpaddq	2*32(TBL), Y_0, XFER
	vmovdqa	XFER, frame_XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED
 
+	mov	frame_TBL(%rsp), TBL
	vpaddq	3*32(TBL), Y_0, XFER
	vmovdqa	XFER, frame_XFER(%rsp)
-	add	$(4*32), TBL
	FOUR_ROUNDS_AND_SCHED
 
+	addq	$(4*32), frame_TBL(%rsp)
	subq	$1, frame_SRND(%rsp)
	jne	loop1
 
	movq	$2, frame_SRND(%rsp)
 loop2:
+	mov	frame_TBL(%rsp), TBL
	vpaddq	(TBL), Y_0, XFER
	vmovdqa	XFER, frame_XFER(%rsp)
	DO_4ROUNDS
+
+	mov	frame_TBL(%rsp), TBL
	vpaddq	1*32(TBL), Y_1, XFER
	vmovdqa	XFER, frame_XFER(%rsp)
-	add	$(2*32), TBL
	DO_4ROUNDS
 
	vmovdqa	Y_2, Y_0
	vmovdqa	Y_3, Y_1
 
+	addq	$(2*32), frame_TBL(%rsp)
	subq	$1, frame_SRND(%rsp)
	jne	loop2
 
-- 
2.13.5
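
P.S. For anyone reviewing, a minimal sketch of the resulting table-access
pattern, using the identifiers already defined in the file (this only
restates what the hunks above do, it is not an additional change):

	# The canonical table pointer now lives in a stack slot:
	movq	$K512, frame_TBL(%rsp)

	# R12 (TBL) doubles as T1 and is clobbered inside
	# FOUR_ROUNDS_AND_SCHED / DO_4ROUNDS, so reload it from the
	# stack slot immediately before each use:
	mov	frame_TBL(%rsp), TBL
	vpaddq	(TBL), Y_0, XFER

	# Advancing the pointer likewise operates on the stack slot,
	# never on the (volatile) register copy:
	addq	$(4*32), frame_TBL(%rsp)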