On Wed, 2017-08-02 at 10:13 -0700, Megha Dey wrote: > It was reported that the sha1 AVX2 function(sha1_transform_avx2) is > reading ahead beyond its intended data, and causing a crash if the next > block is beyond page boundary: > http://marc.info/?l=linux-crypto-vger&m=149373371023377 > > This patch makes sure that there is no overflow for any buffer length. > > It passes the tests written by Jan Stancek that revealed this problem: > https://github.com/jstancek/sha1-avx2-crash Hi Jan, Is it ok to add your Tested-by? > > I have re-enabled sha1-avx2 by reverting commit > b82ce24426a4071da9529d726057e4e642948667 > > Originally-by: Ilya Albrekht <ilya.albrekht@xxxxxxxxx> > Signed-off-by: Megha Dey <megha.dey@xxxxxxxxxxxxxxx> > Reported-by: Jan Stancek <jstancek@xxxxxxxxxx> > --- > arch/x86/crypto/sha1_avx2_x86_64_asm.S | 67 ++++++++++++++++++---------------- > arch/x86/crypto/sha1_ssse3_glue.c | 2 +- > 2 files changed, 37 insertions(+), 32 deletions(-) > > diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S > index 1cd792d..1eab79c 100644 > --- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S > +++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S > @@ -117,11 +117,10 @@ > .set T1, REG_T1 > .endm > > -#define K_BASE %r8 > #define HASH_PTR %r9 > +#define BLOCKS_CTR %r8 > #define BUFFER_PTR %r10 > #define BUFFER_PTR2 %r13 > -#define BUFFER_END %r11 > > #define PRECALC_BUF %r14 > #define WK_BUF %r15 > @@ -205,14 +204,14 @@ > * blended AVX2 and ALU instruction scheduling > * 1 vector iteration per 8 rounds > */ > - vmovdqu ((i * 2) + PRECALC_OFFSET)(BUFFER_PTR), W_TMP > + vmovdqu (i * 2)(BUFFER_PTR), W_TMP > .elseif ((i & 7) == 1) > - vinsertf128 $1, (((i-1) * 2)+PRECALC_OFFSET)(BUFFER_PTR2),\ > + vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\ > WY_TMP, WY_TMP > .elseif ((i & 7) == 2) > vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY > .elseif ((i & 7) == 4) > - vpaddd K_XMM(K_BASE), WY, WY_TMP > + vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP > .elseif ((i & 7) == 7) > vmovdqu WY_TMP, PRECALC_WK(i&~7) > > @@ -255,7 +254,7 @@ > vpxor WY, WY_TMP, WY_TMP > .elseif ((i & 7) == 7) > vpxor WY_TMP2, WY_TMP, WY > - vpaddd K_XMM(K_BASE), WY, WY_TMP > + vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP > vmovdqu WY_TMP, PRECALC_WK(i&~7) > > PRECALC_ROTATE_WY > @@ -291,7 +290,7 @@ > vpsrld $30, WY, WY > vpor WY, WY_TMP, WY > .elseif ((i & 7) == 7) > - vpaddd K_XMM(K_BASE), WY, WY_TMP > + vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP > vmovdqu WY_TMP, PRECALC_WK(i&~7) > > PRECALC_ROTATE_WY > @@ -446,6 +445,16 @@ > > .endm > > +/* Add constant only if (%2 > %3) condition met (uses RTA as temp) > + * %1 + %2 >= %3 ? %4 : 0 > + */ > +.macro ADD_IF_GE a, b, c, d > + mov \a, RTA > + add $\d, RTA > + cmp $\c, \b > + cmovge RTA, \a > +.endm > + > /* > * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining > */ > @@ -463,13 +472,16 @@ > lea (2*4*80+32)(%rsp), WK_BUF > > # Precalc WK for first 2 blocks > - PRECALC_OFFSET = 0 > + ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64 > .set i, 0 > .rept 160 > PRECALC i > .set i, i + 1 > .endr > - PRECALC_OFFSET = 128 > + > + /* Go to next block if needed */ > + ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128 > + ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 > xchg WK_BUF, PRECALC_BUF > > .align 32 > @@ -479,8 +491,8 @@ _loop: > * we use K_BASE value as a signal of a last block, > * it is set below by: cmovae BUFFER_PTR, K_BASE > */ > - cmp K_BASE, BUFFER_PTR > - jne _begin > + test BLOCKS_CTR, BLOCKS_CTR > + jnz _begin > .align 32 > jmp _end > .align 32 > @@ -512,10 +524,10 @@ _loop0: > .set j, j+2 > .endr > > - add $(2*64), BUFFER_PTR /* move to next odd-64-byte block */ > - cmp BUFFER_END, BUFFER_PTR /* is current block the last one? */ > - cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */ > - > + /* Update Counter */ > + sub $1, BLOCKS_CTR > + /* Move to the next block only if needed*/ > + ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128 > /* > * rounds > * 60,62,64,66,68 > @@ -532,8 +544,8 @@ _loop0: > UPDATE_HASH 12(HASH_PTR), D > UPDATE_HASH 16(HASH_PTR), E > > - cmp K_BASE, BUFFER_PTR /* is current block the last one? */ > - je _loop > + test BLOCKS_CTR, BLOCKS_CTR > + jz _loop > > mov TB, B > > @@ -575,10 +587,10 @@ _loop2: > .set j, j+2 > .endr > > - add $(2*64), BUFFER_PTR2 /* move to next even-64-byte block */ > - > - cmp BUFFER_END, BUFFER_PTR2 /* is current block the last one */ > - cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */ > + /* update counter */ > + sub $1, BLOCKS_CTR > + /* Move to the next block only if needed*/ > + ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 > > jmp _loop3 > _loop3: > @@ -641,19 +653,12 @@ _loop3: > > avx2_zeroupper > > - lea K_XMM_AR(%rip), K_BASE > - > + /* Setup initial values */ > mov CTX, HASH_PTR > mov BUF, BUFFER_PTR > - lea 64(BUF), BUFFER_PTR2 > - > - shl $6, CNT /* mul by 64 */ > - add BUF, CNT > - add $64, CNT > - mov CNT, BUFFER_END > > - cmp BUFFER_END, BUFFER_PTR2 > - cmovae K_BASE, BUFFER_PTR2 > + mov BUF, BUFFER_PTR2 > + mov CNT, BLOCKS_CTR > > xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP > > diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c > index f960a04..fc61739 100644 > --- a/arch/x86/crypto/sha1_ssse3_glue.c > +++ b/arch/x86/crypto/sha1_ssse3_glue.c > @@ -201,7 +201,7 @@ asmlinkage void sha1_transform_avx2(u32 *digest, const char *data, > > static bool avx2_usable(void) > { > - if (false && avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) > + if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) > && boot_cpu_has(X86_FEATURE_BMI1) > && boot_cpu_has(X86_FEATURE_BMI2)) > return true;