On Thu, Jan 24, 2019 at 07:27:11PM +0100, Ard Biesheuvel wrote:
> The SIMD routine ported from x86 used to have a special code path
> for inputs < 16 bytes, which got lost somewhere along the way.
> Instead, the current glue code aligns the input pointer to permit
> the NEON routine to use special versions of the vld1 instructions
> that assume 16 byte alignment, but this could result in inputs of
> less than 16 bytes being passed in. This not only fails the new
> extended tests that Eric has implemented, it also results in the
> code reading before the input pointer, which could potentially
> result in crashes when dealing with less than 16 bytes of input
> at the start of a page which is preceded by an unmapped page.
>
> So update the glue code to only invoke the NEON routine if the
> input is at least 16 bytes.
>
> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@xxxxxxxxxx>

Can you add proper tags?

Fixes: 1d481f1cd892 ("crypto: arm/crct10dif - port x86 SSE implementation to ARM")
Cc: <stable@xxxxxxxxxxxxxxx> # v4.10+

Just double checking, as I don't have a system immediately available to
run this one on -- I assume it now passes with
CONFIG_CRYPTO_MANAGER_EXTRA_TESTS=y?

Another comment below:

> ---
>  arch/arm/crypto/crct10dif-ce-core.S | 20 ++++++++----------
>  arch/arm/crypto/crct10dif-ce-glue.c | 23 +++++---------------
>  2 files changed, 16 insertions(+), 27 deletions(-)
>
> diff --git a/arch/arm/crypto/crct10dif-ce-core.S b/arch/arm/crypto/crct10dif-ce-core.S
> index ce45ba0c0687..3fd13d7c842c 100644
> --- a/arch/arm/crypto/crct10dif-ce-core.S
> +++ b/arch/arm/crypto/crct10dif-ce-core.S
> @@ -124,10 +124,10 @@ ENTRY(crc_t10dif_pmull)
> 	vext.8		q10, qzr, q0, #4
>
> 	// receive the initial 64B data, xor the initial crc value
> -	vld1.64		{q0-q1}, [arg2, :128]!
> -	vld1.64		{q2-q3}, [arg2, :128]!
> -	vld1.64		{q4-q5}, [arg2, :128]!
> -	vld1.64		{q6-q7}, [arg2, :128]!
> +	vld1.64		{q0-q1}, [arg2]!
> +	vld1.64		{q2-q3}, [arg2]!
> +	vld1.64		{q4-q5}, [arg2]!
> +	vld1.64		{q6-q7}, [arg2]!
> CPU_LE(	vrev64.8	q0, q0	)
> CPU_LE(	vrev64.8	q1, q1	)
> CPU_LE(	vrev64.8	q2, q2	)
> @@ -150,7 +150,7 @@ CPU_LE(	vrev64.8	q7, q7	)
> 	veor.8		q0, q0, q10
>
> 	adr		ip, rk3
> -	vld1.64		{q10}, [ip, :128]	// xmm10 has rk3 and rk4
> +	vld1.64		{q10}, [ip]		// xmm10 has rk3 and rk4

This one is loading static data that is 16 byte aligned, so the :128
can be kept here (see the sketch at the end of this mail).  Same in
the two other places below that load from [ip].

>
> 	//
> 	// we subtract 256 instead of 128 to save one instruction from the loop
> @@ -167,7 +167,7 @@ CPU_LE(	vrev64.8	q7, q7	)
> _fold_64_B_loop:
>
> 	.macro		fold64, reg1, reg2
> -	vld1.64		{q11-q12}, [arg2, :128]!
> +	vld1.64		{q11-q12}, [arg2]!
>
> 	vmull.p64	q8, \reg1\()h, d21
> 	vmull.p64	\reg1, \reg1\()l, d20
> @@ -203,13 +203,13 @@ CPU_LE(	vrev64.8	q12, q12	)
> 	// constants
>
> 	adr		ip, rk9
> -	vld1.64		{q10}, [ip, :128]!
> +	vld1.64		{q10}, [ip]!
>
> 	.macro		fold16, reg, rk
> 	vmull.p64	q8, \reg\()l, d20
> 	vmull.p64	\reg, \reg\()h, d21
> 	.ifnb		\rk
> -	vld1.64		{q10}, [ip, :128]!
> +	vld1.64		{q10}, [ip]!
> 	.endif
> 	veor.8		q7, q7, q8
> 	veor.8		q7, q7, \reg
> @@ -238,7 +238,7 @@ _16B_reduction_loop:
> 	vmull.p64	q7, d15, d21
> 	veor.8		q7, q7, q8
>
> -	vld1.64		{q0}, [arg2, :128]!
> +	vld1.64		{q0}, [arg2]!
> CPU_LE(	vrev64.8	q0, q0	)
> 	vswp		d0, d1
> 	veor.8		q7, q7, q0
> @@ -335,7 +335,7 @@ _less_than_128:
> 	vmov.i8		q0, #0
> 	vmov		s3, arg1_low32		// get the initial crc value
>
> -	vld1.64		{q7}, [arg2, :128]!
> CPU_LE(	vrev64.8	q7, q7	)
> 	vswp		d14, d15
> 	veor.8		q7, q7, q0
> diff --git a/arch/arm/crypto/crct10dif-ce-glue.c b/arch/arm/crypto/crct10dif-ce-glue.c
> index d428355cf38d..14c19c70a841 100644
> --- a/arch/arm/crypto/crct10dif-ce-glue.c
> +++ b/arch/arm/crypto/crct10dif-ce-glue.c
> @@ -35,26 +35,15 @@ static int crct10dif_update(struct shash_desc *desc, const u8 *data,
> 			    unsigned int length)
> {
> 	u16 *crc = shash_desc_ctx(desc);
> -	unsigned int l;
>
> -	if (!may_use_simd()) {
> -		*crc = crc_t10dif_generic(*crc, data, length);
> +	if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && may_use_simd()) {
> +		kernel_neon_begin();
> +		*crc = crc_t10dif_pmull(*crc, data, length);
> +		kernel_neon_end();
> 	} else {
> -		if (unlikely((u32)data % CRC_T10DIF_PMULL_CHUNK_SIZE)) {
> -			l = min_t(u32, length, CRC_T10DIF_PMULL_CHUNK_SIZE -
> -				  ((u32)data % CRC_T10DIF_PMULL_CHUNK_SIZE));
> -
> -			*crc = crc_t10dif_generic(*crc, data, l);
> -
> -			length -= l;
> -			data += l;
> -		}
> -		if (length > 0) {
> -			kernel_neon_begin();
> -			*crc = crc_t10dif_pmull(*crc, data, length);
> -			kernel_neon_end();
> -		}
> +		*crc = crc_t10dif_generic(*crc, data, length);
> 	}
> +
> 	return 0;
> }
>
> --
> 2.17.1
>
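
To make the :128 suggestion above concrete -- this is just a sketch,
and it assumes the rk constant tables in crct10dif-ce-core.S keep
their current 16-byte alignment: drop the alignment hint only from the
[arg2] loads (since the input buffer is no longer guaranteed to be
aligned), and leave the constant loads as they were, e.g.

	adr		ip, rk3
	vld1.64		{q10}, [ip, :128]	// static data, 16-byte aligned, hint is safe

and likewise for the two post-incrementing [ip, :128]! loads around
the fold16 macro.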