On Fri, 25 Jan 2019 at 08:22, Eric Biggers <ebiggers@xxxxxxxxxx> wrote: > > On Thu, Jan 24, 2019 at 07:27:11PM +0100, Ard Biesheuvel wrote: > > The SIMD routine ported from x86 used to have a special code path > > for inputs < 16 bytes, which got lost somewhere along the way. > > Instead, the current glue code aligns the input pointer to permit > > the NEON routine to use special versions of the vld1 instructions > > that assume 16 byte alignment, but this could result in inputs of > > less than 16 bytes to be passed in. This not only fails the new > > extended tests that Eric has implemented, it also results in the > > code reading before the input pointer, which could potentially > > result in crashes when dealing with less than 16 bytes of input > > at the start of a page which is preceded by an unmapped page. > > > > So update the glue code to only invoke the NEON routine if the > > input is more than 16 bytes. > > > > Signed-off-by: Ard Biesheuvel <ard.biesheuvel@xxxxxxxxxx> > > Can you add proper tags? > > Fixes: 1d481f1cd892 ("crypto: arm/crct10dif - port x86 SSE implementation to ARM") > Cc: <stable@xxxxxxxxxxxxxxx> # v4.10+ > Ah yes, thanks for digging that up. > Just double checking as I don't have a system immediately available to run this > one on -- I assume it passes with CONFIG_CRYPTO_MANAGER_EXTRA_TESTS=y now? > Yes. > Another comment below: > > > --- > > arch/arm/crypto/crct10dif-ce-core.S | 20 ++++++++--------- > > arch/arm/crypto/crct10dif-ce-glue.c | 23 +++++--------------- > > 2 files changed, 16 insertions(+), 27 deletions(-) > > > > diff --git a/arch/arm/crypto/crct10dif-ce-core.S b/arch/arm/crypto/crct10dif-ce-core.S > > index ce45ba0c0687..3fd13d7c842c 100644 > > --- a/arch/arm/crypto/crct10dif-ce-core.S > > +++ b/arch/arm/crypto/crct10dif-ce-core.S > > @@ -124,10 +124,10 @@ ENTRY(crc_t10dif_pmull) > > vext.8 q10, qzr, q0, #4 > > > > // receive the initial 64B data, xor the initial crc value > > - vld1.64 {q0-q1}, [arg2, :128]! > > - vld1.64 {q2-q3}, [arg2, :128]! > > - vld1.64 {q4-q5}, [arg2, :128]! > > - vld1.64 {q6-q7}, [arg2, :128]! > > + vld1.64 {q0-q1}, [arg2]! > > + vld1.64 {q2-q3}, [arg2]! > > + vld1.64 {q4-q5}, [arg2]! > > + vld1.64 {q6-q7}, [arg2]! > > CPU_LE( vrev64.8 q0, q0 ) > > CPU_LE( vrev64.8 q1, q1 ) > > CPU_LE( vrev64.8 q2, q2 ) > > @@ -150,7 +150,7 @@ CPU_LE( vrev64.8 q7, q7 ) > > veor.8 q0, q0, q10 > > > > adr ip, rk3 > > - vld1.64 {q10}, [ip, :128] // xmm10 has rk3 and rk4 > > + vld1.64 {q10}, [ip] // xmm10 has rk3 and rk4 > > This one is loading static data that is 16 byte aligned, so the :128 can be kept > here. Same in the two other places below that load from [ip]. > OK, will change that back. > > > > // > > // we subtract 256 instead of 128 to save one instruction from the loop > > @@ -167,7 +167,7 @@ CPU_LE( vrev64.8 q7, q7 ) > > _fold_64_B_loop: > > > > .macro fold64, reg1, reg2 > > - vld1.64 {q11-q12}, [arg2, :128]! > > + vld1.64 {q11-q12}, [arg2]! > > > > vmull.p64 q8, \reg1\()h, d21 > > vmull.p64 \reg1, \reg1\()l, d20 > > @@ -203,13 +203,13 @@ CPU_LE( vrev64.8 q12, q12 ) > > // constants > > > > adr ip, rk9 > > - vld1.64 {q10}, [ip, :128]! > > + vld1.64 {q10}, [ip]! > > > > .macro fold16, reg, rk > > vmull.p64 q8, \reg\()l, d20 > > vmull.p64 \reg, \reg\()h, d21 > > .ifnb \rk > > - vld1.64 {q10}, [ip, :128]! > > + vld1.64 {q10}, [ip]! > > .endif > > veor.8 q7, q7, q8 > > veor.8 q7, q7, \reg > > @@ -238,7 +238,7 @@ _16B_reduction_loop: > > vmull.p64 q7, d15, d21 > > veor.8 q7, q7, q8 > > > > - vld1.64 {q0}, [arg2, :128]! > > + vld1.64 {q0}, [arg2]! > > CPU_LE( vrev64.8 q0, q0 ) > > vswp d0, d1 > > veor.8 q7, q7, q0 > > @@ -335,7 +335,7 @@ _less_than_128: > > vmov.i8 q0, #0 > > vmov s3, arg1_low32 // get the initial crc value > > > > - vld1.64 {q7}, [arg2, :128]! > > + vld1.64 {q7}, [arg2]! > > CPU_LE( vrev64.8 q7, q7 ) > > vswp d14, d15 > > veor.8 q7, q7, q0 > > diff --git a/arch/arm/crypto/crct10dif-ce-glue.c b/arch/arm/crypto/crct10dif-ce-glue.c > > index d428355cf38d..14c19c70a841 100644 > > --- a/arch/arm/crypto/crct10dif-ce-glue.c > > +++ b/arch/arm/crypto/crct10dif-ce-glue.c > > @@ -35,26 +35,15 @@ static int crct10dif_update(struct shash_desc *desc, const u8 *data, > > unsigned int length) > > { > > u16 *crc = shash_desc_ctx(desc); > > - unsigned int l; > > > > - if (!may_use_simd()) { > > - *crc = crc_t10dif_generic(*crc, data, length); > > + if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && may_use_simd()) { > > + kernel_neon_begin(); > > + *crc = crc_t10dif_pmull(*crc, data, length); > > + kernel_neon_end(); > > } else { > > - if (unlikely((u32)data % CRC_T10DIF_PMULL_CHUNK_SIZE)) { > > - l = min_t(u32, length, CRC_T10DIF_PMULL_CHUNK_SIZE - > > - ((u32)data % CRC_T10DIF_PMULL_CHUNK_SIZE)); > > - > > - *crc = crc_t10dif_generic(*crc, data, l); > > - > > - length -= l; > > - data += l; > > - } > > - if (length > 0) { > > - kernel_neon_begin(); > > - *crc = crc_t10dif_pmull(*crc, data, length); > > - kernel_neon_end(); > > - } > > + *crc = crc_t10dif_generic(*crc, data, length); > > } > > + > > return 0; > > } > > > > -- > > 2.17.1 > >