Re: [PATCH 1/2] crypto: arm/crct10dif - revert to C code for short inputs

Ard Biesheuvel <ard.biesheuvel@xxxxxxxxxx> · Fri, 25 Jan 2019 08:48:31 +0100

On Fri, 25 Jan 2019 at 08:22, Eric Biggers <ebiggers@xxxxxxxxxx> wrote:
>
> On Thu, Jan 24, 2019 at 07:27:11PM +0100, Ard Biesheuvel wrote:
> > The SIMD routine ported from x86 used to have a special code path
> > for inputs < 16 bytes, which got lost somewhere along the way.
> > Instead, the current glue code aligns the input pointer to permit
> > the NEON routine to use special versions of the vld1 instructions
> > that assume 16 byte alignment, but this could result in inputs of
> > less than 16 bytes to be passed in. This not only fails the new
> > extended tests that Eric has implemented, it also results in the
> > code reading before the input pointer, which could potentially
> > result in crashes when dealing with less than 16 bytes of input
> > at the start of a page which is preceded by an unmapped page.
> >
> > So update the glue code to only invoke the NEON routine if the
> > input is more than 16 bytes.
> >
> > Signed-off-by: Ard Biesheuvel <ard.biesheuvel@xxxxxxxxxx>
>
> Can you add proper tags?
>
> Fixes: 1d481f1cd892 ("crypto: arm/crct10dif - port x86 SSE implementation to ARM")
> Cc: <stable@xxxxxxxxxxxxxxx> # v4.10+
>

Ah yes, thanks for digging that up.

> Just double checking as I don't have a system immediately available to run this
> one on -- I assume it passes with CONFIG_CRYPTO_MANAGER_EXTRA_TESTS=y now?
>

Yes.

> Another comment below:
>
> > ---
> >  arch/arm/crypto/crct10dif-ce-core.S | 20 ++++++++---------
> >  arch/arm/crypto/crct10dif-ce-glue.c | 23 +++++---------------
> >  2 files changed, 16 insertions(+), 27 deletions(-)
> >
> > diff --git a/arch/arm/crypto/crct10dif-ce-core.S b/arch/arm/crypto/crct10dif-ce-core.S
> > index ce45ba0c0687..3fd13d7c842c 100644
> > --- a/arch/arm/crypto/crct10dif-ce-core.S
> > +++ b/arch/arm/crypto/crct10dif-ce-core.S
> > @@ -124,10 +124,10 @@ ENTRY(crc_t10dif_pmull)
> >       vext.8          q10, qzr, q0, #4
> >
> >       // receive the initial 64B data, xor the initial crc value
> > -     vld1.64         {q0-q1}, [arg2, :128]!
> > -     vld1.64         {q2-q3}, [arg2, :128]!
> > -     vld1.64         {q4-q5}, [arg2, :128]!
> > -     vld1.64         {q6-q7}, [arg2, :128]!
> > +     vld1.64         {q0-q1}, [arg2]!
> > +     vld1.64         {q2-q3}, [arg2]!
> > +     vld1.64         {q4-q5}, [arg2]!
> > +     vld1.64         {q6-q7}, [arg2]!
> >  CPU_LE(      vrev64.8        q0, q0                  )
> >  CPU_LE(      vrev64.8        q1, q1                  )
> >  CPU_LE(      vrev64.8        q2, q2                  )
> > @@ -150,7 +150,7 @@ CPU_LE(   vrev64.8        q7, q7                  )
> >       veor.8          q0, q0, q10
> >
> >       adr             ip, rk3
> > -     vld1.64         {q10}, [ip, :128]       // xmm10 has rk3 and rk4
> > +     vld1.64         {q10}, [ip]     // xmm10 has rk3 and rk4
>
> This one is loading static data that is 16 byte aligned, so the :128 can be kept
> here.  Same in the two other places below that load from [ip].
>

OK, will change that back.

> >
> >       //
> >       // we subtract 256 instead of 128 to save one instruction from the loop
> > @@ -167,7 +167,7 @@ CPU_LE(   vrev64.8        q7, q7                  )
> >  _fold_64_B_loop:
> >
> >       .macro          fold64, reg1, reg2
> > -     vld1.64         {q11-q12}, [arg2, :128]!
> > +     vld1.64         {q11-q12}, [arg2]!
> >
> >       vmull.p64       q8, \reg1\()h, d21
> >       vmull.p64       \reg1, \reg1\()l, d20
> > @@ -203,13 +203,13 @@ CPU_LE( vrev64.8        q12, q12                )
> >       // constants
> >
> >       adr             ip, rk9
> > -     vld1.64         {q10}, [ip, :128]!
> > +     vld1.64         {q10}, [ip]!
> >
> >       .macro          fold16, reg, rk
> >       vmull.p64       q8, \reg\()l, d20
> >       vmull.p64       \reg, \reg\()h, d21
> >       .ifnb           \rk
> > -     vld1.64         {q10}, [ip, :128]!
> > +     vld1.64         {q10}, [ip]!
> >       .endif
> >       veor.8          q7, q7, q8
> >       veor.8          q7, q7, \reg
> > @@ -238,7 +238,7 @@ _16B_reduction_loop:
> >       vmull.p64       q7, d15, d21
> >       veor.8          q7, q7, q8
> >
> > -     vld1.64         {q0}, [arg2, :128]!
> > +     vld1.64         {q0}, [arg2]!
> >  CPU_LE(      vrev64.8        q0, q0          )
> >       vswp            d0, d1
> >       veor.8          q7, q7, q0
> > @@ -335,7 +335,7 @@ _less_than_128:
> >       vmov.i8         q0, #0
> >       vmov            s3, arg1_low32          // get the initial crc value
> >
> > -     vld1.64         {q7}, [arg2, :128]!
> > +     vld1.64         {q7}, [arg2]!
> >  CPU_LE(      vrev64.8        q7, q7          )
> >       vswp            d14, d15
> >       veor.8          q7, q7, q0
> > diff --git a/arch/arm/crypto/crct10dif-ce-glue.c b/arch/arm/crypto/crct10dif-ce-glue.c
> > index d428355cf38d..14c19c70a841 100644
> > --- a/arch/arm/crypto/crct10dif-ce-glue.c
> > +++ b/arch/arm/crypto/crct10dif-ce-glue.c
> > @@ -35,26 +35,15 @@ static int crct10dif_update(struct shash_desc *desc, const u8 *data,
> >                           unsigned int length)
> >  {
> >       u16 *crc = shash_desc_ctx(desc);
> > -     unsigned int l;
> >
> > -     if (!may_use_simd()) {
> > -             *crc = crc_t10dif_generic(*crc, data, length);
> > +     if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && may_use_simd()) {
> > +             kernel_neon_begin();
> > +             *crc = crc_t10dif_pmull(*crc, data, length);
> > +             kernel_neon_end();
> >       } else {
> > -             if (unlikely((u32)data % CRC_T10DIF_PMULL_CHUNK_SIZE)) {
> > -                     l = min_t(u32, length, CRC_T10DIF_PMULL_CHUNK_SIZE -
> > -                               ((u32)data % CRC_T10DIF_PMULL_CHUNK_SIZE));
> > -
> > -                     *crc = crc_t10dif_generic(*crc, data, l);
> > -
> > -                     length -= l;
> > -                     data += l;
> > -             }
> > -             if (length > 0) {
> > -                     kernel_neon_begin();
> > -                     *crc = crc_t10dif_pmull(*crc, data, length);
> > -                     kernel_neon_end();
> > -             }
> > +             *crc = crc_t10dif_generic(*crc, data, length);
> >       }
> > +
> >       return 0;
> >  }
> >
> > --
> > 2.17.1
> >