Hi Eric,

On 4 October 2018 at 06:07, Eric Biggers <ebiggers@xxxxxxxxxx> wrote:
> From: Eric Biggers <ebiggers@xxxxxxxxxx>
>
> The generic constant-time AES implementation is supposed to preload the
> AES S-box into the CPU's L1 data cache. But, an interrupt handler can
> run on the CPU and muck with the cache. Worse, on preemptible kernels
> the process can even be preempted and moved to a different CPU. So the
> implementation may actually still be vulnerable to cache-timing attacks.
>
> Make it more robust by disabling interrupts while the sbox is used.
>
> In some quick tests on x86 and ARM, this doesn't affect performance
> significantly. Responsiveness is also a concern, but interrupts are
> only disabled for a single AES block which even on ARM Cortex-A7 is
> "only" ~1500 cycles to encrypt or ~2600 cycles to decrypt.
>

I share your concern, but that is quite a big hammer.

Also, does it really take ~100 cycles per byte? That is terrible :-)

Given that the full lookup table is only 1024 bytes (or 1024+256 bytes
for decryption), I wonder if something like the below would be a better
option for A7 (apologies for the mangled whitespace)

diff --git a/arch/arm/crypto/aes-cipher-core.S b/arch/arm/crypto/aes-cipher-core.S
index 184d6c2d15d5..83e893f7e581 100644
--- a/arch/arm/crypto/aes-cipher-core.S
+++ b/arch/arm/crypto/aes-cipher-core.S
@@ -139,6 +139,13 @@

 	__adrl		ttab, \ttab

+	.irpc		r, 01234567
+	ldr		r8, [ttab, #(32 * \r)]
+	ldr		r9, [ttab, #(32 * \r) + 256]
+	ldr		r10, [ttab, #(32 * \r) + 512]
+	ldr		r11, [ttab, #(32 * \r) + 768]
+	.endr
+
 	tst		rounds, #2
 	bne		1f

@@ -180,6 +187,12 @@ ENDPROC(__aes_arm_encrypt)

 	.align		5
 ENTRY(__aes_arm_decrypt)
+	__adrl		ttab, __aes_arm_inverse_sbox
+
+	.irpc		r, 01234567
+	ldr		r8, [ttab, #(32 * \r)]
+	.endr
+
 	do_crypt	iround, crypto_it_tab, __aes_arm_inverse_sbox, 0
 ENDPROC(__aes_arm_decrypt)

diff --git a/arch/arm/crypto/aes-cipher-glue.c b/arch/arm/crypto/aes-cipher-glue.c
index c222f6e072ad..630e1a436f1d 100644
--- a/arch/arm/crypto/aes-cipher-glue.c
+++ b/arch/arm/crypto/aes-cipher-glue.c
@@ -23,16 +23,22 @@ static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 {
 	struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
 	int rounds = 6 + ctx->key_length / 4;
+	unsigned long flags;

+	local_irq_save(flags);
 	__aes_arm_encrypt(ctx->key_enc, rounds, in, out);
+	local_irq_restore(flags);
 }

 static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 {
 	struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
 	int rounds = 6 + ctx->key_length / 4;
+	unsigned long flags;

+	local_irq_save(flags);
 	__aes_arm_decrypt(ctx->key_dec, rounds, in, out);
+	local_irq_restore(flags);
 }

 static struct crypto_alg aes_alg = {
diff --git a/crypto/aes_generic.c b/crypto/aes_generic.c
index ca554d57d01e..82fa860c9cb9 100644
--- a/crypto/aes_generic.c
+++ b/crypto/aes_generic.c
@@ -63,7 +63,7 @@ static inline u8 byte(const u32 x, const unsigned n)

 static const u32 rco_tab[10] = { 1, 2, 4, 8, 16, 32, 64, 128, 27, 54 };

-__visible const u32 crypto_ft_tab[4][256] = {
+__visible const u32 crypto_ft_tab[4][256] __cacheline_aligned = {
 	{
 		0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6,
 		0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591,
@@ -327,7 +327,7 @@ __visible const u32 crypto_ft_tab[4][256] = {
 	}
 };

-__visible const u32 crypto_fl_tab[4][256] = {
+__visible const u32 crypto_fl_tab[4][256] __cacheline_aligned = {
 	{
 		0x00000063, 0x0000007c, 0x00000077, 0x0000007b,
 		0x000000f2, 0x0000006b, 0x0000006f, 0x000000c5,
@@ -591,7 +591,7 @@ __visible const u32 crypto_fl_tab[4][256] = {
 	}
 };

-__visible const u32 crypto_it_tab[4][256] = {
+__visible const u32 crypto_it_tab[4][256] __cacheline_aligned = {
 	{
 		0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a,
 		0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b,
@@ -855,7 +855,7 @@ __visible const u32 crypto_it_tab[4][256] = {
 	}
 };

-__visible const u32 crypto_il_tab[4][256] = {
+__visible const u32 crypto_il_tab[4][256] __cacheline_aligned = {
 	{
 		0x00000052, 0x00000009, 0x0000006a, 0x000000d5,
 		0x00000030, 0x00000036, 0x000000a5, 0x00000038,

> Fixes: b5e0b032b6c3 ("crypto: aes - add generic time invariant AES cipher")
> Signed-off-by: Eric Biggers <ebiggers@xxxxxxxxxx>
> ---
>  crypto/aes_ti.c | 18 ++++++++++++++++++
>  1 file changed, 18 insertions(+)
>
> diff --git a/crypto/aes_ti.c b/crypto/aes_ti.c
> index 03023b2290e8e..81b604419ee0e 100644
> --- a/crypto/aes_ti.c
> +++ b/crypto/aes_ti.c
> @@ -269,6 +269,7 @@ static void aesti_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
>  	const u32 *rkp = ctx->key_enc + 4;
>  	int rounds = 6 + ctx->key_length / 4;
>  	u32 st0[4], st1[4];
> +	unsigned long flags;
>  	int round;
>
>  	st0[0] = ctx->key_enc[0] ^ get_unaligned_le32(in);
> @@ -276,6 +277,12 @@ static void aesti_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
>  	st0[2] = ctx->key_enc[2] ^ get_unaligned_le32(in + 8);
>  	st0[3] = ctx->key_enc[3] ^ get_unaligned_le32(in + 12);
>
> +	/*
> +	 * Disable interrupts (including preemption) while the sbox is loaded
> +	 * into L1 cache and used for encryption on this CPU.
> +	 */
> +	local_irq_save(flags);
> +
>  	st0[0] ^= __aesti_sbox[ 0] ^ __aesti_sbox[128];
>  	st0[1] ^= __aesti_sbox[32] ^ __aesti_sbox[160];
>  	st0[2] ^= __aesti_sbox[64] ^ __aesti_sbox[192];
> @@ -300,6 +307,8 @@ static void aesti_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
>  	put_unaligned_le32(subshift(st1, 1) ^ rkp[5], out + 4);
>  	put_unaligned_le32(subshift(st1, 2) ^ rkp[6], out + 8);
>  	put_unaligned_le32(subshift(st1, 3) ^ rkp[7], out + 12);
> +
> +	local_irq_restore(flags);
>  }
>
>  static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
> @@ -308,6 +317,7 @@ static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
>  	const u32 *rkp = ctx->key_dec + 4;
>  	int rounds = 6 + ctx->key_length / 4;
>  	u32 st0[4], st1[4];
> +	unsigned long flags;
>  	int round;
>
>  	st0[0] = ctx->key_dec[0] ^ get_unaligned_le32(in);
> @@ -315,6 +325,12 @@ static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
>  	st0[2] = ctx->key_dec[2] ^ get_unaligned_le32(in + 8);
>  	st0[3] = ctx->key_dec[3] ^ get_unaligned_le32(in + 12);
>
> +	/*
> +	 * Disable interrupts (including preemption) while the sbox is loaded
> +	 * into L1 cache and used for decryption on this CPU.
> +	 */
> +	local_irq_save(flags);
> +
>  	st0[0] ^= __aesti_inv_sbox[ 0] ^ __aesti_inv_sbox[128];
>  	st0[1] ^= __aesti_inv_sbox[32] ^ __aesti_inv_sbox[160];
>  	st0[2] ^= __aesti_inv_sbox[64] ^ __aesti_inv_sbox[192];
> @@ -339,6 +355,8 @@ static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
>  	put_unaligned_le32(inv_subshift(st1, 1) ^ rkp[5], out + 4);
>  	put_unaligned_le32(inv_subshift(st1, 2) ^ rkp[6], out + 8);
>  	put_unaligned_le32(inv_subshift(st1, 3) ^ rkp[7], out + 12);
> +
> +	local_irq_restore(flags);
>  }
>
>  static struct crypto_alg aes_alg = {
> --
> 2.19.0
>
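As an aside, and purely for illustration (not part of either patch): the
prefetch trick that the .irpc loop above does in assembly can be sketched
in plain C as below. The preload_table() helper, the aes_tab[] stand-in
for the 1024-byte table, and the 32-byte stride are assumptions made up
for the example; in kernel context the loads would sit inside the same
local_irq_save()/local_irq_restore() pair as in the aes-cipher-glue.c
hunk, so nothing can evict the table before the block is processed.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define PRELOAD_STRIDE	32		/* bytes touched per load; assumed cache line size */

static uint32_t aes_tab[256];		/* stand-in for the 1024-byte lookup table */

/* Pull the whole table into the L1 data cache with one load per cache line. */
static uint32_t preload_table(const uint32_t *tab, size_t bytes)
{
	const volatile uint32_t *p = tab;	/* volatile keeps the loads from being elided */
	uint32_t acc = 0;
	size_t i;

	for (i = 0; i < bytes / sizeof(*tab); i += PRELOAD_STRIDE / sizeof(*tab))
		acc ^= p[i];

	return acc;
}

int main(void)
{
	/* In the kernel this would be bracketed by local_irq_save()/restore(). */
	uint32_t acc = preload_table(aes_tab, sizeof(aes_tab));

	printf("preloaded %zu bytes (acc=%u)\n", sizeof(aes_tab), (unsigned)acc);
	return 0;
}

A 32-byte stride means 32 loads for a 1 KiB table, which is also
conservative enough to cover parts with 64-byte cache lines.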