According to Jean-Philippe Aumasson in his paper "Too Much Crypto" [1]: > "The best result on ChaCha is a key recovery attack on the 7-round version > with 2^237.7 time complexity using output data from 2^96 instances of ChaCha, > that is, 2^105 bytes of data." He then proposes that ChaCha use 8 rounds instead of 20, providing a 2.5x speed-up. As such, this patch adds chacha8_block and chacha12_block and switches the RNG from ChaCha20 to ChaCha8 to take advantage of that efficiency without sacrificing security. [1]: https://eprint.iacr.org/2019/1492 On my ThinkPad T480s with an Intel(R) Core(TM) i7-8650U CPU @ 1.90GHz, the measured /dev/urandom throughput improves by about 1.77x (492 MB/s to 873 MB/s), somewhat below the 2.5x that the reduction from 20 to 8 rounds alone would suggest. Without the patch: $ dd if=/dev/urandom of=/dev/null bs=32M count=300 300+0 records in 300+0 records out 10066329600 bytes (10 GB, 9.4 GiB) copied, 20.4806 s, 492 MB/s With the patch: $ dd if=/dev/urandom of=/dev/null bs=32M count=300 300+0 records in 300+0 records out 10066329600 bytes (10 GB, 9.4 GiB) copied, 11.5321 s, 873 MB/s Signed-off-by: Aaron Toponce <aaron.toponce@xxxxxxxxx> --- drivers/char/random.c | 8 ++++---- include/crypto/chacha.h | 14 ++++++++++++-- lib/crypto/chacha.c | 6 +++--- 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index 2597cb43f438..2e14a30b795f 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -302,7 +302,7 @@ static void crng_fast_key_erasure(u8 key[CHACHA_KEY_SIZE], chacha_init_consts(chacha_state); memcpy(&chacha_state[4], key, CHACHA_KEY_SIZE); memset(&chacha_state[12], 0, sizeof(u32) * 4); - chacha20_block(chacha_state, first_block); + chacha8_block(chacha_state, first_block); memcpy(key, first_block, CHACHA_KEY_SIZE); memcpy(random_data, first_block + CHACHA_KEY_SIZE, random_data_len); @@ -388,13 +388,13 @@ static void _get_random_bytes(void *buf, size_t len) while (len) { if (len < CHACHA_BLOCK_SIZE) { - chacha20_block(chacha_state, tmp); + chacha8_block(chacha_state, tmp); memcpy(buf, 
tmp, len); memzero_explicit(tmp, sizeof(tmp)); break; } - chacha20_block(chacha_state, buf); + chacha8_block(chacha_state, buf); if (unlikely(chacha_state[12] == 0)) ++chacha_state[13]; len -= CHACHA_BLOCK_SIZE; @@ -444,7 +444,7 @@ static ssize_t get_random_bytes_user(struct iov_iter *iter) } for (;;) { - chacha20_block(chacha_state, block); + chacha8_block(chacha_state, block); if (unlikely(chacha_state[12] == 0)) ++chacha_state[13]; diff --git a/include/crypto/chacha.h b/include/crypto/chacha.h index b3ea73b81944..64c45121c69a 100644 --- a/include/crypto/chacha.h +++ b/include/crypto/chacha.h @@ -8,8 +8,7 @@ * * The ChaCha paper specifies 20, 12, and 8-round variants. In general, it is * recommended to use the 20-round variant ChaCha20. However, the other - * variants can be needed in some performance-sensitive scenarios. The generic - * ChaCha code currently allows only the 20 and 12-round variants. + * variants can be needed in some performance-sensitive scenarios. */ #ifndef _CRYPTO_CHACHA_H @@ -31,11 +30,22 @@ #define XCHACHA_IV_SIZE 32 void chacha_block_generic(u32 *state, u8 *stream, int nrounds); + static inline void chacha20_block(u32 *state, u8 *stream) { chacha_block_generic(state, stream, 20); } +static inline void chacha12_block(u32 *state, u8 *stream) +{ + chacha_block_generic(state, stream, 12); +} + +static inline void chacha8_block(u32 *state, u8 *stream) +{ + chacha_block_generic(state, stream, 8); +} + void hchacha_block_arch(const u32 *state, u32 *out, int nrounds); void hchacha_block_generic(const u32 *state, u32 *out, int nrounds); diff --git a/lib/crypto/chacha.c b/lib/crypto/chacha.c index b748fd3d256e..15e773629f1d 100644 --- a/lib/crypto/chacha.c +++ b/lib/crypto/chacha.c @@ -18,7 +18,7 @@ static void chacha_permute(u32 *x, int nrounds) int i; /* whitelist the allowed round counts */ - WARN_ON_ONCE(nrounds != 20 && nrounds != 12); + WARN_ON_ONCE(nrounds != 20 && nrounds != 12 && nrounds != 8); for (i = 0; i < nrounds; i += 2) { x[0] += 
x[4]; x[12] = rol32(x[12] ^ x[0], 16); @@ -67,7 +67,7 @@ static void chacha_permute(u32 *x, int nrounds) * chacha_block_generic - generate one keystream block and increment block counter * @state: input state matrix (16 32-bit words) * @stream: output keystream block (64 bytes) - * @nrounds: number of rounds (20 or 12; 20 is recommended) + * @nrounds: number of rounds (20, 12, or 8; 20 is recommended) * * This is the ChaCha core, a function from 64-byte strings to 64-byte strings. * The caller has already converted the endianness of the input. This function @@ -93,7 +93,7 @@ EXPORT_SYMBOL(chacha_block_generic); * hchacha_block_generic - abbreviated ChaCha core, for XChaCha * @state: input state matrix (16 32-bit words) * @stream: output (8 32-bit words) - * @nrounds: number of rounds (20 or 12; 20 is recommended) + * @nrounds: number of rounds (20, 12, or 8; 20 is recommended) * * HChaCha is the ChaCha equivalent of HSalsa and is an intermediate step * towards XChaCha (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf). HChaCha -- 2.43.0