The S array does not need to be u32, u8 is enough On machine which have efficient unaligned access, use u8 to save some memory. So, provide a version optimized for memory usage in such a case. Based on my testing, the size of arc4_ctx is: u8 version: 264 u32 version: 1032 On my machine, the u8 version is about 5% faster. It save ~800 bytes of memory or stack depending on how arc4_ctx is stored. It is likely to be more cache friendly. It has been tested an Core i7-3770 with the following test program: #include <stdlib.h> #include <stdio.h> #include <string.h> #define u8 unsigned char #define u32 unsigned int #define true 1 struct arc4_ctx_8 { u8 S[256]; u32 x, y; }; struct arc4_ctx_32 { u32 S[256]; u32 x, y; }; #define S_type u8 int arc4_setkey_8(struct arc4_ctx_8 *ctx, const u8 *in_key, unsigned int key_len) { int i, j = 0, k = 0; ctx->x = 1; ctx->y = 0; for (i = 0; i < 256; i++) ctx->S[i] = i; for (i = 0; i < 256; i++) { S_type a = ctx->S[i]; j = (j + in_key[k] + a) & 0xff; ctx->S[i] = ctx->S[j]; ctx->S[j] = a; if (++k >= key_len) k = 0; } return 0; } void arc4_crypt_8(struct arc4_ctx_8 *ctx, u8 *out, const u8 *in, unsigned int len) { S_type *const S = ctx->S; S_type a, b, ta, tb; u32 x, y, ty; if (len == 0) return; x = ctx->x; y = ctx->y; a = S[x]; y = (y + a) & 0xff; b = S[y]; do { S[y] = a; a = (a + b) & 0xff; S[x] = b; x = (x + 1) & 0xff; ta = S[x]; ty = (y + ta) & 0xff; tb = S[ty]; *out++ = *in++ ^ S[a]; if (--len == 0) break; y = ty; a = ta; b = tb; } while (true); ctx->x = x; ctx->y = y; } #undef S_type #define S_type u32 int arc4_setkey_32(struct arc4_ctx_32 *ctx, const u8 *in_key, unsigned int key_len) { int i, j = 0, k = 0; ctx->x = 1; ctx->y = 0; for (i = 0; i < 256; i++) ctx->S[i] = i; for (i = 0; i < 256; i++) { S_type a = ctx->S[i]; j = (j + in_key[k] + a) & 0xff; ctx->S[i] = ctx->S[j]; ctx->S[j] = a; if (++k >= key_len) k = 0; } return 0; } void arc4_crypt_32(struct arc4_ctx_32 *ctx, u8 *out, const u8 *in, unsigned int len) { S_type *const S = ctx->S; S_type a, b, ta, tb; u32 x, y, ty; if (len == 0) return; x = ctx->x; y = ctx->y; a = S[x]; y = (y + a) & 0xff; b = S[y]; do { S[y] = a; a = (a + b) & 0xff; S[x] = b; x = (x + 1) & 0xff; ta = S[x]; ty = (y + ta) & 0xff; tb = S[ty]; *out++ = *in++ ^ S[a]; if (--len == 0) break; y = ty; a = ta; b = tb; } while (true); ctx->x = x; ctx->y = y; } #define KEY "AZERTY" #define in "AZERTYUIOP_QSDFGHJKLM_WXCVBN" int main() { long i; struct arc4_ctx_8 ctx_8; u8 out8[1024] = { }; struct arc4_ctx_32 ctx_32; u8 out32[1024] = { }; arc4_setkey_8(&ctx_8, KEY, strlen(KEY)); arc4_crypt_8(&ctx_8, out8, in, strlen(in)); arc4_setkey_32(&ctx_32, KEY, strlen(KEY)); arc4_crypt_32(&ctx_32, out32, in, strlen(in)); printf("%ld vs %ld\n", sizeof(ctx_8), sizeof(ctx_32)); if (memcmp(out8, out32, 1024) == 0) printf("Ok\n"); else printf("Broken\n"); return 0; } Signed-off-by: Christophe JAILLET <christophe.jaillet@xxxxxxxxxx> --- The idea came from code found in staging/rtl8712/ See at the top of: https://elixir.bootlin.com/linux/v5.12/source/drivers/staging/rtl8712/rtl871x_security.c More precisely, in an attempt to clean staging/rtl8712/, I triggered the kernel test robot about some increasing stack usage: https://lore.kernel.org/kernel-janitors/YHQUH+Nqc%2FzS14Tb@xxxxxxxxx/T/#m832a01a9d1517e7efc4f671ed46deae9993d6ae9 The above patch works for me, but should be taken as a RFC. --- include/crypto/arc4.h | 8 +++++++- lib/crypto/arc4.c | 8 ++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/include/crypto/arc4.h b/include/crypto/arc4.h index f3c22fe01704..39545ed486e2 100644 --- a/include/crypto/arc4.h +++ b/include/crypto/arc4.h @@ -12,8 +12,14 @@ #define ARC4_MAX_KEY_SIZE 256 #define ARC4_BLOCK_SIZE 1 +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) +#define S_type u8 +#else +#define S_type u32 +#endif + struct arc4_ctx { - u32 S[256]; + S_type S[256]; u32 x, y; }; diff --git a/lib/crypto/arc4.c b/lib/crypto/arc4.c index c2020f19c652..e0be0c2a08d9 100644 --- a/lib/crypto/arc4.c +++ b/lib/crypto/arc4.c @@ -21,7 +21,7 @@ int arc4_setkey(struct arc4_ctx *ctx, const u8 *in_key, unsigned int key_len) ctx->S[i] = i; for (i = 0; i < 256; i++) { - u32 a = ctx->S[i]; + S_type a = ctx->S[i]; j = (j + in_key[k] + a) & 0xff; ctx->S[i] = ctx->S[j]; @@ -36,9 +36,9 @@ EXPORT_SYMBOL(arc4_setkey); void arc4_crypt(struct arc4_ctx *ctx, u8 *out, const u8 *in, unsigned int len) { - u32 *const S = ctx->S; - u32 x, y, a, b; - u32 ty, ta, tb; + S_type *const S = ctx->S; + S_type a, b, ta, tb; + u32 x, y, ty; if (len == 0) return; -- 2.30.2