On Sat, Feb 15, 2025 at 07:38:16PM -0800, Eric Biggers wrote:
>
> This new version hasn't fundamentally changed anything.  It's still a much
> worse, unnecessarily complex and still incomplete implementation compared to my
> patchset which has been ready to go for nearly a year already.  Please refer to
> all the previous feedback that I've given.

FWIW, my interface is a lot simpler than yours to implement, since it
doesn't deal with the partial-buffer nonsense in assembly.  In fact, that
was a big mistake in the original API: the partial data handling should've
been moved to the API layer a long time ago.

Here is the result for sha256-ni-mb with my code:

testing speed of multibuffer sha256 (sha256-ni-mb)
[   73.212300] tcrypt: test  0 (   16 byte blocks,   16 bytes per update,   1 updates):      1 operation in   284 cycles (16 bytes)
[   73.212805] tcrypt: test  2 (   64 byte blocks,   64 bytes per update,   1 updates):      1 operation in   311 cycles (64 bytes)
[   73.213256] tcrypt: test  5 (  256 byte blocks,  256 bytes per update,   1 updates):      1 operation in   481 cycles (256 bytes)
[   73.213715] tcrypt: test  8 ( 1024 byte blocks, 1024 bytes per update,   1 updates):      1 operation in  1209 cycles (1024 bytes)
[   73.214181] tcrypt: test 12 ( 2048 byte blocks, 2048 bytes per update,   1 updates):      1 operation in 36107 cycles (2048 bytes)
[   73.214904] tcrypt: test 16 ( 4096 byte blocks, 4096 bytes per update,   1 updates):      1 operation in  4097 cycles (4096 bytes)
[   73.215416] tcrypt: test 21 ( 8192 byte blocks, 8192 bytes per update,   1 updates):      1 operation in  7991 cycles (8192 bytes)

[   86.522453] tcrypt: testing speed of multibuffer sha256 (sha256-ni)
[   86.522737] tcrypt: test  0 (   16 byte blocks,   16 bytes per update,   1 updates):      1 operation in   224 cycles (16 bytes)
[   86.523164] tcrypt: test  2 (   64 byte blocks,   64 bytes per update,   1 updates):      1 operation in   296 cycles (64 bytes)
[   86.523586] tcrypt: test  5 (  256 byte blocks,  256 bytes per update,   1 updates):      1 operation in   531 cycles (256 bytes)
[   86.524012] tcrypt: test  8 ( 1024 byte blocks, 1024 bytes per update,   1 updates):      1 operation in  1466 cycles (1024 bytes)
[   86.524602] tcrypt: test 12 ( 2048 byte blocks, 2048 bytes per update,   1 updates):      1 operation in  2911 cycles (2048 bytes)
[   86.525199] tcrypt: test 16 ( 4096 byte blocks, 4096 bytes per update,   1 updates):      1 operation in  5566 cycles (4096 bytes)
[   86.525806] tcrypt: test 21 ( 8192 byte blocks, 8192 bytes per update,   1 updates):      1 operation in 10901 cycles (8192 bytes)

So about a 36% jump in throughput at 4K (5566 -> 4097 cycles for 4096
bytes) on my aging Intel CPU.

Cheers,

diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S
index d515a55a3bc1..10771d957116 100644
--- a/arch/x86/crypto/sha256_ni_asm.S
+++ b/arch/x86/crypto/sha256_ni_asm.S
@@ -174,6 +174,216 @@ SYM_TYPED_FUNC_START(sha256_ni_transform)
         RET
 SYM_FUNC_END(sha256_ni_transform)
 
+#undef DIGEST_PTR
+#undef DATA_PTR
+#undef NUM_BLKS
+#undef SHA256CONSTANTS
+#undef MSG
+#undef STATE0
+#undef STATE1
+#undef MSG0
+#undef MSG1
+#undef MSG2
+#undef MSG3
+#undef TMP
+#undef SHUF_MASK
+#undef ABEF_SAVE
+#undef CDGH_SAVE
+
+// parameters for sha256_ni_x2()
+#define MBCTX           %rdi
+#define BLOCKS          %rsi
+#define DATA1           %rdx
+#define DATA2           %rcx
+
+// other scalar variables
+#define SHA256CONSTANTS %rax
+#define COUNT           %r10
+#define COUNT32         %r10d
+
+// rbx is used as a temporary.
+
+#define MSG             %xmm0  // sha256rnds2 implicit operand
+#define STATE0_A        %xmm1
+#define STATE1_A        %xmm2
+#define STATE0_B        %xmm3
+#define STATE1_B        %xmm4
+#define TMP_A           %xmm5
+#define TMP_B           %xmm6
+#define MSG0_A          %xmm7
+#define MSG1_A          %xmm8
+#define MSG2_A          %xmm9
+#define MSG3_A          %xmm10
+#define MSG0_B          %xmm11
+#define MSG1_B          %xmm12
+#define MSG2_B          %xmm13
+#define MSG3_B          %xmm14
+#define SHUF_MASK       %xmm15
+
+#define OFFSETOF_STATEA 0       // offsetof(struct sha256_x2_mbctx, state[0])
+#define OFFSETOF_STATEB 32      // offsetof(struct sha256_x2_mbctx, state[1])
+#define OFFSETOF_INPUT0 64      // offsetof(struct sha256_x2_mbctx, input[0])
+#define OFFSETOF_INPUT1 72      // offsetof(struct sha256_x2_mbctx, input[1])
+
+// Do 4 rounds of SHA-256 for each of two messages (interleaved).  m0_a and m0_b
+// contain the current 4 message schedule words for the first and second message
+// respectively.
+//
+// If not all the message schedule words have been computed yet, then this also
+// computes 4 more message schedule words for each message.  m1_a-m3_a contain
+// the next 3 groups of 4 message schedule words for the first message, and
+// likewise m1_b-m3_b for the second.  After consuming the current value of
+// m0_a, this macro computes the group after m3_a and writes it to m0_a, and
+// likewise for *_b.  This means that the next (m0_a, m1_a, m2_a, m3_a) is the
+// current (m1_a, m2_a, m3_a, m0_a), and likewise for *_b, so the caller must
+// cycle through the registers accordingly.
+.macro do_4rounds_2x    i, m0_a, m1_a, m2_a, m3_a, m0_b, m1_b, m2_b, m3_b
+        movdqa          (\i-32)*4(SHA256CONSTANTS), TMP_A
+        movdqa          TMP_A, TMP_B
+        paddd           \m0_a, TMP_A
+        paddd           \m0_b, TMP_B
+.if \i < 48
+        sha256msg1      \m1_a, \m0_a
+        sha256msg1      \m1_b, \m0_b
+.endif
+        movdqa          TMP_A, MSG
+        sha256rnds2     STATE0_A, STATE1_A
+        movdqa          TMP_B, MSG
+        sha256rnds2     STATE0_B, STATE1_B
+        pshufd          $0x0E, TMP_A, MSG
+        sha256rnds2     STATE1_A, STATE0_A
+        pshufd          $0x0E, TMP_B, MSG
+        sha256rnds2     STATE1_B, STATE0_B
+.if \i < 48
+        movdqa          \m3_a, TMP_A
+        movdqa          \m3_b, TMP_B
+        palignr         $4, \m2_a, TMP_A
+        palignr         $4, \m2_b, TMP_B
+        paddd           TMP_A, \m0_a
+        paddd           TMP_B, \m0_b
+        sha256msg2      \m3_a, \m0_a
+        sha256msg2      \m3_b, \m0_b
+.endif
+.endm
+
+//
+// void sha256_ni_x2(struct sha256_x2_mbctx *mbctx, int blocks)
+//
+// This function computes the SHA-256 digests of two messages that are
+// both |blocks| blocks long, starting from the individual initial states
+// in |mbctx|.
+//
+// The instructions for the two SHA-256 operations are interleaved.  On many
+// CPUs, this is almost twice as fast as hashing each message individually due
+// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
+//
+SYM_FUNC_START(sha256_ni_x2)
+        // Allocate 64 bytes of stack space, 16-byte aligned.
+        push            %rbx
+        push            %rbp
+        mov             %rsp, %rbp
+        sub             $64, %rsp
+        and             $~15, %rsp
+
+        // Load the shuffle mask for swapping the endianness of 32-bit words.
+        movdqa          PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
+
+        // Set up pointer to the round constants.
+        lea             K256+32*4(%rip), SHA256CONSTANTS
+
+        // Load the two initial states from mbctx->state[0] and mbctx->state[1].
+        movdqu          OFFSETOF_STATEA+0*16(MBCTX), STATE0_A   // DCBA
+        movdqu          OFFSETOF_STATEA+1*16(MBCTX), STATE1_A   // HGFE
+        movdqu          OFFSETOF_STATEB+0*16(MBCTX), STATE0_B   // DCBA
+        movdqu          OFFSETOF_STATEB+1*16(MBCTX), STATE1_B   // HGFE
+
+        movdqa          STATE0_A, TMP_A
+        movdqa          STATE0_B, TMP_B
+        punpcklqdq      STATE1_A, STATE0_A              // FEBA
+        punpcklqdq      STATE1_B, STATE0_B              // FEBA
+        punpckhqdq      TMP_A, STATE1_A                 // DCHG
+        punpckhqdq      TMP_B, STATE1_B                 // DCHG
+        pshufd          $0x1B, STATE0_A, STATE0_A       // ABEF
+        pshufd          $0x1B, STATE0_B, STATE0_B       // ABEF
+        pshufd          $0xB1, STATE1_A, STATE1_A       // CDGH
+        pshufd          $0xB1, STATE1_B, STATE1_B       // CDGH
+
+        mov             OFFSETOF_INPUT0+0(MBCTX), DATA1
+        mov             OFFSETOF_INPUT1+0(MBCTX), DATA2
+
+.Lfinup2x_loop:
+        // Load the next two data blocks.
+        movdqu          0*16(DATA1), MSG0_A
+        movdqu          0*16(DATA2), MSG0_B
+        movdqu          1*16(DATA1), MSG1_A
+        movdqu          1*16(DATA2), MSG1_B
+        movdqu          2*16(DATA1), MSG2_A
+        movdqu          2*16(DATA2), MSG2_B
+        movdqu          3*16(DATA1), MSG3_A
+        movdqu          3*16(DATA2), MSG3_B
+        add             $64, DATA1
+        add             $64, DATA2
+
+        // Convert the words of the data blocks from big endian.
+        pshufb          SHUF_MASK, MSG0_A
+        pshufb          SHUF_MASK, MSG0_B
+        pshufb          SHUF_MASK, MSG1_A
+        pshufb          SHUF_MASK, MSG1_B
+        pshufb          SHUF_MASK, MSG2_A
+        pshufb          SHUF_MASK, MSG2_B
+        pshufb          SHUF_MASK, MSG3_A
+        pshufb          SHUF_MASK, MSG3_B
+
+        // Save the original state for each block.
+        movdqa          STATE0_A, 0*16(%rsp)
+        movdqa          STATE0_B, 1*16(%rsp)
+        movdqa          STATE1_A, 2*16(%rsp)
+        movdqa          STATE1_B, 3*16(%rsp)
+
+        // Do the SHA-256 rounds on each block.
+.irp i, 0, 16, 32, 48
+        do_4rounds_2x   (\i + 0),  MSG0_A, MSG1_A, MSG2_A, MSG3_A, \
+                                   MSG0_B, MSG1_B, MSG2_B, MSG3_B
+        do_4rounds_2x   (\i + 4),  MSG1_A, MSG2_A, MSG3_A, MSG0_A, \
+                                   MSG1_B, MSG2_B, MSG3_B, MSG0_B
+        do_4rounds_2x   (\i + 8),  MSG2_A, MSG3_A, MSG0_A, MSG1_A, \
+                                   MSG2_B, MSG3_B, MSG0_B, MSG1_B
+        do_4rounds_2x   (\i + 12), MSG3_A, MSG0_A, MSG1_A, MSG2_A, \
+                                   MSG3_B, MSG0_B, MSG1_B, MSG2_B
+.endr
+
+        // Add the original state for each block.
+        paddd           0*16(%rsp), STATE0_A
+        paddd           1*16(%rsp), STATE0_B
+        paddd           2*16(%rsp), STATE1_A
+        paddd           3*16(%rsp), STATE1_B
+
+        // Update BLOCKS and loop back if more blocks remain.
+        sub             $1, BLOCKS
+        jne             .Lfinup2x_loop
+
+        // Write the two digests with all bytes in the correct order.
+        movdqa          STATE0_A, TMP_A
+        movdqa          STATE0_B, TMP_B
+        punpcklqdq      STATE1_A, STATE0_A              // GHEF
+        punpcklqdq      STATE1_B, STATE0_B
+        punpckhqdq      TMP_A, STATE1_A                 // ABCD
+        punpckhqdq      TMP_B, STATE1_B
+        pshufd          $0xB1, STATE0_A, STATE0_A       // HGFE
+        pshufd          $0xB1, STATE0_B, STATE0_B
+        pshufd          $0x1B, STATE1_A, STATE1_A       // DCBA
+        pshufd          $0x1B, STATE1_B, STATE1_B
+        movdqu          STATE0_A, OFFSETOF_STATEA+1*16(MBCTX)
+        movdqu          STATE0_B, OFFSETOF_STATEB+1*16(MBCTX)
+        movdqu          STATE1_A, OFFSETOF_STATEA+0*16(MBCTX)
+        movdqu          STATE1_B, OFFSETOF_STATEB+0*16(MBCTX)
+
+        mov             %rbp, %rsp
+        pop             %rbp
+        pop             %rbx
+        RET
+SYM_FUNC_END(sha256_ni_x2)
+
 .section .rodata.cst256.K256, "aM", @progbits, 256
 .align 64
 K256:
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c
index e634b89a5123..d578fd98a0d6 100644
--- a/arch/x86/crypto/sha256_ssse3_glue.c
+++ b/arch/x86/crypto/sha256_ssse3_glue.c
@@ -41,6 +41,11 @@
 #include <asm/cpu_device_id.h>
 #include <asm/simd.h>
 
+struct sha256_x2_mbctx {
+        u32 state[2][8];
+        const u8 *input[2];
+};
+
 struct sha256_x8_mbctx {
         u32 state[8][8];
         const u8 *input[8];
@@ -558,7 +563,102 @@ static int sha256_mb_next(struct ahash_request *req, unsigned int len,
         return sha256_mb_fill(req, final);
 }
 
-static struct ahash_request *sha256_update_x8x1(
+static struct ahash_request *sha256_update_x1_post(
+        struct list_head *list, struct ahash_request *r2,
+        struct ahash_request **reqs, int width,
+        unsigned int len, bool nodata, bool final)
+{
+        int i = 0;
+
+        do {
+                struct sha256_reqctx *rctx = ahash_request_ctx(reqs[i]);
+
+                rctx->next = sha256_mb_next(reqs[i], len, final);
+
+                if (rctx->next) {
+                        if (++i >= width)
+                                break;
+                        continue;
+                }
+
+                if (i < width - 1 && reqs[i + 1]) {
+                        memmove(reqs + i, reqs + i + 1,
+                                sizeof(r2) * (width - i - 1));
+                        reqs[width - 1] = NULL;
+                        continue;
+                }
+
+                reqs[i] = NULL;
+
+                do {
+                        while (!list_is_last(&r2->base.list, list)) {
+                                r2 = list_next_entry(r2, base.list);
+                                r2->base.err = 0;
+
+                                rctx = ahash_request_ctx(r2);
+                                rctx->next = sha256_mb_start(r2, nodata, final);
+                                if (rctx->next) {
+                                        reqs[i] = r2;
+                                        break;
+                                }
+                        }
+                } while (reqs[i] && ++i < width);
+
+                break;
+        } while (reqs[i]);
+
+        return r2;
+}
+
+static int sha256_chain_pre(struct ahash_request **reqs, int width,
+                            struct ahash_request *req,
+                            bool nodata, bool final)
+{
+        struct sha256_reqctx *rctx = ahash_request_ctx(req);
+        struct ahash_request *r2;
+        int i;
+
+        req->base.err = 0;
+        reqs[0] = req;
+        rctx->next = sha256_mb_start(req, nodata, final);
+        i = !!rctx->next;
+        list_for_each_entry(r2, &req->base.list, base.list) {
+                struct sha256_reqctx *r2ctx = ahash_request_ctx(r2);
+
+                r2->base.err = 0;
+
+                r2ctx = ahash_request_ctx(r2);
+                r2ctx->next = sha256_mb_start(r2, nodata, final);
+                if (!r2ctx->next)
+                        continue;
+
+                reqs[i++] = r2;
+                if (i >= width)
+                        break;
+        }
+
+        return i;
+}
+
+static void sha256_chain_post(struct ahash_request *req, bool final)
+{
+        struct sha256_reqctx *rctx = ahash_request_ctx(req);
+        struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+        unsigned int ds = crypto_ahash_digestsize(tfm);
+        struct ahash_request *r2;
+
+        if (!final)
+                return;
+
+        lib_sha256_base_finish(&rctx->state, req->result, ds);
+        list_for_each_entry(r2, &req->base.list, base.list) {
+                struct sha256_reqctx *r2ctx = ahash_request_ctx(r2);
+
+                lib_sha256_base_finish(&r2ctx->state, r2->result, ds);
+        }
+}
+
+static struct ahash_request *sha256_avx2_update_x8x1(
         struct list_head *list, struct ahash_request *r2,
         struct ahash_request *reqs[8], bool nodata, bool final)
 {
@@ -613,97 +713,30 @@ static struct ahash_request *sha256_update_x8x1(
         }
 
 done:
-        i = 0;
-        do {
-                struct sha256_reqctx *rctx = ahash_request_ctx(reqs[i]);
-
-                rctx->next = sha256_mb_next(reqs[i], len, final);
-
-                if (rctx->next) {
-                        if (++i >= 8)
-                                break;
-                        continue;
-                }
-
-                if (i < 7 && reqs[i + 1]) {
-                        memmove(reqs + i, reqs + i + 1, sizeof(r2) * (7 - i));
-                        reqs[7] = NULL;
-                        continue;
-                }
-
-                reqs[i] = NULL;
-
-                do {
-                        while (!list_is_last(&r2->base.list, list)) {
-                                r2 = list_next_entry(r2, base.list);
-                                r2->base.err = 0;
-
-                                rctx = ahash_request_ctx(r2);
-                                rctx->next = sha256_mb_start(r2, nodata, final);
-                                if (rctx->next) {
-                                        reqs[i] = r2;
-                                        break;
-                                }
-                        }
-                } while (reqs[i] && ++i < 8);
-
-                break;
-        } while (reqs[i]);
-
-        return r2;
+        return sha256_update_x1_post(list, r2, reqs, 8, len, nodata, final);
 }
 
-static void sha256_update_x8(struct list_head *list,
-                             struct ahash_request *reqs[8], int i,
-                             bool nodata, bool final)
+static void sha256_avx2_update_x8(struct list_head *list,
+                                  struct ahash_request *reqs[8], int i,
+                                  bool nodata, bool final)
 {
         struct ahash_request *r2 = reqs[i - 1];
 
         do {
-                r2 = sha256_update_x8x1(list, r2, reqs, nodata, final);
+                r2 = sha256_avx2_update_x8x1(list, r2, reqs, nodata, final);
         } while (reqs[0]);
 }
 
-static void sha256_chain(struct ahash_request *req, bool nodata, bool final)
+static void sha256_avx2_chain(struct ahash_request *req, bool nodata, bool final)
 {
-        struct sha256_reqctx *rctx = ahash_request_ctx(req);
-        struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
-        unsigned int ds = crypto_ahash_digestsize(tfm);
         struct ahash_request *reqs[8] = {};
-        struct ahash_request *r2;
-        int i;
+        int blocks;
 
-        req->base.err = 0;
-        reqs[0] = req;
-        rctx->next = sha256_mb_start(req, nodata, final);
-        i = !!rctx->next;
-        list_for_each_entry(r2, &req->base.list, base.list) {
-                struct sha256_reqctx *r2ctx = ahash_request_ctx(r2);
+        blocks = sha256_chain_pre(reqs, 8, req, nodata, final);
+        if (blocks)
+                sha256_avx2_update_x8(&req->base.list, reqs, blocks, nodata, final);
 
-                r2->base.err = 0;
-
-                r2ctx = ahash_request_ctx(r2);
-                r2ctx->next = sha256_mb_start(r2, nodata, final);
-                if (!r2ctx->next)
-                        continue;
-
-                reqs[i++] = r2;
-                if (i >= 8)
-                        break;
-        }
-
-        if (i)
-                sha256_update_x8(&req->base.list, reqs, i, nodata, final);
-
-        if (!final)
-                return;
-
-        lib_sha256_base_finish(&rctx->state, req->result, ds);
-        list_for_each_entry(r2, &req->base.list, base.list) {
-                struct sha256_reqctx *r2ctx = ahash_request_ctx(r2);
-
-                lib_sha256_base_finish(&r2ctx->state, r2->result, ds);
-        }
+        sha256_chain_post(req, final);
 }
 
 static int sha256_avx2_update_mb(struct ahash_request *req)
@@ -712,7 +745,7 @@ static int sha256_avx2_update_mb(struct ahash_request *req)
         int err;
 
         if (ahash_request_chained(req) && crypto_simd_usable()) {
-                sha256_chain(req, false, false);
+                sha256_avx2_chain(req, false, false);
                 return 0;
         }
 
@@ -736,7 +769,7 @@ static int _sha256_avx2_finup(struct ahash_request *req, bool nodata)
         int err;
 
         if (ahash_request_chained(req) && crypto_simd_usable()) {
-                sha256_chain(req, nodata, true);
+                sha256_avx2_chain(req, nodata, true);
                 return 0;
         }
 
@@ -860,6 +893,7 @@ static void unregister_sha256_avx2(void)
 #ifdef CONFIG_AS_SHA256_NI
 asmlinkage void sha256_ni_transform(struct sha256_state *digest,
                                     const u8 *data, int rounds);
+asmlinkage void sha256_ni_x2(struct sha256_x2_mbctx *mbctx, int blocks);
 
 static int sha256_ni_update(struct shash_desc *desc, const u8 *data,
                             unsigned int len)
@@ -916,19 +950,207 @@ static struct shash_alg sha256_ni_algs[] = { {
         }
 } };
 
+static struct ahash_request *sha256_ni_update_x2x1(
+        struct list_head *list, struct ahash_request *r2,
+        struct ahash_request *reqs[2], bool nodata, bool final)
+{
+        struct sha256_state *states[2];
+        struct sha256_x2_mbctx mbctx;
+        unsigned int len = 0;
+        int i = 0;
+
+        do {
+                struct sha256_reqctx *rctx = ahash_request_ctx(reqs[i]);
+                unsigned int nbytes;
+
+                nbytes = rctx->next;
+                if (!i || nbytes < len)
+                        len = nbytes;
+
+                states[i] = &rctx->state;
+                memcpy(mbctx.state[i], states[i], 32);
+                mbctx.input[i] = rctx->input;
+        } while (++i < 2 && reqs[i]);
+
+        len &= ~(SHA256_BLOCK_SIZE - 1);
+
+        if (i < 2) {
+                sha256_ni_transform(states[0], mbctx.input[0],
+                                    len / SHA256_BLOCK_SIZE);
+                goto done;
+        }
+
+        sha256_ni_x2(&mbctx, len / SHA256_BLOCK_SIZE);
+
+        for (i = 0; i < 2; i++)
+                memcpy(states[i], mbctx.state[i], 32);
+
+done:
+        return sha256_update_x1_post(list, r2, reqs, 2, len, nodata, final);
+}
+
+static void sha256_ni_update_x2(struct list_head *list,
+                                struct ahash_request *reqs[2], int i,
+                                bool nodata, bool final)
+{
+        struct ahash_request *r2 = reqs[i - 1];
+
+        do {
+                r2 = sha256_ni_update_x2x1(list, r2, reqs, nodata, final);
+        } while (reqs[0]);
+}
+
+static void sha256_ni_chain(struct ahash_request *req, bool nodata, bool final)
+{
+        struct ahash_request *reqs[2] = {};
+        int blocks;
+
+        blocks = sha256_chain_pre(reqs, 2, req, nodata, final);
+        if (blocks)
+                sha256_ni_update_x2(&req->base.list, reqs, blocks, nodata, final);
+
+        sha256_chain_post(req, final);
+}
+
+static int sha256_ni_update_mb(struct ahash_request *req)
+{
+        struct ahash_request *r2;
+        int err;
+
+        if (ahash_request_chained(req) && crypto_simd_usable()) {
+                sha256_ni_chain(req, false, false);
+                return 0;
+        }
+
+        err = sha256_ahash_update(req, sha256_ni_transform);
+        if (!ahash_request_chained(req))
+                return err;
+
+        req->base.err = err;
+
+        list_for_each_entry(r2, &req->base.list, base.list) {
+                err = sha256_ahash_update(r2, sha256_ni_transform);
+                r2->base.err = err;
+        }
+
+        return 0;
+}
+
+static int _sha256_ni_finup(struct ahash_request *req, bool nodata)
+{
+        struct ahash_request *r2;
+        int err;
+
+        if (ahash_request_chained(req) && crypto_simd_usable()) {
+                sha256_ni_chain(req, nodata, true);
+                return 0;
+        }
+
+        err = sha256_ahash_finup(req, nodata, sha256_ni_transform);
+        if (!ahash_request_chained(req))
+                return err;
+
+        req->base.err = err;
+
+        list_for_each_entry(r2, &req->base.list, base.list) {
+                err = sha256_ahash_finup(r2, nodata, sha256_ni_transform);
+                r2->base.err = err;
+        }
+
+        return 0;
+}
+
+static int sha256_ni_finup_mb(struct ahash_request *req)
+{
+        return _sha256_ni_finup(req, false);
+}
+
+static int sha256_ni_final_mb(struct ahash_request *req)
+{
+        return _sha256_ni_finup(req, true);
+}
+
+static int sha256_ni_digest_mb(struct ahash_request *req)
+{
+        return sha256_ahash_init(req) ?:
+               sha256_ni_finup_mb(req);
+}
+
+static int sha224_ni_digest_mb(struct ahash_request *req)
+{
+        return sha224_ahash_init(req) ?:
+               sha256_ni_finup_mb(req);
+}
+
+static struct ahash_alg sha256_ni_mb_algs[] = { {
+        .halg.digestsize = SHA256_DIGEST_SIZE,
+        .halg.statesize  = sizeof(struct sha256_state),
+        .reqsize         = sizeof(struct sha256_reqctx),
+        .init            = sha256_ahash_init,
+        .update          = sha256_ni_update_mb,
+        .final           = sha256_ni_final_mb,
+        .finup           = sha256_ni_finup_mb,
+        .digest          = sha256_ni_digest_mb,
+        .import          = sha256_import,
+        .export          = sha256_export,
+        .halg.base       = {
+                .cra_name        = "sha256",
+                .cra_driver_name = "sha256-ni-mb",
+                .cra_priority    = 260,
+                .cra_blocksize   = SHA256_BLOCK_SIZE,
+                .cra_module      = THIS_MODULE,
+                .cra_flags       = CRYPTO_ALG_REQ_CHAIN,
+        }
+}, {
+        .halg.digestsize = SHA224_DIGEST_SIZE,
+        .halg.statesize  = sizeof(struct sha256_state),
+        .reqsize         = sizeof(struct sha256_reqctx),
+        .init            = sha224_ahash_init,
+        .update          = sha256_ni_update_mb,
+        .final           = sha256_ni_final_mb,
+        .finup           = sha256_ni_finup_mb,
+        .digest          = sha224_ni_digest_mb,
+        .import          = sha256_import,
+        .export          = sha256_export,
+        .halg.base       = {
+                .cra_name        = "sha224",
+                .cra_driver_name = "sha224-ni-mb",
+                .cra_priority    = 260,
+                .cra_blocksize   = SHA224_BLOCK_SIZE,
+                .cra_module      = THIS_MODULE,
+                .cra_flags       = CRYPTO_ALG_REQ_CHAIN,
+        }
+} };
+
 static int register_sha256_ni(void)
 {
-        if (boot_cpu_has(X86_FEATURE_SHA_NI))
-                return crypto_register_shashes(sha256_ni_algs,
-                                               ARRAY_SIZE(sha256_ni_algs));
-        return 0;
+        int err;
+
+        if (!boot_cpu_has(X86_FEATURE_SHA_NI))
+                return 0;
+
+        err = crypto_register_shashes(sha256_ni_algs,
+                                      ARRAY_SIZE(sha256_ni_algs));
+        if (err)
+                return err;
+
+        err = crypto_register_ahashes(sha256_ni_mb_algs,
+                                      ARRAY_SIZE(sha256_ni_mb_algs));
+        if (err)
+                crypto_unregister_shashes(sha256_ni_algs,
+                                          ARRAY_SIZE(sha256_ni_algs));
+
+        return err;
 }
 
 static void unregister_sha256_ni(void)
 {
-        if (boot_cpu_has(X86_FEATURE_SHA_NI))
-                crypto_unregister_shashes(sha256_ni_algs,
-                                          ARRAY_SIZE(sha256_ni_algs));
+        if (!boot_cpu_has(X86_FEATURE_SHA_NI))
+                return;
+
+        crypto_unregister_ahashes(sha256_ni_mb_algs,
+                                  ARRAY_SIZE(sha256_ni_mb_algs));
+        crypto_unregister_shashes(sha256_ni_algs, ARRAY_SIZE(sha256_ni_algs));
 }
 
 #else
-- 
Email: Herbert Xu <herbert@xxxxxxxxxxxxxxxxxxx>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
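
[Editor's illustrative sketch, not part of the patch above: it shows how the
two-lane entry point is meant to be driven -- copy two mid-hash states and two
block-aligned data pointers into a struct sha256_x2_mbctx, call sha256_ni_x2()
for however many full blocks both lanes share, then copy the updated states
back out.  The helper name sha256_ni_hash_2x_blocks() is hypothetical; in the
patch the real callers are sha256_ni_update_x2x1() and friends, which also
handle partial blocks and finalization in the API layer.]

#include <linux/string.h>       /* memcpy */
#include <asm/fpu/api.h>        /* kernel_fpu_begin/end */
#include <crypto/sha2.h>        /* SHA256_BLOCK_SIZE */

/* Hypothetical helper: hash 'blocks' full 64-byte blocks for two lanes. */
static void sha256_ni_hash_2x_blocks(u32 state1[8], const u8 *data1,
                                     u32 state2[8], const u8 *data2,
                                     int blocks)
{
        struct sha256_x2_mbctx mbctx;

        /* Seed both lanes with their current mid-hash states. */
        memcpy(mbctx.state[0], state1, 32);
        memcpy(mbctx.state[1], state2, 32);
        mbctx.input[0] = data1;
        mbctx.input[1] = data2;

        /* The SHA-NI code uses XMM registers, so wrap it in FPU begin/end. */
        kernel_fpu_begin();
        sha256_ni_x2(&mbctx, blocks);
        kernel_fpu_end();

        /* sha256_ni_x2() writes the updated states back into mbctx. */
        memcpy(state1, mbctx.state[0], 32);
        memcpy(state2, mbctx.state[1], 32);
}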