From: Eric Biggers <ebiggers@xxxxxxxxxx> - Load the SHA-256 round constants relative to a pointer that points into the middle of the constants rather than to the beginning. Since x86 instructions use signed offsets, this decreases the instruction length required to access some of the later round constants. - Use punpcklqdq or punpckhqdq instead of longer instructions such as pshufd, pblendw, and palignr. This doesn't harm performance. The end result is that sha256_ni_transform shrinks from 839 bytes to 791 bytes, with no loss in performance. Suggested-by: Stefan Kanthak <stefan.kanthak@xxxxxxxx> Signed-off-by: Eric Biggers <ebiggers@xxxxxxxxxx> --- arch/x86/crypto/sha256_ni_asm.S | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S index e485520e3b49..4d373069448d 100644 --- a/arch/x86/crypto/sha256_ni_asm.S +++ b/arch/x86/crypto/sha256_ni_asm.S @@ -82,19 +82,19 @@ pshufb SHUF_MASK, MSG movdqa MSG, \m0 .else movdqa \m0, MSG .endif - paddd \i*4(SHA256CONSTANTS), MSG + paddd (\i-32)*4(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 .if \i >= 12 && \i < 60 movdqa \m0, MSGTMP4 palignr $4, \m3, MSGTMP4 paddd MSGTMP4, \m1 sha256msg2 \m0, \m1 .endif - pshufd $0x0E, MSG, MSG + punpckhqdq MSG, MSG sha256rnds2 STATE1, STATE0 .if \i >= 4 && \i < 52 sha256msg1 \m0, \m3 .endif .endm @@ -133,21 +133,21 @@ SYM_TYPED_FUNC_START(sha256_ni_transform) /* * load initial hash values * Need to reorder these appropriately * DCBA, HGFE -> ABEF, CDGH */ - movdqu 0*16(DIGEST_PTR), STATE0 - movdqu 1*16(DIGEST_PTR), STATE1 + movdqu 0*16(DIGEST_PTR), STATE0 /* DCBA */ + movdqu 1*16(DIGEST_PTR), STATE1 /* HGFE */ - pshufd $0xB1, STATE0, STATE0 /* CDAB */ - pshufd $0x1B, STATE1, STATE1 /* EFGH */ movdqa STATE0, MSGTMP4 - palignr $8, STATE1, STATE0 /* ABEF */ - pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */ + punpcklqdq STATE1, STATE0 /* FEBA */ + punpckhqdq MSGTMP4, STATE1 /* DCHG */ + pshufd $0x1B, STATE0, STATE0 /* ABEF */ + pshufd $0xB1, STATE1, STATE1 /* CDGH */ movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK - lea K256(%rip), SHA256CONSTANTS + lea K256+32*4(%rip), SHA256CONSTANTS .Lloop0: /* Save hash values for addition after rounds */ movdqa STATE0, ABEF_SAVE movdqa STATE1, CDGH_SAVE @@ -165,18 +165,18 @@ SYM_TYPED_FUNC_START(sha256_ni_transform) add $64, DATA_PTR cmp NUM_BLKS, DATA_PTR jne .Lloop0 /* Write hash values back in the correct order */ - pshufd $0x1B, STATE0, STATE0 /* FEBA */ - pshufd $0xB1, STATE1, STATE1 /* DCHG */ movdqa STATE0, MSGTMP4 - pblendw $0xF0, STATE1, STATE0 /* DCBA */ - palignr $8, MSGTMP4, STATE1 /* HGFE */ + punpcklqdq STATE1, STATE0 /* GHEF */ + punpckhqdq MSGTMP4, STATE1 /* ABCD */ + pshufd $0xB1, STATE0, STATE0 /* HGFE */ + pshufd $0x1B, STATE1, STATE1 /* DCBA */ - movdqu STATE0, 0*16(DIGEST_PTR) - movdqu STATE1, 1*16(DIGEST_PTR) + movdqu STATE1, 0*16(DIGEST_PTR) + movdqu STATE0, 1*16(DIGEST_PTR) .Ldone_hash: RET SYM_FUNC_END(sha256_ni_transform) -- 2.44.0