Looking for comments on if this is a good idea to add SHA-NI option to git, And if so, what'd be a good implementation. The attached patch is more of a proof of concept, using a sha-ni example found on internet and have it tapped into git's "hash-ll.h" when it sees make flags "SHA1_SHANI=1" and/or "SHA256_SHANI=1" for sha1/sha256 respectively. "git hash-object" is about 3-ish times faster for me There are few topics that i'm uncertain about. 1. Is it good idea to have machine-specific asm codes in git. I read there was commit fd1ec82547 which removed assembly implementation for PPC_SHA1. Does that imply maintainers doesn't want machine-specific assembly in source? 2. If it is cool to have machine-specific codes in git, what's a good logic for their on/off? Build time flag? Run time flag? Run time automatic detection to corresponding instructions? 3. If i can include machine-specific codes, what'd be a good way/idea to cleanup the asm? I suppose git maintainers do not want someone else's license header? Regards, Feiyang --- Makefile | 28 ++ hash-ll.h | 8 +- .../intel_sha_extensions_sha1_assembly.asm | 308 +++++++++++++++ sha1-x64shani/sha1_x64.c | 71 ++++ sha1-x64shani/sha1_x64.h | 21 + .../intel_sha_extensions_sha256_assembly.asm | 371 ++++++++++++++++++ sha256-x64shani/sha256_x64.c | 82 ++++ sha256-x64shani/sha256_x64.h | 23 ++ 8 files changed, 910 insertions(+), 2 deletions(-) create mode 100644 sha1-x64shani/intel_sha_extensions_sha1_assembly.asm create mode 100644 sha1-x64shani/sha1_x64.c create mode 100644 sha1-x64shani/sha1_x64.h create mode 100644 sha256-x64shani/intel_sha_extensions_sha256_assembly.asm create mode 100644 sha256-x64shani/sha256_x64.c create mode 100644 sha256-x64shani/sha256_x64.h diff --git a/Makefile b/Makefile index e440728c24..6892af2ee5 100644 --- a/Makefile +++ b/Makefile @@ -1917,6 +1917,11 @@ ifdef PPC_SHA1 $(error the PPC_SHA1 flag has been removed along with the PowerPC-specific SHA-1 implementation.) endif +ifdef SHA1_SHANI + LIB_OBJS += sha1-x64shani/sha1_x64.o + LIB_OBJS += sha1-x64shani/intel_sha_extensions_sha1_assembly.o + BASIC_CFLAGS += -DSHA1_SHANI +else ifdef OPENSSL_SHA1 EXTLIBS += $(LIB_4_CRYPTO) BASIC_CFLAGS += -DSHA1_OPENSSL @@ -1957,7 +1962,13 @@ endif endif endif endif +endif +ifdef SHA256_SHANI + LIB_OBJS += sha256-x64shani/sha256_x64.o + LIB_OBJS += sha256-x64shani/intel_sha_extensions_sha256_assembly.o + BASIC_CFLAGS += -DSHA256_SHANI +else ifdef OPENSSL_SHA256 EXTLIBS += $(LIB_4_CRYPTO) BASIC_CFLAGS += -DSHA256_OPENSSL @@ -1975,6 +1986,9 @@ else endif endif endif +endif + +$(info $$LIB_OBJS is [${LIB_OBJS}]) ifdef SHA1_MAX_BLOCK_SIZE LIB_OBJS += compat/sha1-chunked.o @@ -2711,6 +2725,20 @@ missing_compdb_dir = compdb_args = endif +$(info $$OBJECTS is [${OBJECTS}]) + +### for SHA-NI +YASM_SRC := $(wildcard $(OBJECTS:o=asm)) +YASM_OBJ := $(YASM_SRC:asm=o) +OBJECTS := $(filter-out $(YASM_OBJ),$(OBJECTS)) + +$(info $$YASM_SRC is [${YASM_SRC}]) +$(info $$YASM_OBJ is [${YASM_OBJ}]) +$(info $$OBJECTS is [${OBJECTS}]) + +$(YASM_OBJ): %.o: %.asm $(missing_dep_dirs) + $(QUIET_CC)yasm $< -f elf64 -X gnu -g dwarf2 -o $@ + $(OBJECTS): %.o: %.c GIT-CFLAGS $(missing_dep_dirs) $(missing_compdb_dir) $(QUIET_CC)$(CC) -o $*.o -c $(dep_args) $(compdb_args) $(ALL_CFLAGS) $(EXTRA_CPPFLAGS) $< diff --git a/hash-ll.h b/hash-ll.h index 8050925137..31d9e91f26 100644 --- a/hash-ll.h +++ b/hash-ll.h @@ -1,7 +1,9 @@ #ifndef HASH_LL_H #define HASH_LL_H -#if defined(SHA1_APPLE) +#if defined(SHA1_SHANI) +#include "sha1-x64shani/sha1_x64.h" +#elif defined(SHA1_APPLE) #include <CommonCrypto/CommonDigest.h> #elif defined(SHA1_OPENSSL) #include <openssl/sha.h> @@ -11,7 +13,9 @@ #include "block-sha1/sha1.h" #endif -#if defined(SHA256_NETTLE) +#if defined(SHA256_SHANI) +#include "sha256-x64shani/sha256_x64.h" +#elif defined(SHA256_NETTLE) #include "sha256/nettle.h" #elif defined(SHA256_GCRYPT) #define SHA256_NEEDS_CLONE_HELPER diff --git a/sha1-x64shani/intel_sha_extensions_sha1_assembly.asm b/sha1-x64shani/intel_sha_extensions_sha1_assembly.asm new file mode 100644 index 0000000000..ba59825966 --- /dev/null +++ b/sha1-x64shani/intel_sha_extensions_sha1_assembly.asm @@ -0,0 +1,308 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright (c) 2013, Intel Corporation +; +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are +; met: +; +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in the +; documentation and/or other materials provided with the +; distribution. +; +; * Neither the name of the Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived from +; this software without specific prior written permission. +; +; +; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY +; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; +; Intel SHA Extensions optimized implementation of a SHA-1 update function +; +; The function takes a pointer to the current hash values, a pointer to the +; input data, and a number of 64 byte blocks to process. Once all blocks have +; been processed, the digest pointer is updated with the resulting hash value. +; The function only processes complete blocks, there is no functionality to +; store partial blocks. All message padding and hash value initialization must +; be done outside the update function. +; +; The indented lines in the loop are instructions related to rounds processing. +; The non-indented lines are instructions related to the message schedule. +; +; Author: Sean Gulley <sean.m.gulley@xxxxxxxxx> +; Date: July 2013 +; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; +; Example YASM command lines: +; Windows: +; yasm -Xvc -f x64 -rnasm -pnasm -DWINABI -o intel_sha_extensions_sha1_assembly.obj -g cv8 intel_sha_extensions_sha1_assembly.asm +; Linux: +; yasm -f elf64 -X gnu -g dwarf2 -o intel_sha_extensions_sha1_assembly.o intel_sha_extensions_sha1_assembly.asm +; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%ifdef WINABI +%define DIGEST_PTR rcx ; 1st arg +%define DATA_PTR rdx ; 2nd arg +%define NUM_BLKS r8 ; 3rd arg +%else +%define DIGEST_PTR rdi ; 1st arg +%define DATA_PTR rsi ; 2nd arg +%define NUM_BLKS rdx ; 3rd arg +%endif + +%define RSPSAVE rax + +struc frame +%ifdef WINABI +.xmm_save: resdq 2 +%endif +.hash_save: resdq 2 +endstruc + +%define ABCD xmm0 +%define E0 xmm1 ; Need two E's b/c they ping pong +%define E1 xmm2 +%define MSG0 xmm3 +%define MSG1 xmm4 +%define MSG2 xmm5 +%define MSG3 xmm6 +%define SHUF_MASK xmm7 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void sha1_update(uint32_t *digest, const void *data, uint32_t numBlocks); +;; arg 1 : pointer to digest +;; arg 2 : pointer to input data +;; arg 3 : Number of blocks to process +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +section .text +global sha1_update:function +align 32 +sha1_update: + mov RSPSAVE, rsp + sub rsp, frame_size + and rsp, ~0xF + +%ifdef WINABI + movdqa [rsp + frame.xmm_save + 0*16], xmm6 + movdqa [rsp + frame.xmm_save + 1*16], xmm7 +%endif + + shl NUM_BLKS, 6 ; convert to bytes + jz .done_hash + add NUM_BLKS, DATA_PTR ; pointer to end of data + + ;; load initial hash values + pinsrd E0, [DIGEST_PTR + 1*16], 3 + movdqu ABCD, [DIGEST_PTR + 0*16] + pand E0, [UPPER_WORD_MASK wrt rip] + pshufd ABCD, ABCD, 0x1B + + movdqa SHUF_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip] + +.loop0: + ;; Save hash values for addition after rounds + movdqa [rsp + frame.hash_save + 0*16], E0 + movdqa [rsp + frame.hash_save + 1*16], ABCD + + ;; Rounds 0-3 + movdqu MSG0, [DATA_PTR + 0*16] + pshufb MSG0, SHUF_MASK + paddd E0, MSG0 + movdqa E1, ABCD + sha1rnds4 ABCD, E0, 0 + + ;; Rounds 4-7 + movdqu MSG1, [DATA_PTR + 1*16] + pshufb MSG1, SHUF_MASK + sha1nexte E1, MSG1 + movdqa E0, ABCD + sha1rnds4 ABCD, E1, 0 + sha1msg1 MSG0, MSG1 + + ;; Rounds 8-11 + movdqu MSG2, [DATA_PTR + 2*16] + pshufb MSG2, SHUF_MASK + sha1nexte E0, MSG2 + movdqa E1, ABCD + sha1rnds4 ABCD, E0, 0 + sha1msg1 MSG1, MSG2 + pxor MSG0, MSG2 + + ;; Rounds 12-15 + movdqu MSG3, [DATA_PTR + 3*16] + pshufb MSG3, SHUF_MASK + sha1nexte E1, MSG3 + movdqa E0, ABCD + sha1msg2 MSG0, MSG3 + sha1rnds4 ABCD, E1, 0 + sha1msg1 MSG2, MSG3 + pxor MSG1, MSG3 + + ;; Rounds 16-19 + sha1nexte E0, MSG0 + movdqa E1, ABCD + sha1msg2 MSG1, MSG0 + sha1rnds4 ABCD, E0, 0 + sha1msg1 MSG3, MSG0 + pxor MSG2, MSG0 + + ;; Rounds 20-23 + sha1nexte E1, MSG1 + movdqa E0, ABCD + sha1msg2 MSG2, MSG1 + sha1rnds4 ABCD, E1, 1 + sha1msg1 MSG0, MSG1 + pxor MSG3, MSG1 + + ;; Rounds 24-27 + sha1nexte E0, MSG2 + movdqa E1, ABCD + sha1msg2 MSG3, MSG2 + sha1rnds4 ABCD, E0, 1 + sha1msg1 MSG1, MSG2 + pxor MSG0, MSG2 + + ;; Rounds 28-31 + sha1nexte E1, MSG3 + movdqa E0, ABCD + sha1msg2 MSG0, MSG3 + sha1rnds4 ABCD, E1, 1 + sha1msg1 MSG2, MSG3 + pxor MSG1, MSG3 + + ;; Rounds 32-35 + sha1nexte E0, MSG0 + movdqa E1, ABCD + sha1msg2 MSG1, MSG0 + sha1rnds4 ABCD, E0, 1 + sha1msg1 MSG3, MSG0 + pxor MSG2, MSG0 + + ;; Rounds 36-39 + sha1nexte E1, MSG1 + movdqa E0, ABCD + sha1msg2 MSG2, MSG1 + sha1rnds4 ABCD, E1, 1 + sha1msg1 MSG0, MSG1 + pxor MSG3, MSG1 + + ;; Rounds 40-43 + sha1nexte E0, MSG2 + movdqa E1, ABCD + sha1msg2 MSG3, MSG2 + sha1rnds4 ABCD, E0, 2 + sha1msg1 MSG1, MSG2 + pxor MSG0, MSG2 + + ;; Rounds 44-47 + sha1nexte E1, MSG3 + movdqa E0, ABCD + sha1msg2 MSG0, MSG3 + sha1rnds4 ABCD, E1, 2 + sha1msg1 MSG2, MSG3 + pxor MSG1, MSG3 + + ;; Rounds 48-51 + sha1nexte E0, MSG0 + movdqa E1, ABCD + sha1msg2 MSG1, MSG0 + sha1rnds4 ABCD, E0, 2 + sha1msg1 MSG3, MSG0 + pxor MSG2, MSG0 + + ;; Rounds 52-55 + sha1nexte E1, MSG1 + movdqa E0, ABCD + sha1msg2 MSG2, MSG1 + sha1rnds4 ABCD, E1, 2 + sha1msg1 MSG0, MSG1 + pxor MSG3, MSG1 + + ;; Rounds 56-59 + sha1nexte E0, MSG2 + movdqa E1, ABCD + sha1msg2 MSG3, MSG2 + sha1rnds4 ABCD, E0, 2 + sha1msg1 MSG1, MSG2 + pxor MSG0, MSG2 + + ;; Rounds 60-63 + sha1nexte E1, MSG3 + movdqa E0, ABCD + sha1msg2 MSG0, MSG3 + sha1rnds4 ABCD, E1, 3 + sha1msg1 MSG2, MSG3 + pxor MSG1, MSG3 + + ;; Rounds 64-67 + sha1nexte E0, MSG0 + movdqa E1, ABCD + sha1msg2 MSG1, MSG0 + sha1rnds4 ABCD, E0, 3 + sha1msg1 MSG3, MSG0 + pxor MSG2, MSG0 + + ;; Rounds 68-71 + sha1nexte E1, MSG1 + movdqa E0, ABCD + sha1msg2 MSG2, MSG1 + sha1rnds4 ABCD, E1, 3 + pxor MSG3, MSG1 + + ;; Rounds 72-75 + sha1nexte E0, MSG2 + movdqa E1, ABCD + sha1msg2 MSG3, MSG2 + sha1rnds4 ABCD, E0, 3 + + ;; Rounds 76-79 + sha1nexte E1, MSG3 + movdqa E0, ABCD + sha1rnds4 ABCD, E1, 3 + + ;; Add current hash values with previously saved + sha1nexte E0, [rsp + frame.hash_save + 0*16] + paddd ABCD, [rsp + frame.hash_save + 1*16] + + ;; Increment data pointer and loop if more to process + add DATA_PTR, 64 + cmp DATA_PTR, NUM_BLKS + jne .loop0 + + ;; Write hash values back in the correct order + pshufd ABCD, ABCD, 0x1B + movdqu [DIGEST_PTR + 0*16], ABCD + pextrd [DIGEST_PTR + 1*16], E0, 3 + +.done_hash: +%ifdef WINABI + movdqa xmm6, [rsp + frame.xmm_save + 0*16] + movdqa xmm7, [rsp + frame.xmm_save + 1*16] +%endif + mov rsp, RSPSAVE + + ret + +section .data +align 64 +PSHUFFLE_BYTE_FLIP_MASK: ddq 0x000102030405060708090a0b0c0d0e0f +UPPER_WORD_MASK: ddq 0xFFFFFFFF000000000000000000000000 + diff --git a/sha1-x64shani/sha1_x64.c b/sha1-x64shani/sha1_x64.c new file mode 100644 index 0000000000..0ed0d49ed6 --- /dev/null +++ b/sha1-x64shani/sha1_x64.c @@ -0,0 +1,71 @@ +#include "../git-compat-util.h" +#include "sha1_x64.h" +#include <byteswap.h> + +// static inline void shani_SHA1_Init(shani_SHA_CTX *c); + +void shani_SHA1_Init(shani_SHA_CTX *c) +{ + static const uint32_t sha1InitialDigest[5] = { + 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0 + }; + memcpy(c->h, sha1InitialDigest, 20); + c->len = 0; + c->total_len = 0; +} + +void sha1_update(void *c, const void *d, int l); +void shani_SHA1_Update(shani_SHA_CTX *c, const void *data, unsigned long len) +{ + while (len > 0) + { + if (c->len == 0 && len >= 64) + { + sha1_update(c->h, data, len/64); + unsigned long bytes_hashed = (len/64)*64; + data += bytes_hashed; + len -= bytes_hashed; + c->total_len += bytes_hashed; + continue; + } +#define MIN(x, y) (((x) < (y)) ? (x) : (y)) + size_t cpy = MIN(64 - c->len, len); +#undef MIN + memcpy(c->msgbuf + c->len, data, cpy); + data += cpy; + len -= cpy; + c->len += cpy; + c->total_len += cpy; + + if (c->len == 64) + { + sha1_update(c->h, c->msgbuf, 1); + c->len = 0; + } + } +} + +void shani_SHA1_Final(unsigned char *md, shani_SHA_CTX *c) +{ + c->msgbuf[c->len] = 0x80; + ++c->len; + memset(c->msgbuf + c->len, 0, 64 - c->len); + if (c->len > (64-8)) + { + sha1_update(c->h, c->msgbuf, 1); + memset(c->msgbuf, 0, 64); + c->len = 0; + } + + for (int ibyte = 0; ibyte < 8; ++ibyte) + c->msgbuf[63-ibyte] = (uint8_t)((c->total_len * 8ULL) >> (ibyte*8)); + + sha1_update(c->h, c->msgbuf, 1); + c->len = 0; + + // flip endianness + int i; + for (i = 0; i < 20; i += 4) { + *(uint32_t *)(md + i) = __bswap_32(c->h[i / 4]); + } +} diff --git a/sha1-x64shani/sha1_x64.h b/sha1-x64shani/sha1_x64.h new file mode 100644 index 0000000000..ca1ec116cf --- /dev/null +++ b/sha1-x64shani/sha1_x64.h @@ -0,0 +1,21 @@ +/* + * SHA1 routine using x64 SHA-NI instructions + */ + +typedef struct shani_SHA_CTX +{ + uint32_t h[5]; + int len; + uint64_t total_len; + char msgbuf[64]; +} shani_SHA_CTX; + + +void shani_SHA1_Init(shani_SHA_CTX *ctx); +void shani_SHA1_Update(shani_SHA_CTX *ctx, const void *dataIn, size_t len); +void shani_SHA1_Final(unsigned char hashout[20], shani_SHA_CTX *ctx); + +#define platform_SHA_CTX shani_SHA_CTX +#define platform_SHA1_Init shani_SHA1_Init +#define platform_SHA1_Update shani_SHA1_Update +#define platform_SHA1_Final shani_SHA1_Final diff --git a/sha256-x64shani/intel_sha_extensions_sha256_assembly.asm b/sha256-x64shani/intel_sha_extensions_sha256_assembly.asm new file mode 100644 index 0000000000..b587e82154 --- /dev/null +++ b/sha256-x64shani/intel_sha_extensions_sha256_assembly.asm @@ -0,0 +1,371 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright (c) 2013, Intel Corporation +; +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are +; met: +; +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in the +; documentation and/or other materials provided with the +; distribution. +; +; * Neither the name of the Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived from +; this software without specific prior written permission. +; +; +; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY +; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; +; Intel SHA Extensions optimized implementation of a SHA-256 update function +; +; The function takes a pointer to the current hash values, a pointer to the +; input data, and a number of 64 byte blocks to process. Once all blocks have +; been processed, the digest pointer is updated with the resulting hash value. +; The function only processes complete blocks, there is no functionality to +; store partial blocks. All message padding and hash value initialization must +; be done outside the update function. +; +; The indented lines in the loop are instructions related to rounds processing. +; The non-indented lines are instructions related to the message schedule. +; +; Author: Sean Gulley <sean.m.gulley@xxxxxxxxx> +; Date: July 2013 +; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; +; Example YASM command lines: +; Windows: yasm -Xvc -f x64 -rnasm -pnasm -DWINABI -o intel_sha_extensions_sha256_assembly.obj -g cv8 intel_sha_extensions_sha256_assembly.asm +; Linux: yasm -f elf64 -X gnu -g dwarf2 -o intel_sha_extensions_sha256_assembly.o intel_sha_extensions_sha256_assembly.asm +; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%ifdef WINABI +%define DIGEST_PTR rcx ; 1st arg +%define DATA_PTR rdx ; 2nd arg +%define NUM_BLKS r8 ; 3rd arg +%else +%define DIGEST_PTR rdi ; 1st arg +%define DATA_PTR rsi ; 2nd arg +%define NUM_BLKS rdx ; 3rd arg +%endif + +%define SHA256CONSTANTS rax + +%ifdef WINABI +%define RSPSAVE r9 + +struc frame +.xmm_save: resdq 5 +endstruc +%endif + +%define MSG xmm0 +%define STATE0 xmm1 +%define STATE1 xmm2 +%define MSGTMP0 xmm3 +%define MSGTMP1 xmm4 +%define MSGTMP2 xmm5 +%define MSGTMP3 xmm6 +%define MSGTMP4 xmm7 + +%define SHUF_MASK xmm8 + +%define ABEF_SAVE xmm9 +%define CDGH_SAVE xmm10 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void sha256_update(uint32_t *digest, const void *data, uint32_t numBlocks); +;; arg 1 : pointer to digest +;; arg 2 : pointer to input data +;; arg 3 : Number of blocks to process +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +section .text +global sha256_update:function +align 32 +sha256_update: +%ifdef WINABI + mov RSPSAVE, rsp + sub rsp, frame_size + and rsp, ~0xF + + movdqa [rsp + frame.xmm_save + 0*16], xmm6 + movdqa [rsp + frame.xmm_save + 1*16], xmm7 + movdqa [rsp + frame.xmm_save + 2*16], xmm8 + movdqa [rsp + frame.xmm_save + 3*16], xmm9 + movdqa [rsp + frame.xmm_save + 4*16], xmm10 +%endif + + shl NUM_BLKS, 6 ; convert to bytes + jz .done_hash + add NUM_BLKS, DATA_PTR ; pointer to end of data + + ;; load initial hash values + ;; Need to reorder these appropriately + ;; DCBA, HGFE -> ABEF, CDGH + movdqu STATE0, [DIGEST_PTR + 0*16] + movdqu STATE1, [DIGEST_PTR + 1*16] + + pshufd STATE0, STATE0, 0xB1 ; CDAB + pshufd STATE1, STATE1, 0x1B ; EFGH + movdqa MSGTMP4, STATE0 + palignr STATE0, STATE1, 8 ; ABEF + pblendw STATE1, MSGTMP4, 0xF0 ; CDGH + + movdqa SHUF_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip] + lea SHA256CONSTANTS,[K256 wrt rip] + +.loop0: + ;; Save hash values for addition after rounds + movdqa ABEF_SAVE, STATE0 + movdqa CDGH_SAVE, STATE1 + + ;; Rounds 0-3 + movdqu MSG, [DATA_PTR + 0*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP0, MSG + paddd MSG, [SHA256CONSTANTS + 0*16] + sha256rnds2 STATE1, STATE0 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1 + + ;; Rounds 4-7 + movdqu MSG, [DATA_PTR + 1*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP1, MSG + paddd MSG, [SHA256CONSTANTS + 1*16] + sha256rnds2 STATE1, STATE0 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1 + sha256msg1 MSGTMP0, MSGTMP1 + + ;; Rounds 8-11 + movdqu MSG, [DATA_PTR + 2*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP2, MSG + paddd MSG, [SHA256CONSTANTS + 2*16] + sha256rnds2 STATE1, STATE0 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1 + sha256msg1 MSGTMP1, MSGTMP2 + + ;; Rounds 12-15 + movdqu MSG, [DATA_PTR + 3*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP3, MSG + paddd MSG, [SHA256CONSTANTS + 3*16] + sha256rnds2 STATE1, STATE0 + movdqa MSGTMP4, MSGTMP3 + palignr MSGTMP4, MSGTMP2, 4 + paddd MSGTMP0, MSGTMP4 + sha256msg2 MSGTMP0, MSGTMP3 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1 + sha256msg1 MSGTMP2, MSGTMP3 + + ;; Rounds 16-19 + movdqa MSG, MSGTMP0 + paddd MSG, [SHA256CONSTANTS + 4*16] + sha256rnds2 STATE1, STATE0 + movdqa MSGTMP4, MSGTMP0 + palignr MSGTMP4, MSGTMP3, 4 + paddd MSGTMP1, MSGTMP4 + sha256msg2 MSGTMP1, MSGTMP0 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1 + sha256msg1 MSGTMP3, MSGTMP0 + + ;; Rounds 20-23 + movdqa MSG, MSGTMP1 + paddd MSG, [SHA256CONSTANTS + 5*16] + sha256rnds2 STATE1, STATE0 + movdqa MSGTMP4, MSGTMP1 + palignr MSGTMP4, MSGTMP0, 4 + paddd MSGTMP2, MSGTMP4 + sha256msg2 MSGTMP2, MSGTMP1 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1 + sha256msg1 MSGTMP0, MSGTMP1 + + ;; Rounds 24-27 + movdqa MSG, MSGTMP2 + paddd MSG, [SHA256CONSTANTS + 6*16] + sha256rnds2 STATE1, STATE0 + movdqa MSGTMP4, MSGTMP2 + palignr MSGTMP4, MSGTMP1, 4 + paddd MSGTMP3, MSGTMP4 + sha256msg2 MSGTMP3, MSGTMP2 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1 + sha256msg1 MSGTMP1, MSGTMP2 + + ;; Rounds 28-31 + movdqa MSG, MSGTMP3 + paddd MSG, [SHA256CONSTANTS + 7*16] + sha256rnds2 STATE1, STATE0 + movdqa MSGTMP4, MSGTMP3 + palignr MSGTMP4, MSGTMP2, 4 + paddd MSGTMP0, MSGTMP4 + sha256msg2 MSGTMP0, MSGTMP3 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1 + sha256msg1 MSGTMP2, MSGTMP3 + + ;; Rounds 32-35 + movdqa MSG, MSGTMP0 + paddd MSG, [SHA256CONSTANTS + 8*16] + sha256rnds2 STATE1, STATE0 + movdqa MSGTMP4, MSGTMP0 + palignr MSGTMP4, MSGTMP3, 4 + paddd MSGTMP1, MSGTMP4 + sha256msg2 MSGTMP1, MSGTMP0 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1 + sha256msg1 MSGTMP3, MSGTMP0 + + ;; Rounds 36-39 + movdqa MSG, MSGTMP1 + paddd MSG, [SHA256CONSTANTS + 9*16] + sha256rnds2 STATE1, STATE0 + movdqa MSGTMP4, MSGTMP1 + palignr MSGTMP4, MSGTMP0, 4 + paddd MSGTMP2, MSGTMP4 + sha256msg2 MSGTMP2, MSGTMP1 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1 + sha256msg1 MSGTMP0, MSGTMP1 + + ;; Rounds 40-43 + movdqa MSG, MSGTMP2 + paddd MSG, [SHA256CONSTANTS + 10*16] + sha256rnds2 STATE1, STATE0 + movdqa MSGTMP4, MSGTMP2 + palignr MSGTMP4, MSGTMP1, 4 + paddd MSGTMP3, MSGTMP4 + sha256msg2 MSGTMP3, MSGTMP2 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1 + sha256msg1 MSGTMP1, MSGTMP2 + + ;; Rounds 44-47 + movdqa MSG, MSGTMP3 + paddd MSG, [SHA256CONSTANTS + 11*16] + sha256rnds2 STATE1, STATE0 + movdqa MSGTMP4, MSGTMP3 + palignr MSGTMP4, MSGTMP2, 4 + paddd MSGTMP0, MSGTMP4 + sha256msg2 MSGTMP0, MSGTMP3 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1 + sha256msg1 MSGTMP2, MSGTMP3 + + ;; Rounds 48-51 + movdqa MSG, MSGTMP0 + paddd MSG, [SHA256CONSTANTS + 12*16] + sha256rnds2 STATE1, STATE0 + movdqa MSGTMP4, MSGTMP0 + palignr MSGTMP4, MSGTMP3, 4 + paddd MSGTMP1, MSGTMP4 + sha256msg2 MSGTMP1, MSGTMP0 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1 + sha256msg1 MSGTMP3, MSGTMP0 + + ;; Rounds 52-55 + movdqa MSG, MSGTMP1 + paddd MSG, [SHA256CONSTANTS + 13*16] + sha256rnds2 STATE1, STATE0 + movdqa MSGTMP4, MSGTMP1 + palignr MSGTMP4, MSGTMP0, 4 + paddd MSGTMP2, MSGTMP4 + sha256msg2 MSGTMP2, MSGTMP1 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1 + + ;; Rounds 56-59 + movdqa MSG, MSGTMP2 + paddd MSG, [SHA256CONSTANTS + 14*16] + sha256rnds2 STATE1, STATE0 + movdqa MSGTMP4, MSGTMP2 + palignr MSGTMP4, MSGTMP1, 4 + paddd MSGTMP3, MSGTMP4 + sha256msg2 MSGTMP3, MSGTMP2 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1 + + ;; Rounds 60-63 + movdqa MSG, MSGTMP3 + paddd MSG, [SHA256CONSTANTS + 15*16] + sha256rnds2 STATE1, STATE0 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1 + + ;; Add current hash values with previously saved + paddd STATE0, ABEF_SAVE + paddd STATE1, CDGH_SAVE + + ;; Increment data pointer and loop if more to process + add DATA_PTR, 64 + cmp DATA_PTR, NUM_BLKS + jne .loop0 + + ;; Write hash values back in the correct order + pshufd STATE0, STATE0, 0x1B ; FEBA + pshufd STATE1, STATE1, 0xB1 ; DCHG + movdqa MSGTMP4, STATE0 + pblendw STATE0, STATE1, 0xF0 ; DCBA + palignr STATE1, MSGTMP4, 8 ; HGFE + + movdqu [DIGEST_PTR + 0*16], STATE0 + movdqu [DIGEST_PTR + 1*16], STATE1 + +.done_hash: +%ifdef WINABI + movdqa xmm6, [rsp + frame.xmm_save + 0*16] + movdqa xmm7, [rsp + frame.xmm_save + 1*16] + movdqa xmm8, [rsp + frame.xmm_save + 2*16] + movdqa xmm9, [rsp + frame.xmm_save + 3*16] + movdqa xmm10, [rsp + frame.xmm_save + 4*16] + mov rsp, RSPSAVE +%endif + + ret + +section .data +align 64 +K256: + dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203 + diff --git a/sha256-x64shani/sha256_x64.c b/sha256-x64shani/sha256_x64.c new file mode 100644 index 0000000000..01f3417719 --- /dev/null +++ b/sha256-x64shani/sha256_x64.c @@ -0,0 +1,82 @@ +#include "../git-compat-util.h" +#include "sha256_x64.h" +#include <byteswap.h> + +static int shani = -1; + +void shani_SHA256_Init(shani_SHA256_CTX *ctx) +{ + ctx->total_len = 0; + ctx->len = 0; + ctx->h[0] = 0x6a09e667ul; + ctx->h[1] = 0xbb67ae85ul; + ctx->h[2] = 0x3c6ef372ul; + ctx->h[3] = 0xa54ff53aul; + ctx->h[4] = 0x510e527ful; + ctx->h[5] = 0x9b05688cul; + ctx->h[6] = 0x1f83d9abul; + ctx->h[7] = 0x5be0cd19ul; + + if (shani != 1) { + shani = 1; + printf("set shani to 1\n"); + } else { + printf("shani is 1 already\n"); + } +} + +void sha256_update(uint32_t *digest, const void *data, uint32_t numBlocks); +void shani_SHA256_Update(shani_SHA256_CTX *c, const void *data, size_t len) +{ + while (len > 0) + { + if (c->len == 0 && len >= 64) + { + sha256_update(c->h, data, len/64); + unsigned long bytes_hashed = (len/64)*64; + data += bytes_hashed; + len -= bytes_hashed; + c->total_len += bytes_hashed; + continue; + } +#define MIN(x, y) (((x) < (y)) ? (x) : (y)) + size_t cpy = MIN(64 - c->len, len); +#undef MIN + memcpy(c->msgbuf + c->len, data, cpy); + data += cpy; + len -= cpy; + c->len += cpy; + c->total_len += cpy; + + if (c->len == 64) + { + sha256_update(c->h, c->msgbuf, 1); + c->len = 0; + } + } +} + +void shani_SHA256_Final(unsigned char *digest, shani_SHA256_CTX *c) +{ + c->msgbuf[c->len] = 0x80; + ++c->len; + memset(c->msgbuf + c->len, 0, 64 - c->len); + if (c->len > (64-8)) + { + sha256_update(c->h, c->msgbuf, 1); + memset(c->msgbuf, 0, 64); + c->len = 0; + } + + for (int ibyte = 0; ibyte < 8; ++ibyte) + c->msgbuf[63-ibyte] = (uint8_t)((c->total_len * 8ULL) >> (ibyte*8)); + + sha256_update(c->h, c->msgbuf, 1); + c->len = 0; + + // flip endianness + int i; + for (i = 0; i < 32; i += 4) { + *(uint32_t *)(digest + i) = __bswap_32(c->h[i / 4]); + } +} \ No newline at end of file diff --git a/sha256-x64shani/sha256_x64.h b/sha256-x64shani/sha256_x64.h new file mode 100644 index 0000000000..25f1ff67ba --- /dev/null +++ b/sha256-x64shani/sha256_x64.h @@ -0,0 +1,23 @@ +/* + * SHA256 routine using x64 SHA-NI instructions + */ + +#define SHANI_SHA256_BLKSIZE 64 + +struct shani_SHA256_CTX { + uint32_t h[8]; + int len; + uint64_t total_len; + char msgbuf[SHANI_SHA256_BLKSIZE]; +}; + +typedef struct shani_SHA256_CTX shani_SHA256_CTX; + +void shani_SHA256_Init(shani_SHA256_CTX *ctx); +void shani_SHA256_Update(shani_SHA256_CTX *c, const void *data, size_t len); +void shani_SHA256_Final(unsigned char *digest, shani_SHA256_CTX *c); + +#define platform_SHA256_CTX shani_SHA256_CTX +#define platform_SHA256_Init shani_SHA256_Init +#define platform_SHA256_Update shani_SHA256_Update +#define platform_SHA256_Final shani_SHA256_Final \ No newline at end of file -- 2.41.0