[PATCH] Add x64 SHA-NI implementation to sha1 and sha256 in git?

I'm looking for comments on whether it is a good idea to add a SHA-NI option to
git, and if so, what a good implementation would look like.

The attached patch is more of a proof of concept: it takes a SHA-NI example
found on the internet and taps it into git's "hash-ll.h" when the make flags
SHA1_SHANI=1 and/or SHA256_SHANI=1 are set for sha1/sha256 respectively
(e.g. "make SHA1_SHANI=1 SHA256_SHANI=1").

"git hash-object" is about 3-ish times faster for me

There are a few topics I'm uncertain about.

1. Is it a good idea to have machine-specific assembly code in git at all? I read
that commit fd1ec82547 removed the assembly implementation for PPC_SHA1. Does that
imply the maintainers do not want machine-specific assembly in the source tree?

2. If machine-specific code is acceptable in git, what is a good mechanism for
turning it on and off? A build-time flag? A run-time flag? Automatic run-time
detection of the corresponding instructions? (A rough detection sketch follows
after this list.)

3. If machine-specific code can be included, what would be a good way to clean up
the assembly? I suppose the git maintainers do not want someone else's license
header in the tree?

Regards,
Feiyang


---
 Makefile                                      |  28 ++
 hash-ll.h                                     |   8 +-
 .../intel_sha_extensions_sha1_assembly.asm    | 308 +++++++++++++++
 sha1-x64shani/sha1_x64.c                      |  71 ++++
 sha1-x64shani/sha1_x64.h                      |  21 +
 .../intel_sha_extensions_sha256_assembly.asm  | 371 ++++++++++++++++++
 sha256-x64shani/sha256_x64.c                  |  82 ++++
 sha256-x64shani/sha256_x64.h                  |  23 ++
 8 files changed, 910 insertions(+), 2 deletions(-)
 create mode 100644 sha1-x64shani/intel_sha_extensions_sha1_assembly.asm
 create mode 100644 sha1-x64shani/sha1_x64.c
 create mode 100644 sha1-x64shani/sha1_x64.h
 create mode 100644 sha256-x64shani/intel_sha_extensions_sha256_assembly.asm
 create mode 100644 sha256-x64shani/sha256_x64.c
 create mode 100644 sha256-x64shani/sha256_x64.h

diff --git a/Makefile b/Makefile
index e440728c24..6892af2ee5 100644
--- a/Makefile
+++ b/Makefile
@@ -1917,6 +1917,11 @@ ifdef PPC_SHA1
 $(error the PPC_SHA1 flag has been removed along with the PowerPC-specific SHA-1 implementation.)
 endif
 
+ifdef SHA1_SHANI
+	LIB_OBJS += sha1-x64shani/sha1_x64.o
+	LIB_OBJS += sha1-x64shani/intel_sha_extensions_sha1_assembly.o
+	BASIC_CFLAGS += -DSHA1_SHANI
+else
 ifdef OPENSSL_SHA1
 	EXTLIBS += $(LIB_4_CRYPTO)
 	BASIC_CFLAGS += -DSHA1_OPENSSL
@@ -1957,7 +1962,13 @@ endif
 endif
 endif
 endif
+endif
 
+ifdef SHA256_SHANI
+	LIB_OBJS += sha256-x64shani/sha256_x64.o
+	LIB_OBJS += sha256-x64shani/intel_sha_extensions_sha256_assembly.o
+	BASIC_CFLAGS += -DSHA256_SHANI
+else
 ifdef OPENSSL_SHA256
 	EXTLIBS += $(LIB_4_CRYPTO)
 	BASIC_CFLAGS += -DSHA256_OPENSSL
@@ -1975,6 +1986,9 @@ else
 endif
 endif
 endif
+endif
+
+$(info $$LIB_OBJS is [${LIB_OBJS}])
 
 ifdef SHA1_MAX_BLOCK_SIZE
 	LIB_OBJS += compat/sha1-chunked.o
@@ -2711,6 +2725,20 @@ missing_compdb_dir =
 compdb_args =
 endif
 
+$(info $$OBJECTS is [${OBJECTS}])
+
+### for SHA-NI 
+YASM_SRC := $(wildcard $(OBJECTS:o=asm))
+YASM_OBJ := $(YASM_SRC:asm=o)
+OBJECTS := $(filter-out $(YASM_OBJ),$(OBJECTS))
+
+$(info $$YASM_SRC is [${YASM_SRC}])
+$(info $$YASM_OBJ is [${YASM_OBJ}])
+$(info $$OBJECTS is [${OBJECTS}])
+
+$(YASM_OBJ): %.o: %.asm $(missing_dep_dirs)
+	$(QUIET_CC)yasm $< -f elf64 -X gnu -g dwarf2 -o $@
+
 $(OBJECTS): %.o: %.c GIT-CFLAGS $(missing_dep_dirs) $(missing_compdb_dir)
 	$(QUIET_CC)$(CC) -o $*.o -c $(dep_args) $(compdb_args) $(ALL_CFLAGS) $(EXTRA_CPPFLAGS) $<
 
diff --git a/hash-ll.h b/hash-ll.h
index 8050925137..31d9e91f26 100644
--- a/hash-ll.h
+++ b/hash-ll.h
@@ -1,7 +1,9 @@
 #ifndef HASH_LL_H
 #define HASH_LL_H
 
-#if defined(SHA1_APPLE)
+#if defined(SHA1_SHANI)
+#include "sha1-x64shani/sha1_x64.h"
+#elif defined(SHA1_APPLE)
 #include <CommonCrypto/CommonDigest.h>
 #elif defined(SHA1_OPENSSL)
 #include <openssl/sha.h>
@@ -11,7 +13,9 @@
 #include "block-sha1/sha1.h"
 #endif
 
-#if defined(SHA256_NETTLE)
+#if defined(SHA256_SHANI)
+#include "sha256-x64shani/sha256_x64.h"
+#elif defined(SHA256_NETTLE)
 #include "sha256/nettle.h"
 #elif defined(SHA256_GCRYPT)
 #define SHA256_NEEDS_CLONE_HELPER
diff --git a/sha1-x64shani/intel_sha_extensions_sha1_assembly.asm b/sha1-x64shani/intel_sha_extensions_sha1_assembly.asm
new file mode 100644
index 0000000000..ba59825966
--- /dev/null
+++ b/sha1-x64shani/intel_sha_extensions_sha1_assembly.asm
@@ -0,0 +1,308 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright (c) 2013, Intel Corporation 
+; 
+; All rights reserved. 
+; 
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are
+; met: 
+; 
+; * Redistributions of source code must retain the above copyright
+;   notice, this list of conditions and the following disclaimer.  
+; 
+; * Redistributions in binary form must reproduce the above copyright
+;   notice, this list of conditions and the following disclaimer in the
+;   documentation and/or other materials provided with the
+;   distribution. 
+; 
+; * Neither the name of the Intel Corporation nor the names of its
+;   contributors may be used to endorse or promote products derived from
+;   this software without specific prior written permission. 
+; 
+; 
+; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Intel SHA Extensions optimized implementation of a SHA-1 update function 
+;
+; The function takes a pointer to the current hash values, a pointer to the 
+; input data, and a number of 64 byte blocks to process.  Once all blocks have 
+; been processed, the digest pointer is  updated with the resulting hash value.
+; The function only processes complete blocks, there is no functionality to 
+; store partial blocks.  All message padding and hash value initialization must
+; be done outside the update function.  
+;
+; The indented lines in the loop are instructions related to rounds processing.
+; The non-indented lines are instructions related to the message schedule.
+;
+; Author: Sean Gulley <sean.m.gulley@xxxxxxxxx>
+; Date:   July 2013
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Example YASM command lines:
+; Windows:  
+;  yasm -Xvc -f x64 -rnasm -pnasm -DWINABI -o intel_sha_extensions_sha1_assembly.obj -g cv8 intel_sha_extensions_sha1_assembly.asm
+; Linux:    
+;  yasm -f elf64 -X gnu -g dwarf2 -o intel_sha_extensions_sha1_assembly.o intel_sha_extensions_sha1_assembly.asm
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifdef WINABI
+%define DIGEST_PTR	rcx 	; 1st arg
+%define DATA_PTR	rdx 	; 2nd arg
+%define NUM_BLKS 	r8	; 3rd arg
+%else
+%define DIGEST_PTR	rdi	; 1st arg
+%define DATA_PTR	rsi	; 2nd arg
+%define NUM_BLKS	rdx	; 3rd arg
+%endif
+
+%define RSPSAVE		rax
+
+struc frame
+%ifdef WINABI
+.xmm_save:	resdq	2
+%endif
+.hash_save:	resdq	2
+endstruc
+
+%define ABCD		xmm0
+%define E0		xmm1	; Need two E's b/c they ping pong
+%define E1		xmm2
+%define MSG0		xmm3
+%define MSG1		xmm4
+%define MSG2		xmm5
+%define MSG3		xmm6
+%define SHUF_MASK	xmm7
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void sha1_update(uint32_t *digest, const void *data, uint32_t numBlocks);
+;; arg 1 : pointer to digest
+;; arg 2 : pointer to input data
+;; arg 3 : Number of blocks to process
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+section .text
+global sha1_update:function
+align 32
+sha1_update:
+	mov		RSPSAVE, rsp
+	sub		rsp, frame_size
+	and		rsp, ~0xF
+
+%ifdef WINABI
+	movdqa		[rsp + frame.xmm_save + 0*16], xmm6
+	movdqa		[rsp + frame.xmm_save + 1*16], xmm7
+%endif
+
+	shl		NUM_BLKS, 6		; convert to bytes
+	jz		.done_hash
+	add		NUM_BLKS, DATA_PTR	; pointer to end of data
+
+	;; load initial hash values
+	pinsrd		E0,   [DIGEST_PTR + 1*16], 3
+	movdqu		ABCD, [DIGEST_PTR + 0*16]
+	pand		E0,   [UPPER_WORD_MASK wrt rip]
+	pshufd		ABCD, ABCD, 0x1B
+
+	movdqa		SHUF_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
+
+.loop0:
+	;; Save hash values for addition after rounds
+	movdqa		[rsp + frame.hash_save + 0*16], E0
+	movdqa		[rsp + frame.hash_save + 1*16], ABCD
+
+	;; Rounds 0-3
+	movdqu		MSG0, [DATA_PTR + 0*16]
+	pshufb		MSG0, SHUF_MASK
+		paddd		E0, MSG0
+		movdqa		E1, ABCD
+		sha1rnds4	ABCD, E0, 0
+
+	;; Rounds 4-7
+	movdqu		MSG1, [DATA_PTR + 1*16]
+	pshufb		MSG1, SHUF_MASK
+		sha1nexte	E1, MSG1
+		movdqa		E0, ABCD
+		sha1rnds4	ABCD, E1, 0
+	sha1msg1	MSG0, MSG1
+
+	;; Rounds 8-11
+	movdqu		MSG2, [DATA_PTR + 2*16]
+	pshufb		MSG2, SHUF_MASK
+		sha1nexte	E0, MSG2
+		movdqa		E1, ABCD
+		sha1rnds4	ABCD, E0, 0
+	sha1msg1	MSG1, MSG2
+	pxor		MSG0, MSG2
+
+	;; Rounds 12-15
+	movdqu		MSG3, [DATA_PTR + 3*16]
+	pshufb		MSG3, SHUF_MASK
+		sha1nexte	E1, MSG3
+		movdqa		E0, ABCD
+	sha1msg2	MSG0, MSG3
+		sha1rnds4	ABCD, E1, 0
+	sha1msg1	MSG2, MSG3
+	pxor		MSG1, MSG3
+
+	;; Rounds 16-19
+		sha1nexte	E0, MSG0
+		movdqa		E1, ABCD
+	sha1msg2	MSG1, MSG0
+		sha1rnds4	ABCD, E0, 0
+	sha1msg1	MSG3, MSG0
+	pxor		MSG2, MSG0
+	
+	;; Rounds 20-23
+		sha1nexte	E1, MSG1
+		movdqa		E0, ABCD
+	sha1msg2	MSG2, MSG1
+		sha1rnds4	ABCD, E1, 1
+	sha1msg1	MSG0, MSG1
+	pxor		MSG3, MSG1
+	
+	;; Rounds 24-27
+		sha1nexte	E0, MSG2
+		movdqa		E1, ABCD
+	sha1msg2	MSG3, MSG2
+		sha1rnds4	ABCD, E0, 1
+	sha1msg1	MSG1, MSG2
+	pxor		MSG0, MSG2
+	
+	;; Rounds 28-31
+		sha1nexte	E1, MSG3
+		movdqa		E0, ABCD
+	sha1msg2	MSG0, MSG3
+		sha1rnds4	ABCD, E1, 1
+	sha1msg1	MSG2, MSG3
+	pxor		MSG1, MSG3
+
+	;; Rounds 32-35
+		sha1nexte	E0, MSG0
+		movdqa		E1, ABCD
+	sha1msg2	MSG1, MSG0
+		sha1rnds4	ABCD, E0, 1
+	sha1msg1	MSG3, MSG0
+	pxor		MSG2, MSG0
+	
+	;; Rounds 36-39
+		sha1nexte	E1, MSG1
+		movdqa		E0, ABCD
+	sha1msg2	MSG2, MSG1
+		sha1rnds4	ABCD, E1, 1
+	sha1msg1	MSG0, MSG1
+	pxor		MSG3, MSG1
+	
+	;; Rounds 40-43
+		sha1nexte	E0, MSG2
+		movdqa		E1, ABCD
+	sha1msg2	MSG3, MSG2
+		sha1rnds4	ABCD, E0, 2
+	sha1msg1	MSG1, MSG2
+	pxor		MSG0, MSG2
+	
+	;; Rounds 44-47
+		sha1nexte	E1, MSG3
+		movdqa		E0, ABCD
+	sha1msg2	MSG0, MSG3
+		sha1rnds4	ABCD, E1, 2
+	sha1msg1	MSG2, MSG3
+	pxor		MSG1, MSG3
+
+	;; Rounds 48-51
+		sha1nexte	E0, MSG0
+		movdqa		E1, ABCD
+	sha1msg2	MSG1, MSG0
+		sha1rnds4	ABCD, E0, 2
+	sha1msg1	MSG3, MSG0
+	pxor		MSG2, MSG0
+	
+	;; Rounds 52-55
+		sha1nexte	E1, MSG1
+		movdqa		E0, ABCD
+	sha1msg2	MSG2, MSG1
+		sha1rnds4	ABCD, E1, 2
+	sha1msg1	MSG0, MSG1
+	pxor		MSG3, MSG1
+	
+	;; Rounds 56-59
+		sha1nexte	E0, MSG2
+		movdqa		E1, ABCD
+	sha1msg2	MSG3, MSG2
+		sha1rnds4	ABCD, E0, 2
+	sha1msg1	MSG1, MSG2
+	pxor		MSG0, MSG2
+	
+	;; Rounds 60-63
+		sha1nexte	E1, MSG3
+		movdqa		E0, ABCD
+	sha1msg2	MSG0, MSG3
+		sha1rnds4	ABCD, E1, 3
+	sha1msg1	MSG2, MSG3
+	pxor		MSG1, MSG3
+
+	;; Rounds 64-67
+		sha1nexte	E0, MSG0
+		movdqa		E1, ABCD
+	sha1msg2	MSG1, MSG0
+		sha1rnds4	ABCD, E0, 3
+	sha1msg1	MSG3, MSG0
+	pxor		MSG2, MSG0
+	
+	;; Rounds 68-71
+		sha1nexte	E1, MSG1
+		movdqa		E0, ABCD
+	sha1msg2	MSG2, MSG1
+		sha1rnds4	ABCD, E1, 3
+	pxor		MSG3, MSG1
+	
+	;; Rounds 72-75
+		sha1nexte	E0, MSG2
+		movdqa		E1, ABCD
+	sha1msg2	MSG3, MSG2
+		sha1rnds4	ABCD, E0, 3
+	
+	;; Rounds 76-79
+		sha1nexte	E1, MSG3
+		movdqa		E0, ABCD
+		sha1rnds4	ABCD, E1, 3
+
+	;; Add current hash values with previously saved
+	sha1nexte	E0,   [rsp + frame.hash_save + 0*16]
+	paddd		ABCD, [rsp + frame.hash_save + 1*16]
+
+	;; Increment data pointer and loop if more to process
+	add		DATA_PTR, 64
+	cmp		DATA_PTR, NUM_BLKS
+	jne		.loop0
+
+	;; Write hash values back in the correct order
+	pshufd		ABCD, ABCD, 0x1B
+	movdqu		[DIGEST_PTR + 0*16], ABCD
+	pextrd		[DIGEST_PTR + 1*16], E0, 3
+
+.done_hash:
+%ifdef WINABI
+	movdqa		xmm6, [rsp + frame.xmm_save + 0*16]
+	movdqa		xmm7, [rsp + frame.xmm_save + 1*16]
+%endif
+	mov		rsp, RSPSAVE
+
+	ret	
+
+section .data
+align 64
+PSHUFFLE_BYTE_FLIP_MASK: ddq 0x000102030405060708090a0b0c0d0e0f
+UPPER_WORD_MASK:         ddq 0xFFFFFFFF000000000000000000000000
+
diff --git a/sha1-x64shani/sha1_x64.c b/sha1-x64shani/sha1_x64.c
new file mode 100644
index 0000000000..0ed0d49ed6
--- /dev/null
+++ b/sha1-x64shani/sha1_x64.c
@@ -0,0 +1,71 @@
+#include "../git-compat-util.h"
+#include "sha1_x64.h"
+#include <byteswap.h>
+
+// static inline void shani_SHA1_Init(shani_SHA_CTX *c);
+
+void shani_SHA1_Init(shani_SHA_CTX *c)
+{
+    static const uint32_t sha1InitialDigest[5] = {
+           0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0
+    };
+    memcpy(c->h, sha1InitialDigest, 20);
+    c->len = 0;
+    c->total_len = 0;
+}
+
+void sha1_update(uint32_t *digest, const void *data, uint32_t numBlocks);
+void shani_SHA1_Update(shani_SHA_CTX *c, const void *data, size_t len)
+{
+    while (len > 0)
+    {
+        if (c->len == 0 && len >= 64)
+        {
+            sha1_update(c->h, data, len/64);
+            unsigned long bytes_hashed = (len/64)*64;
+            data += bytes_hashed;
+            len -= bytes_hashed;
+            c->total_len += bytes_hashed;
+            continue;
+        }
+#define MIN(x, y) (((x) < (y)) ? (x) : (y))
+        size_t cpy = MIN(64 - c->len, len);
+#undef MIN
+        memcpy(c->msgbuf + c->len, data, cpy);
+        data += cpy;
+        len -= cpy;
+        c->len += cpy;
+        c->total_len += cpy;
+
+        if (c->len == 64)
+        {
+            sha1_update(c->h, c->msgbuf, 1);
+            c->len = 0;
+        }
+    }
+}
+
+void shani_SHA1_Final(unsigned char *md, shani_SHA_CTX *c)
+{
+    c->msgbuf[c->len] = 0x80;
+    ++c->len;
+    memset(c->msgbuf + c->len, 0, 64 - c->len);
+    if (c->len > (64-8))
+    {
+        sha1_update(c->h, c->msgbuf, 1);
+        memset(c->msgbuf, 0, 64);
+        c->len = 0;
+    }
+
+    for (int ibyte = 0; ibyte < 8; ++ibyte)
+       c->msgbuf[63-ibyte] = (uint8_t)((c->total_len * 8ULL) >> (ibyte*8));
+
+    sha1_update(c->h, c->msgbuf, 1);
+    c->len = 0;    
+
+    // flip endianness
+    int i;
+    for (i = 0; i < 20; i += 4) {
+        *(uint32_t *)(md + i) = __bswap_32(c->h[i / 4]);
+    }
+}
diff --git a/sha1-x64shani/sha1_x64.h b/sha1-x64shani/sha1_x64.h
new file mode 100644
index 0000000000..ca1ec116cf
--- /dev/null
+++ b/sha1-x64shani/sha1_x64.h
@@ -0,0 +1,21 @@
+/*
+ * SHA1 routine using x64 SHA-NI instructions
+ */
+
+typedef struct shani_SHA_CTX
+{
+    uint32_t h[5];
+    int len;
+    uint64_t total_len;
+    char msgbuf[64];
+} shani_SHA_CTX;
+
+
+void shani_SHA1_Init(shani_SHA_CTX *ctx);
+void shani_SHA1_Update(shani_SHA_CTX *ctx, const void *dataIn, size_t len);
+void shani_SHA1_Final(unsigned char hashout[20], shani_SHA_CTX *ctx);
+
+#define platform_SHA_CTX	shani_SHA_CTX
+#define platform_SHA1_Init	shani_SHA1_Init
+#define platform_SHA1_Update	shani_SHA1_Update
+#define platform_SHA1_Final	shani_SHA1_Final
diff --git a/sha256-x64shani/intel_sha_extensions_sha256_assembly.asm b/sha256-x64shani/intel_sha_extensions_sha256_assembly.asm
new file mode 100644
index 0000000000..b587e82154
--- /dev/null
+++ b/sha256-x64shani/intel_sha_extensions_sha256_assembly.asm
@@ -0,0 +1,371 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright (c) 2013, Intel Corporation 
+; 
+; All rights reserved. 
+; 
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are
+; met: 
+; 
+; * Redistributions of source code must retain the above copyright
+;   notice, this list of conditions and the following disclaimer.  
+; 
+; * Redistributions in binary form must reproduce the above copyright
+;   notice, this list of conditions and the following disclaimer in the
+;   documentation and/or other materials provided with the
+;   distribution. 
+; 
+; * Neither the name of the Intel Corporation nor the names of its
+;   contributors may be used to endorse or promote products derived from
+;   this software without specific prior written permission. 
+; 
+; 
+; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Intel SHA Extensions optimized implementation of a SHA-256 update function 
+;
+; The function takes a pointer to the current hash values, a pointer to the 
+; input data, and a number of 64 byte blocks to process.  Once all blocks have 
+; been processed, the digest pointer is  updated with the resulting hash value.
+; The function only processes complete blocks, there is no functionality to 
+; store partial blocks.  All message padding and hash value initialization must
+; be done outside the update function.  
+;
+; The indented lines in the loop are instructions related to rounds processing.
+; The non-indented lines are instructions related to the message schedule.
+;
+; Author: Sean Gulley <sean.m.gulley@xxxxxxxxx>
+; Date:   July 2013
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Example YASM command lines:
+; Windows:  yasm -Xvc -f x64 -rnasm -pnasm -DWINABI -o intel_sha_extensions_sha256_assembly.obj -g cv8 intel_sha_extensions_sha256_assembly.asm
+; Linux:    yasm -f elf64 -X gnu -g dwarf2 -o intel_sha_extensions_sha256_assembly.o intel_sha_extensions_sha256_assembly.asm
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifdef WINABI
+%define DIGEST_PTR	rcx 	; 1st arg
+%define DATA_PTR	rdx 	; 2nd arg
+%define NUM_BLKS 	r8	; 3rd arg
+%else
+%define DIGEST_PTR	rdi	; 1st arg
+%define DATA_PTR	rsi	; 2nd arg
+%define NUM_BLKS	rdx	; 3rd arg
+%endif
+
+%define SHA256CONSTANTS	rax
+
+%ifdef WINABI
+%define RSPSAVE		r9
+
+struc frame
+.xmm_save:	resdq	5
+endstruc
+%endif
+
+%define MSG		xmm0
+%define STATE0		xmm1
+%define STATE1		xmm2
+%define MSGTMP0		xmm3
+%define MSGTMP1		xmm4
+%define MSGTMP2		xmm5
+%define MSGTMP3		xmm6
+%define MSGTMP4		xmm7
+
+%define SHUF_MASK	xmm8
+
+%define ABEF_SAVE	xmm9
+%define CDGH_SAVE	xmm10
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void sha256_update(uint32_t *digest, const void *data, uint32_t numBlocks);
+;; arg 1 : pointer to digest
+;; arg 2 : pointer to input data
+;; arg 3 : Number of blocks to process
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+section .text
+global sha256_update:function
+align 32
+sha256_update:
+%ifdef WINABI
+	mov		RSPSAVE, rsp
+	sub		rsp, frame_size
+	and		rsp, ~0xF
+
+	movdqa		[rsp + frame.xmm_save + 0*16], xmm6
+	movdqa		[rsp + frame.xmm_save + 1*16], xmm7
+	movdqa		[rsp + frame.xmm_save + 2*16], xmm8
+	movdqa		[rsp + frame.xmm_save + 3*16], xmm9
+	movdqa		[rsp + frame.xmm_save + 4*16], xmm10
+%endif
+
+	shl		NUM_BLKS, 6		; convert to bytes
+	jz		.done_hash
+	add		NUM_BLKS, DATA_PTR	; pointer to end of data
+
+	;; load initial hash values
+	;; Need to reorder these appropriately
+	;; DCBA, HGFE -> ABEF, CDGH
+	movdqu		STATE0, [DIGEST_PTR + 0*16]
+	movdqu		STATE1, [DIGEST_PTR + 1*16]
+
+	pshufd		STATE0,  STATE0,  0xB1	; CDAB
+	pshufd		STATE1,  STATE1,  0x1B	; EFGH
+	movdqa		MSGTMP4, STATE0
+	palignr		STATE0,  STATE1,  8	; ABEF
+	pblendw		STATE1,  MSGTMP4, 0xF0	; CDGH
+
+	movdqa		SHUF_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
+	lea		SHA256CONSTANTS,[K256 wrt rip]
+
+.loop0:
+	;; Save hash values for addition after rounds
+	movdqa		ABEF_SAVE, STATE0
+	movdqa		CDGH_SAVE, STATE1
+
+	;; Rounds 0-3
+	movdqu		MSG, [DATA_PTR + 0*16]
+	pshufb		MSG, SHUF_MASK
+	movdqa		MSGTMP0, MSG
+		paddd		MSG, [SHA256CONSTANTS + 0*16]
+		sha256rnds2	STATE1, STATE0
+		pshufd 		MSG, MSG, 0x0E
+		sha256rnds2	STATE0, STATE1
+
+	;; Rounds 4-7
+	movdqu		MSG, [DATA_PTR + 1*16]
+	pshufb		MSG, SHUF_MASK
+	movdqa		MSGTMP1, MSG
+		paddd		MSG, [SHA256CONSTANTS + 1*16]
+		sha256rnds2	STATE1, STATE0
+		pshufd 		MSG, MSG, 0x0E
+		sha256rnds2	STATE0, STATE1
+	sha256msg1	MSGTMP0, MSGTMP1
+
+	;; Rounds 8-11
+	movdqu		MSG, [DATA_PTR + 2*16]
+	pshufb		MSG, SHUF_MASK
+	movdqa		MSGTMP2, MSG
+		paddd		MSG, [SHA256CONSTANTS + 2*16]
+		sha256rnds2	STATE1, STATE0
+		pshufd 		MSG, MSG, 0x0E
+		sha256rnds2	STATE0, STATE1
+	sha256msg1	MSGTMP1, MSGTMP2
+
+	;; Rounds 12-15
+	movdqu		MSG, [DATA_PTR + 3*16]
+	pshufb		MSG, SHUF_MASK
+	movdqa		MSGTMP3, MSG
+		paddd		MSG, [SHA256CONSTANTS + 3*16]
+		sha256rnds2	STATE1, STATE0
+	movdqa		MSGTMP4, MSGTMP3
+	palignr		MSGTMP4, MSGTMP2, 4
+	paddd		MSGTMP0, MSGTMP4
+	sha256msg2	MSGTMP0, MSGTMP3
+		pshufd 		MSG, MSG, 0x0E
+		sha256rnds2	STATE0, STATE1
+	sha256msg1	MSGTMP2, MSGTMP3
+
+	;; Rounds 16-19
+	movdqa		MSG, MSGTMP0
+		paddd		MSG, [SHA256CONSTANTS + 4*16]
+		sha256rnds2	STATE1, STATE0
+	movdqa		MSGTMP4, MSGTMP0
+	palignr		MSGTMP4, MSGTMP3, 4
+	paddd		MSGTMP1, MSGTMP4
+	sha256msg2	MSGTMP1, MSGTMP0
+		pshufd 		MSG, MSG, 0x0E
+		sha256rnds2	STATE0, STATE1
+	sha256msg1	MSGTMP3, MSGTMP0
+
+	;; Rounds 20-23
+	movdqa		MSG, MSGTMP1
+		paddd		MSG, [SHA256CONSTANTS + 5*16]
+		sha256rnds2	STATE1, STATE0
+	movdqa		MSGTMP4, MSGTMP1
+	palignr		MSGTMP4, MSGTMP0, 4
+	paddd		MSGTMP2, MSGTMP4
+	sha256msg2	MSGTMP2, MSGTMP1
+		pshufd 		MSG, MSG, 0x0E
+		sha256rnds2	STATE0, STATE1
+	sha256msg1	MSGTMP0, MSGTMP1
+
+	;; Rounds 24-27
+	movdqa		MSG, MSGTMP2
+		paddd		MSG, [SHA256CONSTANTS + 6*16]
+		sha256rnds2	STATE1, STATE0
+	movdqa		MSGTMP4, MSGTMP2
+	palignr		MSGTMP4, MSGTMP1, 4
+	paddd		MSGTMP3, MSGTMP4
+	sha256msg2	MSGTMP3, MSGTMP2
+		pshufd 		MSG, MSG, 0x0E
+		sha256rnds2	STATE0, STATE1
+	sha256msg1	MSGTMP1, MSGTMP2
+
+	;; Rounds 28-31
+	movdqa		MSG, MSGTMP3
+		paddd		MSG, [SHA256CONSTANTS + 7*16]
+		sha256rnds2	STATE1, STATE0
+	movdqa		MSGTMP4, MSGTMP3
+	palignr		MSGTMP4, MSGTMP2, 4
+	paddd		MSGTMP0, MSGTMP4
+	sha256msg2	MSGTMP0, MSGTMP3
+		pshufd 		MSG, MSG, 0x0E
+		sha256rnds2	STATE0, STATE1
+	sha256msg1	MSGTMP2, MSGTMP3
+
+	;; Rounds 32-35
+	movdqa		MSG, MSGTMP0
+		paddd		MSG, [SHA256CONSTANTS + 8*16]
+		sha256rnds2	STATE1, STATE0
+	movdqa		MSGTMP4, MSGTMP0
+	palignr		MSGTMP4, MSGTMP3, 4
+	paddd		MSGTMP1, MSGTMP4
+	sha256msg2	MSGTMP1, MSGTMP0
+		pshufd 		MSG, MSG, 0x0E
+		sha256rnds2	STATE0, STATE1
+	sha256msg1	MSGTMP3, MSGTMP0
+
+	;; Rounds 36-39
+	movdqa		MSG, MSGTMP1
+		paddd		MSG, [SHA256CONSTANTS + 9*16]
+		sha256rnds2	STATE1, STATE0
+	movdqa		MSGTMP4, MSGTMP1
+	palignr		MSGTMP4, MSGTMP0, 4
+	paddd		MSGTMP2, MSGTMP4
+	sha256msg2	MSGTMP2, MSGTMP1
+		pshufd 		MSG, MSG, 0x0E
+		sha256rnds2	STATE0, STATE1
+	sha256msg1	MSGTMP0, MSGTMP1
+
+	;; Rounds 40-43
+	movdqa		MSG, MSGTMP2
+		paddd		MSG, [SHA256CONSTANTS + 10*16]
+		sha256rnds2	STATE1, STATE0
+	movdqa		MSGTMP4, MSGTMP2
+	palignr		MSGTMP4, MSGTMP1, 4
+	paddd		MSGTMP3, MSGTMP4
+	sha256msg2	MSGTMP3, MSGTMP2
+		pshufd 		MSG, MSG, 0x0E
+		sha256rnds2	STATE0, STATE1
+	sha256msg1	MSGTMP1, MSGTMP2
+
+	;; Rounds 44-47
+	movdqa		MSG, MSGTMP3
+		paddd		MSG, [SHA256CONSTANTS + 11*16]
+		sha256rnds2	STATE1, STATE0
+	movdqa		MSGTMP4, MSGTMP3
+	palignr		MSGTMP4, MSGTMP2, 4
+	paddd		MSGTMP0, MSGTMP4
+	sha256msg2	MSGTMP0, MSGTMP3
+		pshufd 		MSG, MSG, 0x0E
+		sha256rnds2	STATE0, STATE1
+	sha256msg1	MSGTMP2, MSGTMP3
+
+	;; Rounds 48-51
+	movdqa		MSG, MSGTMP0
+		paddd		MSG, [SHA256CONSTANTS + 12*16]
+		sha256rnds2	STATE1, STATE0
+	movdqa		MSGTMP4, MSGTMP0
+	palignr		MSGTMP4, MSGTMP3, 4
+	paddd		MSGTMP1, MSGTMP4
+	sha256msg2	MSGTMP1, MSGTMP0
+		pshufd 		MSG, MSG, 0x0E
+		sha256rnds2	STATE0, STATE1
+	sha256msg1	MSGTMP3, MSGTMP0
+
+	;; Rounds 52-55
+	movdqa		MSG, MSGTMP1
+		paddd		MSG, [SHA256CONSTANTS + 13*16]
+		sha256rnds2	STATE1, STATE0
+	movdqa		MSGTMP4, MSGTMP1
+	palignr		MSGTMP4, MSGTMP0, 4
+	paddd		MSGTMP2, MSGTMP4
+	sha256msg2	MSGTMP2, MSGTMP1
+		pshufd 		MSG, MSG, 0x0E
+		sha256rnds2	STATE0, STATE1
+
+	;; Rounds 56-59
+	movdqa		MSG, MSGTMP2
+		paddd		MSG, [SHA256CONSTANTS + 14*16]
+		sha256rnds2	STATE1, STATE0
+	movdqa		MSGTMP4, MSGTMP2
+	palignr		MSGTMP4, MSGTMP1, 4
+	paddd		MSGTMP3, MSGTMP4
+	sha256msg2	MSGTMP3, MSGTMP2
+		pshufd 		MSG, MSG, 0x0E
+		sha256rnds2	STATE0, STATE1
+
+	;; Rounds 60-63
+	movdqa		MSG, MSGTMP3
+		paddd		MSG, [SHA256CONSTANTS + 15*16]
+		sha256rnds2	STATE1, STATE0
+		pshufd 		MSG, MSG, 0x0E
+		sha256rnds2	STATE0, STATE1
+
+	;; Add current hash values with previously saved
+	paddd		STATE0, ABEF_SAVE
+	paddd		STATE1, CDGH_SAVE
+
+	;; Increment data pointer and loop if more to process
+	add		DATA_PTR, 64
+	cmp		DATA_PTR, NUM_BLKS
+	jne		.loop0
+
+	;; Write hash values back in the correct order
+	pshufd		STATE0,  STATE0,  0x1B	; FEBA
+	pshufd		STATE1,  STATE1,  0xB1	; DCHG
+	movdqa		MSGTMP4, STATE0
+	pblendw		STATE0,  STATE1,  0xF0	; DCBA
+	palignr		STATE1,  MSGTMP4, 8	; HGFE
+
+	movdqu		[DIGEST_PTR + 0*16], STATE0
+	movdqu		[DIGEST_PTR + 1*16], STATE1
+
+.done_hash:
+%ifdef WINABI
+	movdqa		xmm6,  [rsp + frame.xmm_save + 0*16]
+	movdqa		xmm7,  [rsp + frame.xmm_save + 1*16]
+	movdqa		xmm8,  [rsp + frame.xmm_save + 2*16]
+	movdqa		xmm9,  [rsp + frame.xmm_save + 3*16]
+	movdqa		xmm10, [rsp + frame.xmm_save + 4*16]
+	mov		rsp, RSPSAVE
+%endif
+
+	ret	
+	
+section .data
+align 64
+K256:
+	dd	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	dd	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	dd	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	dd	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	dd	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	dd	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	dd	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	dd	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	dd	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	dd	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	dd	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	dd	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	dd	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	dd	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	dd	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	dd	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203
+
diff --git a/sha256-x64shani/sha256_x64.c b/sha256-x64shani/sha256_x64.c
new file mode 100644
index 0000000000..01f3417719
--- /dev/null
+++ b/sha256-x64shani/sha256_x64.c
@@ -0,0 +1,82 @@
+#include "../git-compat-util.h"
+#include "sha256_x64.h"
+#include <byteswap.h>
+
+static int shani = -1;
+
+void shani_SHA256_Init(shani_SHA256_CTX *ctx)
+{
+	ctx->total_len = 0;
+	ctx->len = 0;
+	ctx->h[0] = 0x6a09e667ul;
+	ctx->h[1] = 0xbb67ae85ul;
+	ctx->h[2] = 0x3c6ef372ul;
+	ctx->h[3] = 0xa54ff53aul;
+	ctx->h[4] = 0x510e527ful;
+	ctx->h[5] = 0x9b05688cul;
+	ctx->h[6] = 0x1f83d9abul;
+	ctx->h[7] = 0x5be0cd19ul;
+
+    if (shani != 1) {
+        shani = 1;
+        printf("set shani to 1\n");
+    } else {
+        printf("shani is 1 already\n");
+    }
+}
+
+void sha256_update(uint32_t *digest, const void *data, uint32_t numBlocks);
+void shani_SHA256_Update(shani_SHA256_CTX *c, const void *data, size_t len)
+{
+    while (len > 0)
+    {
+        if (c->len == 0 && len >= 64)
+        {
+            sha256_update(c->h, data, len/64);
+            unsigned long bytes_hashed = (len/64)*64;
+            data += bytes_hashed;
+            len -= bytes_hashed;
+            c->total_len += bytes_hashed;
+            continue;
+        }
+#define MIN(x, y) (((x) < (y)) ? (x) : (y))
+        size_t cpy = MIN(64 - c->len, len);
+#undef MIN
+        memcpy(c->msgbuf + c->len, data, cpy);
+        data += cpy;
+        len -= cpy;
+        c->len += cpy;
+        c->total_len += cpy;
+
+        if (c->len == 64)
+        {
+            sha256_update(c->h, c->msgbuf, 1);
+            c->len = 0;
+        }
+    }
+}
+
+void shani_SHA256_Final(unsigned char *digest, shani_SHA256_CTX *c)
+{
+    c->msgbuf[c->len] = 0x80;
+    ++c->len;
+    memset(c->msgbuf + c->len, 0, 64 - c->len);
+    if (c->len > (64-8))
+    {
+        sha256_update(c->h, c->msgbuf, 1);
+        memset(c->msgbuf, 0, 64);
+        c->len = 0;
+    }
+
+    for (int ibyte = 0; ibyte < 8; ++ibyte)
+       c->msgbuf[63-ibyte] = (uint8_t)((c->total_len * 8ULL) >> (ibyte*8));
+
+    sha256_update(c->h, c->msgbuf, 1);
+    c->len = 0;
+    
+    // flip endianness
+    int i;
+    for (i = 0; i < 32; i += 4) {
+        *(uint32_t *)(digest + i) = __bswap_32(c->h[i / 4]);
+    }
+}
\ No newline at end of file
diff --git a/sha256-x64shani/sha256_x64.h b/sha256-x64shani/sha256_x64.h
new file mode 100644
index 0000000000..25f1ff67ba
--- /dev/null
+++ b/sha256-x64shani/sha256_x64.h
@@ -0,0 +1,23 @@
+/*
+ * SHA256 routine using x64 SHA-NI instructions
+ */
+
+#define SHANI_SHA256_BLKSIZE 64
+
+struct shani_SHA256_CTX {
+    uint32_t h[8];
+    int len;
+    uint64_t total_len;
+    char msgbuf[SHANI_SHA256_BLKSIZE];
+};
+
+typedef struct shani_SHA256_CTX shani_SHA256_CTX;
+
+void shani_SHA256_Init(shani_SHA256_CTX *ctx);
+void shani_SHA256_Update(shani_SHA256_CTX *c, const void *data, size_t len);
+void shani_SHA256_Final(unsigned char *digest, shani_SHA256_CTX *c);
+
+#define platform_SHA256_CTX	    shani_SHA256_CTX
+#define platform_SHA256_Init	shani_SHA256_Init
+#define platform_SHA256_Update	shani_SHA256_Update
+#define platform_SHA256_Final	shani_SHA256_Final
\ No newline at end of file
-- 
2.41.0



