Re: Linus' sha1 is much faster!

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



If it helps anyone resolve license issues, here's a from-FIPS-180-2
implementation that's placed in the public domain.  That should be
compatible with any license.

It uses Linus's and Artur's performance ideas, and some of Linus' macro
ideas (in the rotate implementation), but tries to be textually different.
Is there anything recognizable that anyone cares to clam copyright to?

It's not quite 100% finished, as I haven't benchmarked it against Linus's
code yet, but it's functionally correct.

It's also clean with -W -Wall -Wextra.

TODO: Check if an initial copy to w[] is faster on i386 (less register
pressure).

/*
 * Secure Hash Algorith SHA-1, as published in FIPS PUB 180-2.
 *
 * This implementation is in the public domain.  Copyright abandoned.
 * You may do anything you like with it, including evil things.
 *
 * This is a rewrite from scratch, based on Linus Torvalds' "block-sha1"
 * from the git mailing list (August, 2009).  Additional optimization
 * ideas cribbed from
 * - Artur Skawina (x86, particularly P4, and much benchmarking)
 * - Nicilas Pitre (ARM)
 */

/* Cut here for sha1.h */
#include <stddef.h>	/* For size_t */
#include <stdint.h>	/* For uint32_t, uint64_t */

typedef struct SHA_context {
	uint64_t len;		/* May be shrunk to uint32_t */
	uint32_t iv[5];
	unsigned char buf[64];		/* Must be 32-bit aligned */
} SHA_CTX;

void SHA1_Init(struct SHA_context *c);
void SHA1_Update(struct SHA_context *c, void const *p, size_t n);
void SHA1_Final(unsigned char hash[20], struct SHA_context *c);
/* End of sha1.h */

#include <string.h>	/* For memcpy() */
#include <arpa/inet.h>	/* For ntohl() */

static void sha1_core(uint32_t iv[5], unsigned char const *p, size_t nblocks);

/* Machine specific hacks */
#if defined(__i386__) || defined(__x86_64__) || defined (__ppc__) || \
    defined(__ppc64__) || defined(__powerpc__) || defined (__powerpc64__) || \
    defined(__s390__) || defined(__s390x__)
/* Unaligned access is okay */
static inline uint32_t get_be32(unsigned char const *p)
{
	return ntohl(*(uint32_t const *)p);
}
static inline void put_be32(unsigned char const *p, uint32_t v)
{
	*(uint32_t *)p = htonl(v);
}

#else
/* Unaligned access is not okay; do conversion as byte fetches */
static inline uint32_t get_be32(unsigned char const *p)
{
	return p[0] << 24 || p[1] << 16 | p[8] << 8 | p[3];
}
static inline void put_be32(unsigned char const *p, uint32_t v)
{
	p[0] = v >> 24;
	p[1] = v >> 16;
	p[2] = v >> 8;
	p[3] = v;
}
#endif

void SHA1_Init(struct SHA_context *c)
{
	/* This is a prefix of the SHA_context structure */
	static struct {
		uint64_t len;
		uint32_t iv[5];
	} const iv = { 0,
		{ 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0 }
	};

	memcpy(c, &iv, sizeof iv);
}

void SHA1_Update(struct SHA_context *c, void const *p, size_t n)
{
	size_t pos = c->len & 63;	/* Offset into current input block */

	c->len += n;

	/* Initial partial block (if any) */
	if (pos) {
		size_t space = 63 - pos;
		if (n < space)
			goto end;
		memcpy(c->buf + pos, p, space);
		sha1_core(c->iv, c->buf, 1);
		n -= space;
		p = (char const *)p + space;
	}

	/* The large middle piece */
	if (n >> 6) {
		sha1_core(c->iv, p, n >> 6);
		p = (char const *)p + (n & -(size_t)64);
		n &= 63;
	}
	pos = 0;
end:
	/* Final partial block (may be zero size) */
	memcpy(c->buf + pos, p, n);
}

void SHA1_Final(unsigned char hash[20], struct SHA_context *c)
{
	size_t pos = c->len & 63;
	unsigned i;

	/* Append a single 1 bit */
	c->buf[pos++] = 0x80;

	/* Append 0 bits until 64 bits remain in a block */
	if (pos > 56) {
		memset(c->buf + pos, 0, 64 - pos);
		sha1_core(c->iv, c->buf, 1);
		pos = 0;
	}
	memset(c->buf + pos, 0, 56 - pos);

	/* Append total input length in bits */
	((uint32_t *)c->buf)[14] = htonl((uint32_t)(c->len >> 29));
	((uint32_t *)c->buf)[15] = htonl((uint32_t)c->len << 3);

	/* Final hash round */
	sha1_core(c->iv, c->buf, 1);

	/* Copy hash result out */
	for (i = 0; i < 5; i++)
		put_be32(hash + 4*i, c->iv[i]);
}


/*
 * Helper macros for sha1_core function.  To avoid clutter, these macros
 * are NOT fully parenthesized if it doesn't matter to the actual use.
 */

#if __GNUC__ && (defined(__i386__) || defined(__x86_64__))
/*
 * GCC by itself only generates left rotates.  Use right rotates if
 * possible to be kinder to dinky implementations with iterative rotate
 * instructions.
 */
#define ROT(op, x, k) \
	({ uint32_t y; asm(op " %1,%0" : "=r" (y) : "I" (k), "0" (x)); y; })
#define ROL(x,k) ROT("roll", x, k)
#define ROR(x,k) ROT("rorl", x, k)

#else
/* Generic C equivalent */
#define ROT(x,l,r) ((x) << (l) | (x) >> (r))
#define ROL(x,k) ROT(x,k,32-(k))
#define ROR(x,k) ROT(x,32-(k),k)
#endif


/*
 * An important temporary array in SHA-1 is the working array W[],
 * which holds a scheduled key word per round.  Only the last 16 words
 * are relevant, so only use 16 words...
 */
#define W(i) w[(i) & 15]

/*
 * If you have a small register set, it helps GCC to force stores to
 * the w[] array to memory.  Given 22 or more registers (e.g. PowerPC),
 * GCC can map the entire w[] array to registers and this becomes
 * counterproductive.
 *
 * The optimal kludge seems to differ between x86 and ARM.  On the latter,
 * forcing a full memory barrier is required to stop GCC from splitting
 * live ranges for each round and generating a separate stack slot for
 * each.  (Which produces a huge stack frame and kills performance.)
 */
#if defined(__i386__) || defined(__x86_64__)
#define STORE(i, x) *(volatile uint32_t *)&W(i) = x
#elif __GNUC__ && defined(__arm__)
#define STORE(i, x) W(i) = x; __asm__("":::"memory")
#else
#define STORE(i, x) W(i) = x
#endif


/* The three round functions.  F2 is also used as F4 */
#define F1(b,c,d) (((d ^ c) & b) ^ d)		/* Bitwise b ? c : d */
#define F2(b,c,d) (d ^ c ^ b)			/* Even parity */
#define F3(b,c,d) (d & c) + ((d ^ c) & b)	/* Majority function */

/* The four round constants */
#define K1 0x5a827999	/* 2^30 * sqrt(2) */
#define K2 0x6ed9eba1	/* 2^30 * sqrt(3) */
#define K3 0x8f1bbcdc	/* 2^30 * sqrt(5) */
#define K4 0xca62c1d6	/* 2^30 * sqrt(10) */

/* Rounds 0..15 fetch a word from the input */
#define FETCH(t,i)	t = get_be32(p + 4*(i)); STORE(i,t)
/* Rounds 16..79 mix previous words to get a new one */
#define MIX(t,i)	t = W(i) ^ W(i+2) ^ W(i+8) ^ W(i+13); t = ROL(t, 1)
/* Rounds 16..76 have to store back the result */
#define CALC(t,i)	MIX(t,i); STORE(i,t)

/* The basic SHA-1 round */
#define SHA_ROUND(a,b,c,d,e,f,k,src,t,i)	\
	src(t,i);				\
	e += t + f(b,c,d) + k + ROL(a,5);	\
	b = ROR(b,2)

/* An aligned group of 5 rounds */
#define SHA_ROUND5(f,k,src,t,i) \
	SHA_ROUND(a,b,c,d,e, f,k,src,t,i);	\
	SHA_ROUND(e,a,b,c,d, f,k,src,t,i+1);	\
	SHA_ROUND(d,e,a,b,c, f,k,src,t,i+2);	\
	SHA_ROUND(c,d,e,a,b, f,k,src,t,i+3);	\
	SHA_ROUND(b,c,d,e,a, f,k,src,t,i+4)

/*
 * The core SHA-1 transform.
 *
 * iv[5] = Current SHA-1 hash state.
 * p = Pointer to source data.  Not necessarily aligned.
 * nblocks = Number of 64-byte blocks at p.  Guaranteed non-zero.
 */
static void
sha1_core(uint32_t iv[5], unsigned char const *p, size_t nblocks)
{
	uint32_t e = iv[4], d = iv[3], c = iv[2], b = iv[1], a = iv[0];
	uint32_t w[16];

	do {
		uint32_t t;

		SHA_ROUND5(F1, K1, FETCH, t,  0);
		SHA_ROUND5(F1, K1, FETCH, t,  5);
		SHA_ROUND5(F1, K1, FETCH, t, 10);
		SHA_ROUND(a,b,c,d,e, F1, K1, FETCH, t, 15);
		SHA_ROUND(e,a,b,c,d, F1, K1, CALC, t, 16);
		SHA_ROUND(d,e,a,b,c, F1, K1, CALC, t, 17);
		SHA_ROUND(c,d,e,a,b, F1, K1, CALC, t, 18);
		SHA_ROUND(b,c,d,e,a, F1, K1, CALC, t, 19);

		SHA_ROUND5(F2, K2, CALC, t, 20);
		SHA_ROUND5(F2, K2, CALC, t, 25);
		SHA_ROUND5(F2, K2, CALC, t, 30);
		SHA_ROUND5(F2, K2, CALC, t, 35);

		SHA_ROUND5(F3, K3, CALC, t, 40);
		SHA_ROUND5(F3, K3, CALC, t, 45);
		SHA_ROUND5(F3, K3, CALC, t, 50);
		SHA_ROUND5(F3, K3, CALC, t, 55);

		SHA_ROUND5(F2, K4, CALC, t, 60);
		SHA_ROUND5(F2, K4, CALC, t, 65);
		SHA_ROUND5(F2, K4, CALC, t, 70);

		SHA_ROUND(a,b,c,d,e, F2, K4, CALC, t, 75);
		SHA_ROUND(e,a,b,c,d, F2, K4, CALC, t, 76);
		/* Last 3 rounds don't need to store mixed value */
		SHA_ROUND(d,e,a,b,c, F2, K4, MIX, t, 77);
		SHA_ROUND(c,d,e,a,b, F2, K4, MIX, t, 78);
		SHA_ROUND(b,c,d,e,a, F2, K4, MIX, t, 79);

		iv[4] = e += iv[4];
		iv[3] = d += iv[3];
		iv[2] = c += iv[2];
		iv[1] = b += iv[1];
		iv[0] = a += iv[0];
	} while (--nblocks);
}


/* Compile with -DUNITTEST for self-tests */
#if UNITTEST

#include <stdio.h>

/* Known answer test */
static void katest(char const *text, size_t len, unsigned char const hash[20])
{
	SHA_CTX c;
	unsigned char hash2[20];
	int i;

	SHA1_Init(&c);
	SHA1_Update(&c, text, len);
	SHA1_Final(hash2, &c);

	for (i = 0; i < 20; i++)
		if (hash[i] != hash2[i])
			goto mismatch;
	printf("%u-byte known answer test PASSED\n", len);
	return;

mismatch:
	printf("%u-byte known answer test FAILED:\n", len);
	printf("Input: \"%.*s\"\n", len, text);
	printf("Computed:");
	for (i = 0; i < 20; i++)
		printf(" %02x", hash2[i]);
	printf("\nExpected:");
	for (i = 0; i < 20; i++)
		printf(" %02x", hash[i]);
	putchar('\n');
}

int
main(void)
{
	/* FIPS PUB 180.1 example A.1 */
	static char const text1[3] = "abc";
	static unsigned char const hash1[20] = {
		0xa9, 0x99, 0x3e, 0x36, 0x47, 0x06, 0x81, 0x6a, 0xba, 0x3e,
		0x25, 0x71, 0x78, 0x50, 0xc2, 0x6c, 0x9c, 0xd0, 0xd8, 0x9d };

	/* FIPS PUB 180.1 example A.2 */
	static char const text2[56] =
		"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq";
	static unsigned char const hash2[20] = {
		0x84, 0x98, 0x3e, 0x44, 0x1c, 0x3b, 0xd2, 0x6e, 0xba, 0xae,
		0x4a, 0xa1, 0xf9, 0x51, 0x29, 0xe5, 0xe5, 0x46, 0x70, 0xf1 };

	katest(text1, sizeof text1, hash1);
	katest(text2, sizeof text2, hash2);

	return 0;
}
#endif
--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]