On Sunday 30 May 2010, Johannes Sixt wrote: > On Sonntag, 30. Mai 2010, Johan Herland wrote: > > +cat >expect_initial <<EOF > > +100644 blob 51d2738463ea4ca66f8691c91e33ce64b7d41bb1 foo > > +EOF > > + > > +cat >expect_update <<EOF > > +100644 blob 51d2738efb4ad8a1e40bed839ab8e116f0a15e47 foo > > +EOF > > + > > +test_expect_success 'setup' ' > > + echo 4827 > foo && > > ... > > > + echo 11742 > foo && > > How the fscking hell did you find these two simple values that are an > almost-SHA1-collision? It's easier to hit the jackpot!?! Finding matches in the intial 7 hex chars (== 28 bits) of a SHA1 isn't _that_ hard. In this case, I used the attached hack/program[1], which found the above values in <0.9 seconds, albeit you need ~1 GB free RAM to run it[2]. The program simply increments a 32-bit integer, and produces simple (but unique) blob objects (merely containing the 32-bit integer in decimal format) for each integer, then hashes that blob object and stores the original integer in a reverse dictionary (actually, a 2^28-entry array), keyed/indexed by the first 28 bits of the hash. Then, repeat until we find an integer whose blob hashes to the same location as a previous integer. Since we're using a 32-bit integer to generate inputs to a 2^28-entry array, we're bound to find collisions long before the integer overflows. FTR, there are already several almost-collisions in real-world repos. I first found some in repos at my $DAYJOB, but there are also multiple cases in the git.git repo. The following command will list all 7-char ambiguities in a packfile: git verify-pack -v $pack.idx | cut -c1-7 | uniq -d Have fun! :) ...Johan [1]: Put the attached file in your git.git checkout and compile it with: gcc -o find_sha_dup find_sha_dup.c block-sha1/sha1.c [2]: Double that for each additional bit you want to crack. I.e. cracking the first 8 hex chars (== 32 bits) would require ~16 GB free RAM. I'm sure there are more efficient ways of doing these things... -- Johan Herland, <johan@xxxxxxxxxxx> www.herland.net
#include <stdint.h> #include <stdlib.h> #include <string.h> #include <stdio.h> #include "block-sha1/sha1.h" /* Find SHA1 collisions within the first 7 hex chars (== 28 bits) */ #define num_entries (1 << 28) /* Need room for 2 ^ 28 entries */ uint32_t a[num_entries]; const char *str (uint32_t x, uint32_t *len) { /* Generate unique, short, readable string from integer */ #define buf_len 15 static char buf[buf_len]; int l; l = snprintf(buf, buf_len, "%u\n", x); if (l >= buf_len || l < 0) { printf("FAIL! l == %u\n", l); exit(1); } if (len) *len = l; return buf; } const unsigned char *sha1 (const char *s, size_t len) { static blk_SHA_CTX sha1_ctx; static unsigned char sha1_result[20]; char hdr[32]; int hdrlen; /* Make it look like a Git blob object */ hdrlen = sprintf(hdr, "blob %lu", len) + 1; blk_SHA1_Init(&sha1_ctx); blk_SHA1_Update(&sha1_ctx, hdr, hdrlen); blk_SHA1_Update(&sha1_ctx, s, len); blk_SHA1_Final(sha1_result, &sha1_ctx); return sha1_result; } const unsigned char *sha1_x (uint32_t x) { uint32_t len = 0; const char *s = str(x, &len); return sha1(s, len); } uint32_t a_index (const unsigned char *h) { uint32_t ret = (h[0] << 20) | (h[1] << 12) | (h[2] << 4) | (h[3] >> 4); return ret; } char *sha1_to_hex(const unsigned char *sha1) { static char buffer[41]; static const char hex[] = "0123456789abcdef"; int i; char *buf = buffer; for (i = 0; i < 20; i++) { unsigned int val = *sha1++; *buf++ = hex[val >> 4]; *buf++ = hex[val & 0xf]; } *buf = '\0'; return buffer; } int main (void) { uint32_t x = 0, i; memset(a, 0xff, num_entries * sizeof(uint32_t)); while (x != 0xffffffff) { i = a_index(sha1_x(x)); if (a[i] == 0xffffffff) { a[i] = x++; continue; } /* Found collision! */ uint32_t y = a[i]; printf("Found collision between entries %u and %u\n", y, x); printf("\t %u == '%s' => %s\n", y, str(y, NULL), sha1_to_hex(sha1_x(y))); printf("\t %u == '%s' => %s\n", x, str(x, NULL), sha1_to_hex(sha1_x(x))); return 1; } return 0; }