Hi folks,
This patch adds a proof-of-concept implementation of streaming SHA1
calculation in sha1_file.c, as demoed with `git hash-object <large
input file>`. Instead of the command's memory footprint being equal to
the input file's size, this caps it at SHA1_CHUNK_SIZE (currently 64MB).
Capping memory use this easily seems like a win, but then all this
code does is stream-calculate a SHA1 and print it to stdout. There
seem to be a lot of disparate places throughout the codebase where
objects have their SHA1 calculated.
Then again, I presume most of these are working with blobs and not
entire files, and hence wouldn't require streaming anyway. (I'm
assuming blobs don't grow large enough to warrant it - is that
necessarily true?)
The memory usage can be verified by running

    while true; do ps aux | grep hash-object | grep -v grep; sleep 0.2; done

and then running `git hash-object <large input file>` in a second
terminal. The memory use stays at or below SHA1_CHUNK_SIZE until the
streamed hash is printed on the terminal and the non-streamed hash is
subsequently calculated.
On my machine, the original implementation hashed a 700MB file in
5.8sec. My patch does it in 6.2sec with SHA1_CHUNK_SIZE set to 64MB.
Cheers
Ben Hoskings
---
sha1_file.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 47 insertions(+), 0 deletions(-)
diff --git a/sha1_file.c b/sha1_file.c
index 5b6e0f6..59f0adb 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -33,6 +33,8 @@ static unsigned long sz_fmt(size_t s) { return
(unsigned long)s; }
static size_t sz_fmt(size_t s) { return s; }
#endif
+/* Largest piece of a file that is mapped at once while stream-hashing it. */
+#define SHA1_CHUNK_SIZE ((size_t)(1024*1024*64))
+
const unsigned char null_sha1[20];
const signed char hexval_table[256] = {
@@ -2242,6 +2244,39 @@ static void write_sha1_file_prepare(const void
*buf, unsigned long len,
git_SHA1_Final(sha1, &c);
}
+/*
+ * Feed one chunk of the file behind fd into the running SHA-1 context:
+ * map len bytes starting at offset, hash them, and unmap them again so
+ * that the mapping does not outlive this call.
+ *
+ * offset is handed straight to xmmap(); the caller is expected to keep it
+ * suitably aligned for mmap. The incoming value of buf is ignored - the
+ * parameter is only used as local scratch for the mapping.
+ *
+ * Declared static: a plain "inline" definition with external linkage does
+ * not provide an out-of-line body under C99 rules, which can fail at link
+ * time when the compiler chooses not to inline the call.
+ */
+static inline void write_sha1_fd_process_chunk(int fd, unsigned long len,
+					       unsigned long offset,
+					       git_SHA_CTX *c,
+					       void *buf)
+{
+	buf = xmmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, offset);
+	git_SHA1_Update(c, buf, len);
+	munmap(buf, len);
+}
+
+/*
+ * Compute the SHA-1 of an object of the given type whose len bytes of
+ * content are read from fd, mapping at most SHA1_CHUNK_SIZE bytes at a
+ * time instead of the whole file.
+ *
+ * The object header ("<type> <len>" plus its terminating NUL) is written
+ * to hdr and its length, NUL included, is stored in *hdrlen; hdr must be
+ * large enough to hold it. The resulting digest is stored in sha1.
+ */
+static void write_sha1_fd_prepare(int fd, unsigned long len,
+				  const char *type, unsigned char *sha1,
+				  char *hdr, int *hdrlen)
+{
+	git_SHA_CTX c;
+	unsigned long offset = 0;
+	unsigned long remaining = len;
+
+	/* The header is hashed first, including its trailing NUL (hence +1). */
+	*hdrlen = sprintf(hdr, "%s %lu", type, len)+1;
+
+	git_SHA1_Init(&c);
+	git_SHA1_Update(&c, hdr, *hdrlen);
+
+	/*
+	 * Count down the bytes still to hash rather than testing
+	 * "offset + SHA1_CHUNK_SIZE <= len": that sum can wrap around when
+	 * len is within one chunk of ULONG_MAX, which would map past the end
+	 * of the file and never terminate.
+	 */
+	while (remaining) {
+		unsigned long chunk = SHA1_CHUNK_SIZE;
+
+		/* The final chunk may be shorter than a full one. */
+		if (remaining < chunk)
+			chunk = remaining;
+
+		/* The helper ignores the incoming buffer pointer. */
+		write_sha1_fd_process_chunk(fd, chunk, offset, &c, NULL);
+		offset += chunk;
+		remaining -= chunk;
+	}
+
+	git_SHA1_Final(sha1, &c);
+}
+
/*
* Move the just written object into its final resting place
*/
@@ -2294,6 +2329,15 @@ int hash_sha1_file(const void *buf, unsigned
long len, const char *type,
return 0;
}
+/*
+ * Hash the len bytes readable through fd as an object of the given type
+ * and store the resulting SHA-1 in sha1. The work is delegated to
+ * write_sha1_fd_prepare(); the header it builds is discarded.
+ * Always returns 0.
+ */
+int hash_sha1_fd(int fd, unsigned long len, const char *type,
+		 unsigned char *sha1)
+{
+	int header_len;
+	char header[32];
+
+	write_sha1_fd_prepare(fd, len, type, sha1, header, &header_len);
+	return 0;
+}
+
/* Finalize a file on disk, and close it. */
static void close_sha1_file(int fd)
{
@@ -2523,6 +2567,9 @@ int index_fd(unsigned char *sha1, int fd, struct
stat *st, int write_object,
ret = -1;
strbuf_release(&sbuf);
} else if (size) {
+ hash_sha1_fd(fd, size, typename(type), sha1);
+ printf("%s <- chunked hash\n", sha1_to_hex(sha1));
+
void *buf = xmmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
ret = index_mem(sha1, buf, size, write_object, type, path);
munmap(buf, size);
--
1.6.1.2
--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html