git usually streams large blobs directly to packs. But there are cases
where git can create large loose blobs (unpack-objects, or hash-object
fed over a pipe), or they can come from other git implementations.
core.bigFileThreshold can also be lowered, introducing a new wave of
large loose blobs. Use the streaming interface to read these blobs,
compressing and writing at the same time.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@xxxxxxxxx>
---
index-pack's streaming support is on the way. unpack-objects is another
story because I'm thinking of merging it back to index-pack first,
which may take more than one release cycle.

 builtin/pack-objects.c |   73 ++++++++++++++++++++++++++++++++++++++++++++----
 t/t1050-large.sh       |   16 ++++++++++++++++
 2 files changed, 83 insertions(+), 6 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 1861093..98b51c1 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -16,6 +16,7 @@
 #include "list-objects.h"
 #include "progress.h"
 #include "refs.h"
+#include "streaming.h"
 #include "thread-utils.h"
 
 static const char *pack_usage[] = {
@@ -150,6 +151,55 @@ static unsigned long do_compress(void **pptr, unsigned long size)
 	return stream.total_out;
 }
 
+static void write_large_blob_data(struct sha1file *f, const unsigned char *sha1)
+{
+	git_zstream stream;
+	unsigned char ibuf[1024 * 16];
+	unsigned char obuf[1024 * 16];
+	int zret;
+
+	struct git_istream *st;
+	enum object_type type;
+	unsigned long sz;
+
+	st = open_istream(sha1, &type, &sz, NULL);
+	if (!st)
+		die(_("failed to read %s"), sha1_to_hex(sha1));
+
+	memset(&stream, 0, sizeof(stream));
+	git_deflate_init(&stream, pack_compression_level);
+
+	if (type != OBJ_BLOB)
+		die("BUG: %s is not a blob", sha1_to_hex(sha1));
+
+	for (;;) {
+		ssize_t readlen;
+		readlen = read_istream(st, ibuf, sizeof(ibuf));
+		if (readlen == -1)
+			die(_("failed to read %s"), sha1_to_hex(sha1));
+
+		stream.next_in = ibuf;
+		stream.avail_in = readlen;
+		zret = Z_OK;
+		while ((stream.avail_in || readlen == 0) &&
+		       (zret == Z_OK || zret == Z_BUF_ERROR)) {
+			stream.next_out = obuf;
+			stream.avail_out = sizeof(obuf);
+			zret = git_deflate(&stream, readlen ? 0 : Z_FINISH);
+			sha1write(f, obuf, stream.next_out - obuf);
+		}
+		if (stream.avail_in)
+			die(_("deflate error (%d)"), zret);
+		if (readlen == 0) {
+			if (zret != Z_STREAM_END)
+				die(_("deflate error (%d)"), zret);
+			break;
+		}
+	}
+	close_istream(st);
+	git_deflate_end(&stream);
+}
+
 /*
  * we are going to reuse the existing object data as is. make
  * sure it is not corrupt.
@@ -259,9 +309,14 @@ static unsigned long write_object(struct sha1file *f,
 	if (!to_reuse) {
 		no_reuse:
 		if (!usable_delta) {
-			buf = read_sha1_file(entry->idx.sha1, &type, &size);
-			if (!buf)
-				die("unable to read %s", sha1_to_hex(entry->idx.sha1));
+			type = sha1_object_info(entry->idx.sha1, &size);
+			if (type == OBJ_BLOB && size > big_file_threshold)
+				buf = NULL;
+			else {
+				buf = read_sha1_file(entry->idx.sha1, &type, &size);
+				if (!buf)
+					die("unable to read %s", sha1_to_hex(entry->idx.sha1));
+			}
 			/*
 			 * make sure no cached delta data remains from a
 			 * previous attempt before a pack split occurred.
@@ -284,8 +339,11 @@ static unsigned long write_object(struct sha1file *f,
 
 		if (entry->z_delta_size)
 			datalen = entry->z_delta_size;
-		else
+		else if (buf)
 			datalen = do_compress(&buf, size);
+		else
+			/* large blob case, just assume we don't compress well */
+			datalen = size;
 
 		/*
 		 * The object header is a byte of 'type' followed by zero or
@@ -330,8 +388,11 @@ static unsigned long write_object(struct sha1file *f,
 			}
 			sha1write(f, header, hdrlen);
 		}
-		sha1write(f, buf, datalen);
-		free(buf);
+		if (buf) {
+			sha1write(f, buf, datalen);
+			free(buf);
+		} else
+			write_large_blob_data(f, entry->idx.sha1);
 	}
 	else {
 		struct packed_git *p = entry->in_pack;
diff --git a/t/t1050-large.sh b/t/t1050-large.sh
index 55ed955..7fbd2e1 100755
--- a/t/t1050-large.sh
+++ b/t/t1050-large.sh
@@ -134,6 +134,22 @@ test_expect_success 'repack' '
 	git repack -ad
 '
 
+test_expect_success 'pack-objects with large loose object' '
+	echo Z | dd of=large4 bs=1k seek=2000 &&
+	OBJ=9f36d94e145816ec642592c09cc8e601d83af157 &&
+	P=.git/objects/9f/36d94e145816ec642592c09cc8e601d83af157 &&
+	(
+		unset GIT_ALLOC_LIMIT &&
+		cat large4 | git hash-object -w --stdin &&
+		git cat-file blob $OBJ >actual &&
+		cmp large4 actual
+	) &&
+	echo $OBJ | git pack-objects .git/objects/pack/pack &&
+	rm $P &&
+	git cat-file blob $OBJ >actual &&
+	cmp large4 actual
+'
+
 test_expect_success 'tar achiving' '
 	git archive --format=tar HEAD >/dev/null
 '
-- 
1.7.8.36.g69ee2
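
P.S. For anyone who wants to experiment with the chunked
read/deflate/write loop outside of git: below is a minimal standalone
sketch of the same pattern as write_large_blob_data(), written against
plain zlib and stdio instead of git's git_istream/sha1file wrappers.
The 16k buffers mirror the patch; the file name stream-deflate.c, the
default compression level and the exit()-based error handling are
illustrative assumptions, not git code. Build with
`cc -o stream-deflate stream-deflate.c -lz`; it deflates stdin to
stdout.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <zlib.h>

/*
 * Read from 'in' in fixed-size chunks and feed each chunk to the
 * deflater, flushing compressed output to 'out' as it is produced.
 * Memory use stays bounded by the two 16k buffers no matter how
 * large the input is.
 */
static void stream_deflate(FILE *in, FILE *out)
{
	z_stream stream;
	unsigned char ibuf[1024 * 16];
	unsigned char obuf[1024 * 16];
	int zret;

	memset(&stream, 0, sizeof(stream));
	if (deflateInit(&stream, Z_DEFAULT_COMPRESSION) != Z_OK)
		exit(1);

	for (;;) {
		/* a short read (or zero) signals the final chunk */
		size_t readlen = fread(ibuf, 1, sizeof(ibuf), in);
		if (ferror(in))
			exit(1);

		stream.next_in = ibuf;
		stream.avail_in = readlen;
		zret = Z_OK;
		/*
		 * Drain the deflater: keep going while input remains,
		 * or, on the final (empty) chunk, until Z_FINISH has
		 * emitted everything and returns Z_STREAM_END.
		 */
		while ((stream.avail_in || readlen == 0) &&
		       (zret == Z_OK || zret == Z_BUF_ERROR)) {
			stream.next_out = obuf;
			stream.avail_out = sizeof(obuf);
			zret = deflate(&stream, readlen ? Z_NO_FLUSH : Z_FINISH);
			fwrite(obuf, 1, stream.next_out - obuf, out);
		}
		if (readlen == 0) {
			if (zret != Z_STREAM_END)
				exit(1);
			break;
		}
	}
	deflateEnd(&stream);
}

int main(void)
{
	stream_deflate(stdin, stdout);
	return 0;
}

This shape is what lets the patch skip read_sha1_file() for blobs over
core.bigFileThreshold: nothing ever holds the whole blob in core, at
the cost of writing the blob undeltified and assuming datalen = size
(i.e. that it won't compress well) up front.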