unpack_raw_entry() will not allocate and return decompressed blobs if
they are larger than core.bigFileThreshold. The blob content is needed
by sha1_object() in some cases. When we do need the blob content, we
put it back in core with get_data_from_pack(). In practice, however,
we rarely need it.

The first case is when we find an in-repo blob with the same SHA-1
and have to do a byte-by-byte collision test. Normally (e.g. in
fetch/pull/clone) this does not happen, because git avoids sending
objects that the client already has.

The other case is when --strict is specified and the object in
question is not a blob, which cannot happen in reality because we
only deal with large _blobs_ here.

Note: running --verify (or git-verify-pack) on a pack from the
current repository will trigger the collision test on every object in
the pack, which effectively disables this patch. That can easily be
worked around by setting GIT_DIR to an imaginary place with no packs.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@xxxxxxxxx>
---
 builtin/index-pack.c | 52 ++++++++++++++++++++++++++++++++++++++++++-------
 t/t1050-large.sh     |  5 ++++
 2 files changed, 49 insertions(+), 8 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index ccb0214..1b790df 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -392,9 +392,10 @@ static int is_delta_type(enum object_type type)
 static void *unpack_entry_data(unsigned long offset, unsigned long size,
 			       enum object_type type, unsigned char *sha1)
 {
+	static char fixed_buf[8192];
 	int status;
 	git_zstream stream;
-	void *buf = xmalloc(size);
+	void *buf;
 	git_SHA_CTX c;
 	char hdr[32];
 	int hdrlen;
@@ -406,11 +407,15 @@ static void *unpack_entry_data(unsigned long offset, unsigned long size,
 		git_SHA1_Update(&c, hdr, hdrlen);
 	} else
 		sha1 = NULL;
+	if (type == OBJ_BLOB && size > big_file_threshold)
+		buf = fixed_buf;
+	else
+		buf = xmalloc(size);
 
 	memset(&stream, 0, sizeof(stream));
 	git_inflate_init(&stream);
 	stream.next_out = buf;
-	stream.avail_out = size;
+	stream.avail_out = buf == fixed_buf ? sizeof(fixed_buf) : size;
 	last_out = buf;
 
 	do {
@@ -420,6 +425,10 @@ static void *unpack_entry_data(unsigned long offset, unsigned long size,
 		use(input_len - stream.avail_in);
 		if (sha1)
 			git_SHA1_Update(&c, last_out, stream.next_out - last_out);
+		if (buf == fixed_buf) {
+			stream.next_out = buf;
+			stream.avail_out = sizeof(fixed_buf);
+		}
 		last_out = stream.next_out;
 	} while (status == Z_OK);
 	if (stream.total_out != size || status != Z_STREAM_END)
@@ -427,7 +436,7 @@
 	git_inflate_end(&stream);
 	if (sha1)
 		git_SHA1_Final(sha1, &c);
-	return buf;
+	return buf == fixed_buf ? NULL : buf;
 }
 
 static void *unpack_raw_entry(struct object_entry *obj,
@@ -593,14 +602,21 @@ static void find_delta_children(const union delta_base *base,
 	*last_index = last;
 }
 
-static void sha1_object(const void *data, unsigned long size,
-			enum object_type type, const unsigned char *sha1)
+static void sha1_object(const void *data, struct object_entry *obj_entry,
+			unsigned long size, enum object_type type,
+			const unsigned char *sha1)
 {
+	void *new_data = NULL;
+
+	assert(data || obj_entry);
+
 	read_lock();
 	if (has_sha1_file(sha1)) {
 		void *has_data;
 		enum object_type has_type;
 		unsigned long has_size;
+		if (!data)
+			data = new_data = get_data_from_pack(obj_entry);
 		has_data = read_sha1_file(sha1, &has_type, &has_size);
 		read_unlock();
 		if (!has_data)
@@ -625,6 +641,9 @@ static void sha1_object(const void *data, unsigned long size,
 		int eaten;
 		void *buf = (void *) data;
 
+		if (!buf)
+			buf = new_data = get_data_from_pack(obj_entry);
+
 		/*
 		 * we do not need to free the memory here, as the
 		 * buf is deleted by the caller.
@@ -649,6 +668,8 @@ static void sha1_object(const void *data, unsigned long size,
 		}
 		read_unlock();
 	}
+
+	free(new_data);
 }
 
 /*
@@ -732,7 +753,7 @@
 		bad_object(delta_obj->idx.offset, _("failed to apply delta"));
 	hash_sha1_file(result->data, result->size,
 		       typename(delta_obj->real_type), delta_obj->idx.sha1);
-	sha1_object(result->data, result->size, delta_obj->real_type,
+	sha1_object(result->data, NULL, result->size, delta_obj->real_type,
 		    delta_obj->idx.sha1);
 	counter_lock();
 	nr_resolved_deltas++;
@@ -862,7 +883,7 @@ static void *threaded_second_pass(void *data)
  */
 static void parse_pack_objects(unsigned char *sha1)
 {
-	int i;
+	int i, nr_delays = 0;
 	struct delta_entry *delta = deltas;
 	struct stat st;
 
@@ -878,8 +899,12 @@ static void parse_pack_objects(unsigned char *sha1)
 			nr_deltas++;
 			delta->obj_no = i;
 			delta++;
+		} else if (!data) {
+			/* large blobs, check later */
+			obj->real_type = OBJ_BAD;
+			nr_delays++;
 		} else
-			sha1_object(data, obj->size, obj->type, obj->idx.sha1);
+			sha1_object(data, NULL, obj->size, obj->type, obj->idx.sha1);
 		free(data);
 		display_progress(progress, i+1);
 	}
@@ -899,6 +924,17 @@
 	if (S_ISREG(st.st_mode) &&
 	    lseek(input_fd, 0, SEEK_CUR) - input_len != st.st_size)
 		die(_("pack has junk at the end"));
+
+	for (i = 0; i < nr_objects; i++) {
+		struct object_entry *obj = &objects[i];
+		if (obj->real_type != OBJ_BAD)
+			continue;
+		obj->real_type = obj->type;
+		sha1_object(NULL, obj, obj->size, obj->type, obj->idx.sha1);
+		nr_delays--;
+	}
+	if (nr_delays)
+		die(_("confusion beyond insanity in parse_pack_objects()"));
 }
 
 /*
diff --git a/t/t1050-large.sh b/t/t1050-large.sh
index 55ed955..3f80688 100755
--- a/t/t1050-large.sh
+++ b/t/t1050-large.sh
@@ -130,6 +130,11 @@ test_expect_success 'git-show a large file' '
 
 	'
 
+test_expect_success 'index-pack' '
+	git clone file://"`pwd`"/.git foo &&
+	GIT_DIR=non-existent git index-pack --strict --verify foo/.git/objects/pack/*.pack
+'
+
 test_expect_success 'repack' '
 	git repack -ad
 '
-- 
1.7.8.36.g69ee2
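
An aside for readers who want to poke at the fixed_buf trick outside
of index-pack: below is a minimal standalone sketch of the same
streaming-hash loop. It is not git code and makes a few assumptions:
it substitutes plain zlib and OpenSSL (build with -lz -lcrypto) for
git's git_zstream and git_SHA_CTX wrappers, the helper name
hash_compressed() is invented for the example, and unlike
unpack_entry_data() it does not hash the "<type> <size>" object
header before the data.

#include <string.h>
#include <zlib.h>
#include <openssl/sha.h>

/*
 * Inflate a zlib stream through a small reusable window, feeding
 * each chunk into a running SHA-1 and then discarding it, so the
 * whole decompressed blob never sits in memory at once.
 */
static int hash_compressed(const unsigned char *in, unsigned long in_len,
			   unsigned char sha1[SHA_DIGEST_LENGTH])
{
	static unsigned char fixed_buf[8192];	/* reusable window */
	z_stream stream;
	SHA_CTX c;
	int status;

	memset(&stream, 0, sizeof(stream));
	if (inflateInit(&stream) != Z_OK)
		return -1;
	SHA1_Init(&c);

	stream.next_in = (Bytef *)in;
	stream.avail_in = (uInt)in_len;
	do {
		/* rewind the window instead of growing the buffer */
		stream.next_out = fixed_buf;
		stream.avail_out = sizeof(fixed_buf);
		status = inflate(&stream, Z_NO_FLUSH);
		if (status != Z_OK && status != Z_STREAM_END)
			break;
		/* hash what this round produced, then forget it */
		SHA1_Update(&c, fixed_buf,
			    sizeof(fixed_buf) - stream.avail_out);
	} while (status == Z_OK);

	inflateEnd(&stream);
	if (status != Z_STREAM_END)
		return -1;	/* truncated or corrupt input */
	SHA1_Final(sha1, &c);
	return 0;
}

The design point is the one the patch depends on: next_out/avail_out
are rewound on every round, so at most 8KB of decompressed data is
live at a time, while zlib keeps accumulating stream.total_out across
rounds, which is why unpack_entry_data() can still verify the
advertised object size after the loop.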