[PATCH 5/5] archive-zip: stream large blobs into zip file

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



A large blob will be read twice: once to calculate its crc32 and once for
the actual writing. Large blobs are written uncompressed for simplicity.

Writing compressed large blobs is possible. But a naive implementation
would need to decompress/compress the blob twice: once to calculate the
compressed size and once for the actual writing, assuming the compressed
blobs are still over the large file limit.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@xxxxxxxxx>
---
 I think we could extract the compressed size from the pack index, then
 stream the compressed blob directly from the pack to the zip file. But
 that makes git-archive sensitive to the pack format. And to be honest,
 I don't care enough about large file support to do it. This patch is
 good enough for me.

 Documentation/git-archive.txt |    3 ++
 archive-zip.c                 |   42 ++++++++++++++++++++++++++++++++++++++++-
 t/t1050-large.sh              |    4 +++
 3 files changed, 48 insertions(+), 1 deletions(-)

diff --git a/Documentation/git-archive.txt b/Documentation/git-archive.txt
index ac7006e..6df85a6 100644
--- a/Documentation/git-archive.txt
+++ b/Documentation/git-archive.txt
@@ -120,6 +120,9 @@ tar.<format>.remote::
 	user-defined formats, but true for the "tar.gz" and "tgz"
 	formats.
 
+core.bigFileThreshold::
+	Files larger than this size are stored uncompressed in zip format.
+
 ATTRIBUTES
 ----------
 
diff --git a/archive-zip.c b/archive-zip.c
index f8039ba..ee58bda 100644
--- a/archive-zip.c
+++ b/archive-zip.c
@@ -3,6 +3,7 @@
  */
 #include "cache.h"
 #include "archive.h"
+#include "streaming.h"
 
 static int zip_date;
 static int zip_time;
@@ -120,6 +121,29 @@ static void *zlib_deflate(void *data, unsigned long size,
 	return buffer;
 }
 
+/* Feed the streamed blob into *crc; 0 when done, -1 if a read fails. */
+static int crc32_stream(const unsigned char *sha1, unsigned long *crc)
+{
+	struct git_istream *st;
+	enum object_type type;
+	unsigned long sz;
+	ssize_t readlen;
+
+	st = open_istream(sha1, &type, &sz, NULL);
+	if (!st)
+		return error("cannot stream blob %s", sha1_to_hex(sha1));
+	for (;;) {
+		char buf[1024];
+
+		readlen = read_istream(st, buf, sizeof(buf));
+		if (readlen <= 0)
+			break;	/* nothing more to read, or a read error */
+		*crc = crc32(*crc, (unsigned char*)buf, readlen);
+	}
+	close_istream(st);	/* previously skipped: the loop returned directly */
+	return readlen < 0 ? -1 : 0;
+}
+
 static int write_zip_entry(struct archiver_args *args,
 			   const unsigned char *sha1,
 			   const char *path, size_t pathlen,
@@ -153,6 +177,19 @@ static int write_zip_entry(struct archiver_args *args,
 		compressed_size = 0;
 		buffer = NULL;
 		size = 0;
+	} else if (!args->convert && S_ISREG(mode) &&
+		      sha1_object_info(sha1, &size) == OBJ_BLOB &&
+		      size > big_file_threshold) {
+		buffer = NULL;
+		method = 0;
+		attr2 = S_ISLNK(mode) ? ((mode | 0777) << 16) :
+			(mode & 0111) ? ((mode) << 16) : 0;
+		if (crc32_stream(sha1, &crc) < 0)
+			return error("failed to calculate crc32 from blob %s, SHA1 %s",
+				     path, sha1_to_hex(sha1));
+		out = buffer;
+		uncompressed_size = size;
+		compressed_size = size;
 	} else if (S_ISREG(mode) || S_ISLNK(mode)) {
 		enum object_type type;
 		buffer = sha1_file_to_archive(args, path, sha1, mode, &type, &size);
@@ -234,7 +271,10 @@ static int write_zip_entry(struct archiver_args *args,
 	write_or_die(1, path, pathlen);
 	zip_offset += pathlen;
 	if (compressed_size > 0) {
-		write_or_die(1, out, compressed_size);
+		if (out)
+			write_or_die(1, out, compressed_size);
+		else
+			stream_blob_to_fd(1, sha1, NULL, 0);
 		zip_offset += compressed_size;
 	}
 
diff --git a/t/t1050-large.sh b/t/t1050-large.sh
index fe47554..458fdde 100755
--- a/t/t1050-large.sh
+++ b/t/t1050-large.sh
@@ -138,4 +138,8 @@ test_expect_success 'tar achiving' '
 	git archive --format=tar HEAD >/dev/null
 '
 
+test_expect_success 'zip achiving' '
+	git archive --format=zip HEAD >/dev/null
+'
+
 test_done
-- 
1.7.8.36.g69ee2

--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]