[PATCH 7/6] zlib: allow feeding more than 4GB in one go

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Junio C Hamano <gitster@xxxxxxxxx> writes:

> The next step would be to tweak zlib_post_call(), git_inflate() and
> git_deflate() functions to internally loop and call underlying inflate()
> and deflate() when the incoming buffers are larger than 4GB, but that
> part is not done in this series (yet).

And this is such a patch.

I had a draft of this one ready when I sent the previous 6-patch series,
but it took an embarrassingly long time for me to figure out what went wrong
in this patch before I realized that Z_FINISH should not be relayed to the
underlying inflate/deflate if we are splitting the request into multiple
phases.

I consider it still rough, but with this on top of v1.7.5.4, together with
the previous 6 patches, it seems to pass a trivial "have a large blob,
add, commit, pack, verify, index-pack, fsck" test, without using either
of the recent "let's pass large blob to fast-import" (in 'master') or
"let's stream a blob out without expanding it in core first" (in 'next')
changes.

-- test script to be stored in /var/tmp/junk or somewhere --
#!/bin/sh
#
# Smoke test for a blob larger than 4GB: add, commit, fsck, repack,
# verify-pack, unpack-objects, index-pack and cat-file, then compare the
# extracted blob with the original file.
#
# Run it in a scratch directory: it removes any .git found there and
# leaves a.big, b.big and a copy of the pack/idx behind.

set -x

# Start from a fresh, empty repository.
rm -fr .git && git init || exit

# Write a single newline at a 4800MB offset so a.big is larger than 4GB.
echo | dd bs=1M seek=4800 of=a.big
# NOTE(review): presumably here so a.big's mtime is safely older than the
# index timestamp -- confirm.
sleep 2
git add a.big

# Split the blob's object name from "ls-files -s" into its first two hex
# digits ($h) and the remainder ($a), matching the loose-object path layout.
eval $(git ls-files -s a.big |
    sed -e 's/^[0-7]* \([0-9a-f][0-9a-f]\)\([0-9a-f]*\) .*/h=\1 a=\2/')
echo $h $a

# The blob must exist as a loose object after "git add".
ls -l .git/objects/$h/$a || exit

git commit -m initial || exit

git fsck || exit

git repack -a -d || exit

git fsck || exit

# After "repack -a -d" no loose object files should remain.
f=$(find .git/objects/?? -type f)
test -z "$f" || exit

# Locate the pack and derive the name of its index.
p=$(find .git/objects/pack -type f -name '*.pack') &&
i=${p%.pack}.idx &&

test -f $p &&
test -f $i || exit

# Move the pack and its index out of the repository into the current
# directory, keeping their base names in $pp and $ii.
mv $p $i . &&
pp=$(basename $p) &&
ii=$(basename $i) &&
test -f $pp &&
test -f $ii || exit

git verify-pack $pp || exit

# Explode the pack back into loose objects; the blob must reappear.
git unpack-objects <$pp || exit

ls -l .git/objects/$h/$a || exit

# Drop the loose objects again and rebuild pack + index from the stream.
rm -fr .git/objects/?? || exit

git index-pack --stdin <$pp || exit
# The regenerated index is expected at the original path and must be
# identical to the one saved earlier.
test -f "$i" || exit
cmp $i $ii || exit

# Finally, the blob read back from the pack must match the input file.
# A mismatch has to abort here; otherwise the script would still report
# "all done." and exit successfully.
git cat-file blob "$h$a" >b.big || exit
cmp a.big b.big || exit

echo all done.
exit 0
-- test script ends here --

-- >8 --
Update zlib_post_call() that adjusts the wrapper's notion of avail_in and
avail_out to what came back from zlib, so that the callers can feed
buffers larger than 4GB to the API.

When the underlying inflate/deflate stopped processing because we fed a
buffer larger than the 4GB limit, detect that case, update the state
variables, and let the zlib function work another round.

Signed-off-by: Junio C Hamano <gitster@xxxxxxxxx>
---
 zlib.c |   78 +++++++++++++++++++++++++++++++++++++++++++++++----------------
 1 files changed, 58 insertions(+), 20 deletions(-)

diff --git a/zlib.c b/zlib.c
index fe537e3..3c63d48 100644
--- a/zlib.c
+++ b/zlib.c
@@ -27,12 +27,11 @@ static const char *zerr_to_string(int status)
  * limits the size of the buffer we can use to 4GB when interacting
  * with zlib in a single call to inflate/deflate.
  */
-#define ZLIB_BUF_MAX ((uInt)-1)
+/* #define ZLIB_BUF_MAX ((uInt)-1) */
+#define ZLIB_BUF_MAX ((uInt) 1024 * 1024 * 1024) /* 1GB */
 static inline uInt zlib_buf_cap(unsigned long len)
 {
-	if (ZLIB_BUF_MAX < len)
-		die("working buffer for zlib too large");
-	return len;
+	return (ZLIB_BUF_MAX < len) ? ZLIB_BUF_MAX : len;
 }
 
 static void zlib_pre_call(git_zstream *s)
@@ -47,12 +46,22 @@ static void zlib_pre_call(git_zstream *s)
 
 static void zlib_post_call(git_zstream *s)
 {
+	unsigned long bytes_consumed;
+	unsigned long bytes_produced;
+
+	bytes_consumed = s->z.next_in - s->next_in;
+	bytes_produced = s->z.next_out - s->next_out;
+	if (s->z.total_out != s->total_out + bytes_produced)
+		die("BUG: total_out mismatch");
+	if (s->z.total_in != s->total_in + bytes_consumed)
+		die("BUG: total_in mismatch");
+
+	s->total_out = s->z.total_out;
+	s->total_in = s->z.total_in;
 	s->next_in = s->z.next_in;
 	s->next_out = s->z.next_out;
-	s->total_in = s->z.total_in;
-	s->total_out = s->z.total_out;
-	s->avail_in = s->z.avail_in;
-	s->avail_out = s->z.avail_out;
+	s->avail_in -= bytes_consumed;
+	s->avail_out -= bytes_produced;
 }
 
 void git_inflate_init(git_zstream *strm)
@@ -103,18 +112,32 @@ int git_inflate(git_zstream *strm, int flush)
 {
 	int status;
 
-	zlib_pre_call(strm);
-	status = inflate(&strm->z, flush);
-	zlib_post_call(strm);
+	for (;;) {
+		zlib_pre_call(strm);
+		/* Never say Z_FINISH unless we are feeding everything */
+		status = inflate(&strm->z,
+				 (strm->z.avail_in != strm->avail_in)
+				 ? 0 : flush);
+		if (status == Z_MEM_ERROR)
+			die("inflate: out of memory");
+		zlib_post_call(strm);
+
+		/*
+		 * Let zlib work another round, while we can still
+		 * make progress.
+		 */
+		if ((strm->avail_out && !strm->z.avail_out) &&
+		    (status == Z_OK || status == Z_BUF_ERROR))
+			continue;
+		break;
+	}
+
 	switch (status) {
 	/* Z_BUF_ERROR: normal, needs more space in the output buffer */
 	case Z_BUF_ERROR:
 	case Z_OK:
 	case Z_STREAM_END:
 		return status;
-
-	case Z_MEM_ERROR:
-		die("inflate: out of memory");
 	default:
 		break;
 	}
@@ -192,18 +215,33 @@ int git_deflate(git_zstream *strm, int flush)
 {
 	int status;
 
-	zlib_pre_call(strm);
-	status = deflate(&strm->z, flush);
-	zlib_post_call(strm);
+	for (;;) {
+		zlib_pre_call(strm);
+
+		/* Never say Z_FINISH unless we are feeding everything */
+		status = deflate(&strm->z,
+				 (strm->z.avail_in != strm->avail_in)
+				 ? 0 : flush);
+		if (status == Z_MEM_ERROR)
+			die("deflate: out of memory");
+		zlib_post_call(strm);
+
+		/*
+		 * Let zlib work another round, while we can still
+		 * make progress.
+		 */
+		if ((strm->avail_out && !strm->z.avail_out) &&
+		    (status == Z_OK || status == Z_BUF_ERROR))
+			continue;
+		break;
+	}
+
 	switch (status) {
 	/* Z_BUF_ERROR: normal, needs more space in the output buffer */
 	case Z_BUF_ERROR:
 	case Z_OK:
 	case Z_STREAM_END:
 		return status;
-
-	case Z_MEM_ERROR:
-		die("deflate: out of memory");
 	default:
 		break;
 	}
-- 
1.7.6.rc1.118.ge175b4a
--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]