Re: [PATCH 00/11] writing out a huge blob to working tree

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Sun, May 15, 2011 at 05:30:20PM -0700, Junio C Hamano wrote:

> Recently "diff" learned to avoid reading the contents only to say "Binary
> files differ" when these large blobs are marked as binary.

With your series, we should be able to get similar speedups even if the
user didn't explicitly mark a file as binary. We need only peek at the
beginning of a blob to see if it is binary, so we can be conservative
with big files. Something like this (which doesn't work because of the
"size" bug I mentioned elsewhere, but is meant to be illustrative):

diff --git a/diff.c b/diff.c
index ba5f7aa..bfe1b2d 100644
--- a/diff.c
+++ b/diff.c
@@ -15,6 +15,7 @@
 #include "sigchain.h"
 #include "submodule.h"
 #include "ll-merge.h"
+#include "streaming.h"
 
 #ifdef NO_FAST_WORKING_DIRECTORY
 #define FAST_WORKING_DIRECTORY 0
@@ -1931,6 +1932,37 @@ static void diff_filespec_load_driver(struct diff_filespec *one)
 		one->driver = userdiff_find_by_name("default");
 }
 
+static char *populate_or_peek(struct diff_filespec *df,
+			      unsigned long want,
+			      unsigned long *got)
+{
+	struct git_istream *st;
+	enum object_type type;
+	char *buf;
+
+	st = open_istream(df->sha1, &type, &df->size);
+	if (!st) {
+		diff_populate_filespec(df, 0);
+		*got = df->size;
+		return df->data;
+	}
+
+	if (df->size < big_file_threshold) {
+		buf = df->data = xmallocz(df->size);
+		want = df->size;
+		df->should_free = 1;
+	}
+	else
+		buf = xmallocz(want);
+
+	/* looks like it will always read_in_full? */
+	if (read_istream(st, buf, want) != want)
+		die("failed to read object");
+	close_istream(st);
+	*got = want;
+	return buf;
+}
+
 int diff_filespec_is_binary(struct diff_filespec *one)
 {
 	if (one->is_binary == -1) {
@@ -1938,13 +1970,25 @@ int diff_filespec_is_binary(struct diff_filespec *one)
 		if (one->driver->binary != -1)
 			one->is_binary = one->driver->binary;
 		else {
-			if (!one->data && DIFF_FILE_VALID(one))
-				diff_populate_filespec(one, 0);
-			if (one->data)
-				one->is_binary = buffer_is_binary(one->data,
-						one->size);
+			char *buf;
+			unsigned long size;
+
+			if (one->data) {
+				buf = one->data;
+				size = one->size;
+			}
+			else if (DIFF_FILE_VALID(one))
+				buf = populate_or_peek(one, 8192, &size);
+			else
+				buf = NULL;
+
+			if (buf)
+				one->is_binary = buffer_is_binary(buf, size);
 			if (one->is_binary == -1)
 				one->is_binary = 0;
+
+			if (buf != one->data)
+				free(buf);
 		}
 	}
 	return one->is_binary;

I think a "peek" function like this would be a nice addition to the
streaming API. Something like:

  char *peek_sha1(const unsigned char sha1[20], /* which object */
                  enum object_type *type, /* out: type */
                  unsigned long want, /* how much do we need */
                  unsigned long big, /* if less than this, just give us
                                        everything in the name of
                                        efficiency */
                  unsigned long *got, /* out: how much did we peek */
                  unsigned long *size, /* out: how big is the whole thing */
                  );

but maybe diff is the only place where that is useful. I dunno.

-Peff
--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]