[PATCH 6/7] diff: cache textconv output

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Running a textconv filter can take a long time. It's
particularly bad for a large file which needs to be spooled
to disk, but even for small files, the fork+exec overhead
can add up for something like "git log -p".

This patch uses the notes-cache mechanism to keep a fast
cache of textconv output. Caches are stored in
refs/notes/textconv/$x, where $x is the userdiff driver
defined in gitattributes.

Caching is enabled only if diff.$x.cachetextconv is true.

In my test repo, on a commit with 45 jpg and avi files
changed and a textconv to show their exif tags:

  [before]
  $ time git show >/dev/null
  real    0m13.724s
  user    0m12.057s
  sys     0m1.624s

  [after, first run]
  $ git config diff.mfo.cachetextconv true
  $ time git show >/dev/null
  real    0m14.252s
  user    0m12.197s
  sys     0m1.800s

  [after, subsequent runs]
  $ time git show >/dev/null
  real    0m0.352s
  user    0m0.148s
  sys     0m0.200s

So for a slight (3.8%) cost on the first run, we achieve an
almost 40x speed up on subsequent runs.

Signed-off-by: Jeff King <peff@xxxxxxxx>
---
The moral equivalent of what I posted earlier, but most of the heavy
lifting is in notes-cache.[ch] from earlier in the series.

It's off by default. I think it should probably swithc to on by default,
but I figured we could let the feature mature a bit first.

 Documentation/gitattributes.txt  |   20 +++++++
 diff.c                           |   52 +++++++++++++++---
 t/t4042-diff-textconv-caching.sh |  109 ++++++++++++++++++++++++++++++++++++++
 userdiff.c                       |    9 +++
 userdiff.h                       |    4 ++
 5 files changed, 185 insertions(+), 9 deletions(-)
 create mode 100755 t/t4042-diff-textconv-caching.sh

diff --git a/Documentation/gitattributes.txt b/Documentation/gitattributes.txt
index d892e64..a8500d1 100644
--- a/Documentation/gitattributes.txt
+++ b/Documentation/gitattributes.txt
@@ -414,6 +414,26 @@ because it quickly conveys the changes you have made), you
 should generate it separately and send it as a comment _in
 addition to_ the usual binary diff that you might send.
 
+Because text conversion can be slow, especially when doing a
+large number of them with `git log -p`, git provides a mechanism
+to cache the output and use it in future diffs.  To enable
+caching, set the "cachetextconv" variable in your diff driver's
+config. For example:
+
+------------------------
+[diff "jpg"]
+	textconv = exif
+	cachetextconv = true
+------------------------
+
+This will cache the result of running "exif" on each blob
+indefinitely. If you change the textconv config variable for a
+diff driver, git will automatically invalidate the cache entries
+and re-run the textconv filter. If you want to invalidate the
+cache manually (e.g., because your version of "exif" was updated
+and now produces better output), you can remove the cache
+manually with `git update-ref -d refs/notes/textconv/jpg` (where
+"jpg" is the name of the diff driver, as in the example above).
 
 Performing a three-way merge
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/diff.c b/diff.c
index 9665d6d..72d8503 100644
--- a/diff.c
+++ b/diff.c
@@ -43,7 +43,7 @@ static char diff_colors[][COLOR_MAXLEN] = {
 };
 
 static void diff_filespec_load_driver(struct diff_filespec *one);
-static size_t fill_textconv(const char *cmd,
+static size_t fill_textconv(struct userdiff_driver *driver,
 			    struct diff_filespec *df, char **outbuf);
 
 static int parse_diff_color_slot(const char *var, int ofs)
@@ -466,8 +466,8 @@ static void emit_rewrite_diff(const char *name_a,
 			      const char *name_b,
 			      struct diff_filespec *one,
 			      struct diff_filespec *two,
-			      const char *textconv_one,
-			      const char *textconv_two,
+			      struct userdiff_driver *textconv_one,
+			      struct userdiff_driver *textconv_two,
 			      struct diff_options *o)
 {
 	int lc_a, lc_b;
@@ -1569,14 +1569,26 @@ void diff_set_mnemonic_prefix(struct diff_options *options, const char *a, const
 		options->b_prefix = b;
 }
 
-static const char *get_textconv(struct diff_filespec *one)
+static struct userdiff_driver *get_textconv(struct diff_filespec *one)
 {
 	if (!DIFF_FILE_VALID(one))
 		return NULL;
 	if (!S_ISREG(one->mode))
 		return NULL;
 	diff_filespec_load_driver(one);
-	return one->driver->textconv;
+	if (!one->driver->textconv)
+		return NULL;
+
+	if (one->driver->textconv_want_cache && !one->driver->textconv_cache) {
+		struct notes_cache *c = xmalloc(sizeof(*c));
+		struct strbuf name = STRBUF_INIT;
+
+		strbuf_addf(&name, "textconv/%s", one->driver->name);
+		notes_cache_init(c, name.buf, one->driver->textconv);
+		one->driver->textconv_cache = c;
+	}
+
+	return one->driver;
 }
 
 static void builtin_diff(const char *name_a,
@@ -1593,7 +1605,8 @@ static void builtin_diff(const char *name_a,
 	const char *set = diff_get_color_opt(o, DIFF_METAINFO);
 	const char *reset = diff_get_color_opt(o, DIFF_RESET);
 	const char *a_prefix, *b_prefix;
-	const char *textconv_one = NULL, *textconv_two = NULL;
+	struct userdiff_driver *textconv_one = NULL;
+	struct userdiff_driver *textconv_two = NULL;
 	struct strbuf header = STRBUF_INIT;
 
 	if (DIFF_OPT_TST(o, SUBMODULE_LOG) &&
@@ -3888,13 +3901,13 @@ static char *run_textconv(const char *pgm, struct diff_filespec *spec,
 	return strbuf_detach(&buf, outsize);
 }
 
-static size_t fill_textconv(const char *cmd,
+static size_t fill_textconv(struct userdiff_driver *driver,
 			    struct diff_filespec *df,
 			    char **outbuf)
 {
 	size_t size;
 
-	if (!cmd) {
+	if (!driver || !driver->textconv) {
 		if (!DIFF_FILE_VALID(df)) {
 			*outbuf = "";
 			return 0;
@@ -3905,8 +3918,29 @@ static size_t fill_textconv(const char *cmd,
 		return df->size;
 	}
 
-	*outbuf = run_textconv(cmd, df, &size);
+	if (driver->textconv_cache) {
+		*outbuf = notes_cache_get(driver->textconv_cache, df->sha1,
+					  &size);
+		if (*outbuf)
+			return size;
+	}
+
+	*outbuf = run_textconv(driver->textconv, df, &size);
 	if (!*outbuf)
 		die("unable to read files to diff");
+
+	if (driver->textconv_cache) {
+		/* ignore errors, as we might be in a readonly repository */
+		notes_cache_put(driver->textconv_cache, df->sha1, *outbuf,
+				size);
+		/*
+		 * we could save up changes and flush them all at the end,
+		 * but we would need an extra call after all diffing is done.
+		 * Since generating a cache entry is the slow path anyway,
+		 * this extra overhead probably isn't a big deal.
+		 */
+		notes_cache_write(driver->textconv_cache);
+	}
+
 	return size;
 }
diff --git a/t/t4042-diff-textconv-caching.sh b/t/t4042-diff-textconv-caching.sh
new file mode 100755
index 0000000..91f8198
--- /dev/null
+++ b/t/t4042-diff-textconv-caching.sh
@@ -0,0 +1,109 @@
+#!/bin/sh
+
+test_description='test textconv caching'
+. ./test-lib.sh
+
+cat >helper <<'EOF'
+#!/bin/sh
+sed 's/^/converted: /' "$@" >helper.out
+cat helper.out
+EOF
+chmod +x helper
+
+test_expect_success 'setup' '
+	echo foo content 1 >foo.bin &&
+	echo bar content 1 >bar.bin &&
+	git add . &&
+	git commit -m one &&
+	echo foo content 2 >foo.bin &&
+	echo bar content 2 >bar.bin &&
+	git commit -a -m two &&
+	echo "*.bin diff=magic" >.gitattributes &&
+	git config diff.magic.textconv ./helper &&
+	git config diff.magic.cachetextconv true
+'
+
+cat >expect <<EOF
+diff --git a/bar.bin b/bar.bin
+index fcf9166..28283d5 100644
+--- a/bar.bin
++++ b/bar.bin
+@@ -1 +1 @@
+-converted: bar content 1
++converted: bar content 2
+diff --git a/foo.bin b/foo.bin
+index d5b9fe3..1345db2 100644
+--- a/foo.bin
++++ b/foo.bin
+@@ -1 +1 @@
+-converted: foo content 1
++converted: foo content 2
+EOF
+
+test_expect_success 'first textconv works' '
+	git diff HEAD^ HEAD >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'cached textconv produces same output' '
+	git diff HEAD^ HEAD >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'cached textconv does not run helper' '
+	rm -f helper.out &&
+	git diff HEAD^ HEAD >actual &&
+	test_cmp expect actual &&
+	! test -r helper.out
+'
+
+cat >expect <<EOF
+diff --git a/bar.bin b/bar.bin
+index fcf9166..28283d5 100644
+--- a/bar.bin
++++ b/bar.bin
+@@ -1,2 +1,2 @@
+ converted: other
+-converted: bar content 1
++converted: bar content 2
+diff --git a/foo.bin b/foo.bin
+index d5b9fe3..1345db2 100644
+--- a/foo.bin
++++ b/foo.bin
+@@ -1,2 +1,2 @@
+ converted: other
+-converted: foo content 1
++converted: foo content 2
+EOF
+test_expect_success 'changing textconv invalidates cache' '
+	echo other >other &&
+	git config diff.magic.textconv "./helper other" &&
+	git diff HEAD^ HEAD >actual &&
+	test_cmp expect actual
+'
+
+cat >expect <<EOF
+diff --git a/bar.bin b/bar.bin
+index fcf9166..28283d5 100644
+--- a/bar.bin
++++ b/bar.bin
+@@ -1,2 +1,2 @@
+ converted: other
+-converted: bar content 1
++converted: bar content 2
+diff --git a/foo.bin b/foo.bin
+index d5b9fe3..1345db2 100644
+--- a/foo.bin
++++ b/foo.bin
+@@ -1 +1 @@
+-converted: foo content 1
++converted: foo content 2
+EOF
+test_expect_success 'switching diff driver produces correct results' '
+	git config diff.moremagic.textconv ./helper &&
+	echo foo.bin diff=moremagic >>.gitattributes &&
+	git diff HEAD^ HEAD >actual &&
+	test_cmp expect actual
+'
+
+test_done
diff --git a/userdiff.c b/userdiff.c
index df99249..67003fb 100644
--- a/userdiff.c
+++ b/userdiff.c
@@ -1,3 +1,4 @@
+#include "cache.h"
 #include "userdiff.h"
 #include "cache.h"
 #include "attr.h"
@@ -167,6 +168,12 @@ static int parse_tristate(int *b, const char *k, const char *v)
 	return 1;
 }
 
+static int parse_bool(int *b, const char *k, const char *v)
+{
+	*b = git_config_bool(k, v);
+	return 1;
+}
+
 int userdiff_config(const char *k, const char *v)
 {
 	struct userdiff_driver *drv;
@@ -181,6 +188,8 @@ int userdiff_config(const char *k, const char *v)
 		return parse_string(&drv->external, k, v);
 	if ((drv = parse_driver(k, v, "textconv")))
 		return parse_string(&drv->textconv, k, v);
+	if ((drv = parse_driver(k, v, "cachetextconv")))
+		return parse_bool(&drv->textconv_want_cache, k, v);
 	if ((drv = parse_driver(k, v, "wordregex")))
 		return parse_string(&drv->word_regex, k, v);
 
diff --git a/userdiff.h b/userdiff.h
index c315159..942d594 100644
--- a/userdiff.h
+++ b/userdiff.h
@@ -1,6 +1,8 @@
 #ifndef USERDIFF_H
 #define USERDIFF_H
 
+#include "notes-cache.h"
+
 struct userdiff_funcname {
 	const char *pattern;
 	int cflags;
@@ -13,6 +15,8 @@ struct userdiff_driver {
 	struct userdiff_funcname funcname;
 	const char *word_regex;
 	const char *textconv;
+	struct notes_cache *textconv_cache;
+	int textconv_want_cache;
 };
 
 int userdiff_config(const char *k, const char *v);
-- 
1.7.0.4.299.gba9d4

--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]