Re: [PATCH 2/2] --pretty=format: on-demand format expansion

Jeff King <peff@xxxxxxxx> · Thu, 8 Nov 2007 23:50:40 -0500

On Fri, Nov 09, 2007 at 01:49:42AM +0100, René Scharfe wrote:

> Another way is to use a callback based approach together with the
> strbuf library to keep allocations to a minimum and avoid string
> copies.  That's what this patch does.  It introduces a new strbuf
> function, strbuf_expand().

I think this is definitely the right approach, but there are some format
strings where the performance will be worse. Specifically:

  - formatting expensive items multiple times will incur work
    proportional to the number of times the item is used (in the old
    code, it was calculated just once).  e.g., "%h%h%h%h"

  - formatting some items goes to some work that can be re-used by other
    items (e.g., %ad and %ar both need to parse the author date)

And we could obviously overcome both by caching the results of expensive
operations.  I'm not sure if these will be a problem in practice. For
the first one, the new code is so much faster that I needed to do

  git-log --pretty=format:%h%h%h%h%h%h%h%h

to get a performance regression from the old code, which seems rather
unlikely. For the second, it is easy to imagine multiple "person" items
being used together, although their cost to produce is not all that
high. It looks like about .05 seconds to parse a date (over all commits
in git.git):

$ time ./git-log --pretty='format:' >/dev/null
real    0m0.441s
user    0m0.424s
sys     0m0.004s

$ time ./git-log --pretty='format:%ad' >/dev/null
real    0m0.477s
user    0m0.472s
sys     0m0.000s

$ time ./git-log --pretty='format:%ad %aD' >/dev/null
real    0m0.527s
user    0m0.520s
sys     0m0.004s

where the last two could probably end up costing about the same if we cached
the author parsing (but the caching will have a cost, too, so it might not end
up being a big win).

So it might make sense to cache some items as we figure them out. This
should be done by the calling code and not by strbuf_expand (since it
doesn't know which things are worth caching (and for fast things,
allocating memory for a cache entry is likely to be slower), or how the
expansions relate to each other).

A partial patch on top of yours is below (it caches commit and tree
abbreviations; parent abbreviations and person-parsing are probably
worth doing). Some timings:

Null format (these are average-looking runs; the differences got lost
in the noise):

# your patch
$ time git-log --pretty=format: >/dev/null
real    0m0.409s
user    0m0.384s
sys     0m0.012s

# with my patch
$ time ./git-log --pretty=format: >/dev/null
real    0m0.413s
user    0m0.404s
sys     0m0.004s

Single abbrev lookup (mine should be slightly slower because of
malloc/free of cache):

# your patch
$ time git-log --pretty=format:%h >/dev/null
real    0m0.536s
user    0m0.456s
sys     0m0.080s

# with my patch
$ time ./git-log --pretty=format:%h >/dev/null
real    0m0.553s
user    0m0.464s
sys     0m0.088s

Two abbrev lookups (I win by a little bit, but definitely not lost in
the noise):

# your patch
$ time git-log --pretty=format:%h%h >/dev/null
real    0m0.671s
user    0m0.496s
sys     0m0.144s

# my patch
$ time ./git-log  --pretty=format:%h%h >/dev/null
real    0m0.567s
user    0m0.480s
sys     0m0.080s

And of course I can make pathological cases where mine wins hands down
(on "%h%h%h%h%h%h%h%h", mine stays the same but yours bumps to 1.2s).

So I think this is probably worth doing. Even if doubled work isn't the
common case,
  1. It doesn't hurt the common case much at all (I think on average it
     is slower, but the timings were totally lost in the noise)
  2. It has a measurable impact on reasonable cases (like just using an
     expensive substitution twice)
  3. It has a huge impact on pathological cases (though I'm not sure we
     care about those that much)
  4. It's very little extra code, and it should be obvious to read. It
     also documents the technique for other users of strbuf_expand,
     where the "doubled" cases may be more common.

-Peff

---
 pretty.c |   47 ++++++++++++++++++++++++++++++++++++++++-------
 1 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/pretty.c b/pretty.c
index 9fbd73f..8ae6fdd 100644
--- a/pretty.c
+++ b/pretty.c
@@ -282,6 +282,27 @@ static char *logmsg_reencode(const struct commit *commit,
 	return out;
 }
 
+struct pretty_context {
+	const struct commit *commit;
+	char *abbrev_commit;
+	char *abbrev_tree;
+};
+
+static void pretty_context_init(struct pretty_context *p, const struct commit *c)
+{
+	p->commit = c;
+	p->abbrev_commit = NULL;
+	p->abbrev_tree = NULL;
+}
+
+static void pretty_context_free(struct pretty_context *p)
+{
+	if (p->abbrev_commit)
+		free(p->abbrev_commit);
+	if (p->abbrev_tree)
+		free(p->abbrev_tree);
+}
+
 static void format_person_part(struct strbuf *sb, char part,
                                const char *msg, int len)
 {
@@ -355,9 +376,10 @@ static void format_person_part(struct strbuf *sb, char part,
 }
 
 static void format_commit_item(struct strbuf *sb, const char *placeholder,
-                               void *context)
+                               void *vcontext)
 {
-	const struct commit *commit = context;
+	struct pretty_context *context = vcontext;
+	const struct commit *commit = context->commit;
 	struct commit_list *p;
 	int i;
 	enum { HEADER, SUBJECT, BODY } state;
@@ -394,15 +416,23 @@ static void format_commit_item(struct strbuf *sb, const char *placeholder,
 		strbuf_addstr(sb, sha1_to_hex(commit->object.sha1));
 		return;
 	case 'h':		/* abbreviated commit hash */
-		strbuf_addstr(sb, find_unique_abbrev(commit->object.sha1,
-		                                     DEFAULT_ABBREV));
+		if (!context->abbrev_commit)
+			context->abbrev_commit = xstrdup(
+					find_unique_abbrev(
+						commit->object.sha1,
+						DEFAULT_ABBREV));
+		strbuf_addstr(sb, context->abbrev_commit);
 		return;
 	case 'T':		/* tree hash */
 		strbuf_addstr(sb, sha1_to_hex(commit->tree->object.sha1));
 		return;
 	case 't':		/* abbreviated tree hash */
-		strbuf_addstr(sb, find_unique_abbrev(commit->tree->object.sha1,
-		                                     DEFAULT_ABBREV));
+		if (!context->abbrev_tree)
+			context->abbrev_tree = xstrdup(
+					find_unique_abbrev(
+					commit->tree->object.sha1,
+					DEFAULT_ABBREV));
+		strbuf_addstr(sb, context->abbrev_tree);
 		return;
 	case 'P':		/* parent hashes */
 		for (p = commit->parents; p; p = p->next) {
@@ -505,7 +535,10 @@ void format_commit_message(const struct commit *commit,
 		"m",		/* left/right/bottom */
 		NULL
 	};
-	strbuf_expand(sb, format, placeholders, format_commit_item, (void *)commit);
+	struct pretty_context context;
+	pretty_context_init(&context, commit);
+	strbuf_expand(sb, format, placeholders, format_commit_item, &context);
+	pretty_context_free(&context);
 }
 
 static void pp_header(enum cmit_fmt fmt,
-
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html