Re: [PATCH 0/6] address packed-refs speed regressions

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Sun, Apr 05, 2015 at 02:52:59PM -0400, Jeff King wrote:

> Right now we parse all of the packed-refs file into an in-memory cache,
> and then do single lookups from that cache. Doing an mmap() and a binary
> search is way faster (and costs less memory) for doing individual
> lookups. It relies on the list being sorted. This is generally true, but
> not something we currently rely on (however, it would be easy to add a
> "sorted" flag to top of the file and have the readers fall back when the
> flag is missing). I've played with a patch to do this (it's not entirely
> trivial, because you jump into the middle of a line, and then have to
> walk backwards to find the start of the record).
> 
> For traversals, it's more complicated. Obviously if you are traversing
> all refs, you have to read the whole thing anyway. If you are traversing
> a subset of the refs, you can binary-search the start of the subset, and
> then walk forward. But that's where it gets tricky with the current
> code.

In case you are curious, here is my proof-of-concept for the packed-refs
binary search. You'll note that it's a separate program, and not
integrated into refs.c. I wrote this last August, and after trying to
integrate it into refs.c, I found the ref_cache problems I described,
and I haven't touched it since.

I also seem to have saved the patch for stuffing it into refs.c, but I
am not sure if it even compiles (I wrote only "horrible wip" in the
commit message ;) ).

-- >8 --
Subject: [PATCH] add git-quick-list

This is a proof of concept for binary-searching the
packed-refs file in order to traverse an ordered subset of
it. Note that it _only_ reads the packed-refs file
currently. To really compare to for-each-ref, it would need
to also walk the loose ref area for its prefix. On a
mostly-packed repository that shouldn't make a big speed
difference, though.

And of course we don't _really_ want a separate command here
at all. This should be part of refs.c, and everyone who
calls for_each_ref should benefit from it.

Still, the numbers are promising. Here's are comparisons
against for-each-ref on torvalds/linux, which has a 218M
packed-refs file:

  $ time git for-each-ref \
      --format='%(objectname) %(refname)' \
      refs/remotes/2325298/ |
      wc -c
  44139

  real    0m1.649s
  user    0m1.332s
  sys     0m0.304s

  $ time ~peff/git-quick-list refs/remotes/2325298/ | wc -c
  44139

  real    0m0.012s
  user    0m0.004s
  sys     0m0.004s
---
 Makefile     |   1 +
 quick-list.c | 174 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 175 insertions(+)
 create mode 100644 quick-list.c

diff --git a/Makefile b/Makefile
index 2457065..aa32598 100644
--- a/Makefile
+++ b/Makefile
@@ -541,6 +541,7 @@ PROGRAM_OBJS += shell.o
 PROGRAM_OBJS += show-index.o
 PROGRAM_OBJS += upload-pack.o
 PROGRAM_OBJS += remote-testsvn.o
+PROGRAM_OBJS += quick-list.o
 
 # Binary suffix, set to .exe for Windows builds
 X =
diff --git a/quick-list.c b/quick-list.c
new file mode 100644
index 0000000..e423f1f
--- /dev/null
+++ b/quick-list.c
@@ -0,0 +1,174 @@
+#include "cache.h"
+#include "refs.h"
+
+struct packed_refs_iterator {
+	const char *start;
+	const char *end;
+
+	const char *cur;
+	const char *ref;
+	const char *eol;
+	const char *next;
+};
+
+static void iterator_init(struct packed_refs_iterator *pos,
+			  const char *buf, size_t len)
+{
+	pos->start = buf;
+	pos->end = buf + len;
+
+	/* skip past header line */
+	if (pos->start < pos->end && *pos->start == '#') {
+		while (pos->start < pos->end && *pos->start != '\n')
+			pos->start++;
+		if (pos->start < pos->end)
+			pos->start++;
+	}
+}
+
+static int iterator_cmp(const char *key, struct packed_refs_iterator *pos)
+{
+	const char *ref = pos->ref;
+	for (; *key && ref < pos->eol; key++, ref++)
+		if (*key != *ref)
+			return (unsigned char)*key - (unsigned char)*ref;
+	return ref == pos->eol ? *key ? 1 : 0 : -1;
+}
+
+static const char *find_eol(const char *p, const char *end)
+{
+	p = memchr(p, '\n', end - p);
+	return p ? p : end;
+}
+
+static void parse_line(struct packed_refs_iterator *pos, const char *p)
+{
+	pos->cur = p;
+	if (pos->end - p < 41)
+		die("truncated packed-refs file");
+	p += 41;
+
+	pos->ref = p;
+	pos->eol = p = find_eol(p, pos->end);
+
+	/* skip newline, and then past any peel records */
+	if (p < pos->end)
+		p++;
+	while (p < pos->end && *p == '^') {
+		p = find_eol(p, pos->end);
+		if (p < pos->end)
+			p++;
+	}
+	pos->next = p;
+}
+
+static void iterator_next(struct packed_refs_iterator *pos)
+{
+	if (pos->next < pos->end)
+		parse_line(pos, pos->next);
+	else
+		pos->cur = NULL;
+}
+
+static void iterator_start(struct packed_refs_iterator *pos, const char *prefix)
+{
+	const char *lo = pos->start, *hi = pos->end;
+
+	while (lo < hi) {
+		const char *mi = lo + ((hi - lo) / 2);
+		int cmp;
+
+		/*
+		 * We landed somewhere on a line. Walk back to find
+		 * the start of the line.
+		 */
+		while (mi > lo && *(mi-1) != '\n')
+			mi--;
+
+		/*
+		 * We may have hit a peel-line. In that case, try
+		 * to walk back to the actual ref line (and skip as
+		 * many peel lines as we find, for future-proofing).
+		 */
+		while (*mi == '^') {
+			if (mi == lo)
+				die("peel line without a record before it?");
+			mi--;
+			if (mi == lo)
+				die("peel line with bare newline before it?");
+			mi--;
+			while (mi > lo && *(mi-1) != '\n')
+				mi--;
+		}
+
+		/* Now we should be at a real ref line. */
+		parse_line(pos, mi);
+		cmp = iterator_cmp(prefix, pos);
+		if (!cmp)
+			return;
+		else if (cmp < 0)
+			hi = pos->cur;
+		else
+			lo = pos->next;
+	}
+
+	if (hi < pos->end)
+		parse_line(pos, hi);
+	else
+		pos->cur = NULL;
+}
+
+static void quick_list(const char *prefix, each_ref_fn fn, void *data)
+{
+	int fd = open(git_path("packed-refs"), O_RDONLY);
+	struct stat st;
+	const char *buf = NULL;
+	size_t len;
+	struct packed_refs_iterator pos;
+
+	if (fd < 0)
+		goto out;
+	if (fstat(fd, &st) < 0)
+		goto out;
+	len = xsize_t(st.st_size);
+	buf = xmmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
+	if (!buf)
+		goto out;
+
+	iterator_init(&pos, buf, len);
+	for (iterator_start(&pos, prefix);
+	     pos.cur && starts_with(pos.ref, prefix);
+	     iterator_next(&pos)) {
+		unsigned char sha1[20];
+		char *refname;
+
+		if (get_sha1_hex(pos.cur, sha1) < 0)
+			die("packed-refs contained invalid sha1");
+		refname = xmemdupz(pos.ref, pos.eol - pos.ref);
+		fn(refname, sha1, 0, data);
+		free(refname);
+	}
+
+out:
+	close(fd);
+	if (buf)
+		munmap((void *)buf, len);
+}
+
+static int show_ref(const char *refname, const unsigned char *sha1,
+		    int flags, void *data)
+{
+	printf("%s %s\n", sha1_to_hex(sha1), refname);
+	return 0;
+}
+
+int main(int argc, char **argv)
+{
+	if (argc != 2)
+		usage("git quick-list <prefix>");
+
+	setup_git_directory();
+	quick_list(argv[1], show_ref, NULL);
+
+	return 0;
+}
-- 
2.4.0.rc0.363.gf9f328b

--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]