On Sun, Apr 05, 2015 at 02:52:59PM -0400, Jeff King wrote: > Right now we parse all of the packed-refs file into an in-memory cache, > and then do single lookups from that cache. Doing an mmap() and a binary > search is way faster (and costs less memory) for doing individual > lookups. It relies on the list being sorted. This is generally true, but > not something we currently rely on (however, it would be easy to add a > "sorted" flag to top of the file and have the readers fall back when the > flag is missing). I've played with a patch to do this (it's not entirely > trivial, because you jump into the middle of a line, and then have to > walk backwards to find the start of the record). > > For traversals, it's more complicated. Obviously if you are traversing > all refs, you have to read the whole thing anyway. If you are traversing > a subset of the refs, you can binary-search the start of the subset, and > then walk forward. But that's where it gets tricky with the current > code. In case you are curious, here is my proof-of-concept for the packed-refs binary search. You'll note that it's a separate program, and not integrated into refs.c. I wrote this last August, and after trying to integrate it into refs.c, I found the ref_cache problems I described, and I haven't touched it since. I also seem to have saved the patch for stuffing it into refs.c, but I am not sure if it even compiles (I wrote only "horrible wip" in the commit message ;) ). -- >8 -- Subject: [PATCH] add git-quick-list This is a proof of concept for binary-searching the packed-refs file in order to traverse an ordered subset of it. Note that it _only_ reads the packed-refs file currently. To really compare to for-each-ref, it would need to also walk the loose ref area for its prefix. On a mostly-packed repository that shouldn't make a big speed difference, though. And of course we don't _really_ want a separate command here at all. This should be part of refs.c, and everyone who calls for_each_ref should benefit from it. Still, the numbers are promising. Here's are comparisons against for-each-ref on torvalds/linux, which has a 218M packed-refs file: $ time git for-each-ref \ --format='%(objectname) %(refname)' \ refs/remotes/2325298/ | wc -c 44139 real 0m1.649s user 0m1.332s sys 0m0.304s $ time ~peff/git-quick-list refs/remotes/2325298/ | wc -c 44139 real 0m0.012s user 0m0.004s sys 0m0.004s --- Makefile | 1 + quick-list.c | 174 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 175 insertions(+) create mode 100644 quick-list.c diff --git a/Makefile b/Makefile index 2457065..aa32598 100644 --- a/Makefile +++ b/Makefile @@ -541,6 +541,7 @@ PROGRAM_OBJS += shell.o PROGRAM_OBJS += show-index.o PROGRAM_OBJS += upload-pack.o PROGRAM_OBJS += remote-testsvn.o +PROGRAM_OBJS += quick-list.o # Binary suffix, set to .exe for Windows builds X = diff --git a/quick-list.c b/quick-list.c new file mode 100644 index 0000000..e423f1f --- /dev/null +++ b/quick-list.c @@ -0,0 +1,174 @@ +#include "cache.h" +#include "refs.h" + +struct packed_refs_iterator { + const char *start; + const char *end; + + const char *cur; + const char *ref; + const char *eol; + const char *next; +}; + +static void iterator_init(struct packed_refs_iterator *pos, + const char *buf, size_t len) +{ + pos->start = buf; + pos->end = buf + len; + + /* skip past header line */ + if (pos->start < pos->end && *pos->start == '#') { + while (pos->start < pos->end && *pos->start != '\n') + pos->start++; + if (pos->start < pos->end) + pos->start++; + } +} + +static int iterator_cmp(const char *key, struct packed_refs_iterator *pos) +{ + const char *ref = pos->ref; + for (; *key && ref < pos->eol; key++, ref++) + if (*key != *ref) + return (unsigned char)*key - (unsigned char)*ref; + return ref == pos->eol ? *key ? 1 : 0 : -1; +} + +static const char *find_eol(const char *p, const char *end) +{ + p = memchr(p, '\n', end - p); + return p ? p : end; +} + +static void parse_line(struct packed_refs_iterator *pos, const char *p) +{ + pos->cur = p; + if (pos->end - p < 41) + die("truncated packed-refs file"); + p += 41; + + pos->ref = p; + pos->eol = p = find_eol(p, pos->end); + + /* skip newline, and then past any peel records */ + if (p < pos->end) + p++; + while (p < pos->end && *p == '^') { + p = find_eol(p, pos->end); + if (p < pos->end) + p++; + } + pos->next = p; +} + +static void iterator_next(struct packed_refs_iterator *pos) +{ + if (pos->next < pos->end) + parse_line(pos, pos->next); + else + pos->cur = NULL; +} + +static void iterator_start(struct packed_refs_iterator *pos, const char *prefix) +{ + const char *lo = pos->start, *hi = pos->end; + + while (lo < hi) { + const char *mi = lo + ((hi - lo) / 2); + int cmp; + + /* + * We landed somewhere on a line. Walk back to find + * the start of the line. + */ + while (mi > lo && *(mi-1) != '\n') + mi--; + + /* + * We may have hit a peel-line. In that case, try + * to walk back to the actual ref line (and skip as + * many peel lines as we find, for future-proofing). + */ + while (*mi == '^') { + if (mi == lo) + die("peel line without a record before it?"); + mi--; + if (mi == lo) + die("peel line with bare newline before it?"); + mi--; + while (mi > lo && *(mi-1) != '\n') + mi--; + } + + /* Now we should be at a real ref line. */ + parse_line(pos, mi); + cmp = iterator_cmp(prefix, pos); + if (!cmp) + return; + else if (cmp < 0) + hi = pos->cur; + else + lo = pos->next; + } + + if (hi < pos->end) + parse_line(pos, hi); + else + pos->cur = NULL; +} + +static void quick_list(const char *prefix, each_ref_fn fn, void *data) +{ + int fd = open(git_path("packed-refs"), O_RDONLY); + struct stat st; + const char *buf = NULL; + size_t len; + struct packed_refs_iterator pos; + + if (fd < 0) + goto out; + if (fstat(fd, &st) < 0) + goto out; + len = xsize_t(st.st_size); + buf = xmmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0); + if (!buf) + goto out; + + iterator_init(&pos, buf, len); + for (iterator_start(&pos, prefix); + pos.cur && starts_with(pos.ref, prefix); + iterator_next(&pos)) { + unsigned char sha1[20]; + char *refname; + + if (get_sha1_hex(pos.cur, sha1) < 0) + die("packed-refs contained invalid sha1"); + refname = xmemdupz(pos.ref, pos.eol - pos.ref); + fn(refname, sha1, 0, data); + free(refname); + } + +out: + close(fd); + if (buf) + munmap((void *)buf, len); +} + +static int show_ref(const char *refname, const unsigned char *sha1, + int flags, void *data) +{ + printf("%s %s\n", sha1_to_hex(sha1), refname); + return 0; +} + +int main(int argc, char **argv) +{ + if (argc != 2) + usage("git quick-list <prefix>"); + + setup_git_directory(); + quick_list(argv[1], show_ref, NULL); + + return 0; +} -- 2.4.0.rc0.363.gf9f328b -- To unsubscribe from this list: send the line "unsubscribe git" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html