[PATCH 1/1] ls-refs.c: minimize number of refs visited

Jacob Vosmaer <jacob@xxxxxxxxxx> · Tue, 19 Jan 2021 15:42:51 +0100

The previous implementation of ls-refs would perform exactly one ref
walk, matching each ref against the prefixes (if any) provided by the
user. This can be expensive if there are a lot of refs and the user
only cares about a small subset of them.

In this patch we analyze the prefixes provided by the user and build a
minimal set of disjoint prefixes that contains all of them. We then do
a ref walk for each of these minimal prefixes.

It is tempting to have just one strvec for the prefixes and use it
both for matching and for iterating. But every time I tried that, it
made things more complicated. I settled on leaving the existing ref
matching (using &data.prefixes) alone, and I added a new layer around
it for the ref walk optimization (using &iter_prefixes).

This commit also fixes a bug in ls-refs.c that was not triggered
before: we were using a strvec set to zero, which is not how you are
supposed to initialize a strvec. We now call strvec_init after zeroing.

Signed-off-by: Jacob Vosmaer <jacob@xxxxxxxxxx>
---
 ls-refs.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 62 insertions(+), 1 deletion(-)

diff --git a/ls-refs.c b/ls-refs.c
index a1e0b473e4..6d5f0c769a 100644
--- a/ls-refs.c
+++ b/ls-refs.c
@@ -84,12 +84,44 @@ static int ls_refs_config(const char *var, const char *value, void *data)
 	return parse_hide_refs_config(var, value, "uploadpack");
 }
 
+static int cmp_prefix(const void *a_, const void *b_){
+	const char *a = *(const char **)a_;
+	const char *b = *(const char **)b_;
+	return strcmp(a, b);
+}
+
+static void deduplicate_prefixes(struct strvec *prefixes) {
+	int i;
+
+	QSORT(prefixes->v, prefixes->nr, cmp_prefix);
+
+	for (i = 1; i < prefixes->nr;) {
+		const char *p = prefixes->v[i];
+
+		/*
+		 * If p is "refs/foobar" and its predecessor is "refs/foo" then we should
+		 * drop p, both to avoid sending duplicate refs to the user, and to avoid
+		 * doing unnecessary work.
+		 */
+		if (starts_with(p, prefixes->v[i - 1])) {
+			MOVE_ARRAY(&prefixes->v[i], &prefixes->v[i + 1], prefixes->nr - (i + 1));
+			prefixes->v[prefixes->nr - 1] = p;
+			strvec_pop(prefixes);
+		} else {
+			i++;
+		}
+	}
+}
+
 int ls_refs(struct repository *r, struct strvec *keys,
 	    struct packet_reader *request)
 {
 	struct ls_refs_data data;
+	struct strvec iter_prefixes = STRVEC_INIT;
+	const char **p;
 
 	memset(&data, 0, sizeof(data));
+	strvec_init(&data.prefixes);
 
 	git_config(ls_refs_config, NULL);
 
@@ -109,8 +141,37 @@ int ls_refs(struct repository *r, struct strvec *keys,
 		die(_("expected flush after ls-refs arguments"));
 
 	head_ref_namespaced(send_ref, &data);
-	for_each_namespaced_ref(send_ref, &data);
+
+	/*
+	 * Try to create a minimal disjoint set of prefixes that
+	 * contains everything the user wants to see, but that lets us
+	 * avoid a full ref walk. If the repo has a lot of refs and
+	 * the user only wants to see refs/heads and refs/tags,
+	 * it is faster to do two small walks of refs/heads and
+	 * refs/tags instead of one big walk of all of refs/.
+	 */
+	for (p = data.prefixes.v; *p; p++) {
+		if (starts_with(*p, "refs/")) {
+			strvec_push(&iter_prefixes, *p);
+		} else { /* full ref walk in pathological cases like "" */
+			strvec_push(&iter_prefixes, "refs/");
+		}
+	}
+
+	if (!iter_prefixes.nr) /* full ref walk when there are no prefixes */
+		strvec_push(&iter_prefixes, "refs/");
+
+	deduplicate_prefixes(&iter_prefixes);
+
+	for (p = iter_prefixes.v; *p; p++) {
+		struct strbuf buf = STRBUF_INIT;
+		strbuf_addf(&buf, "%s%s", get_git_namespace(), *p);
+		for_each_fullref_in(buf.buf, send_ref, &data, 0);
+		strbuf_release(&buf);
+	}
+
 	packet_flush(1);
 	strvec_clear(&data.prefixes);
+	strvec_clear(&iter_prefixes);
 	return 0;
 }
-- 
2.30.0