Re: [PATCH 4/6] introduce a commit metapack

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, Jan 29, 2013 at 04:16:11AM -0500, Jeff King wrote:
> When we are doing a commit traversal that does not need to
> look at the commit messages themselves (e.g., rev-list,
> merge-base, etc), we spend a lot of time accessing,
> decompressing, and parsing the commit objects just to find
> the parent and timestamp information. We can make a
> space-time tradeoff by caching that information on disk in a
> compact, uncompressed format.

And this is a (messy) patch on top that avoids storing SHA-1
directly. On my linux-2.6.git (575 MB pack, 73 MB index), .commits
file is 5.2 MB and 27 MB with and without my patch respectively. Nice
shrinkage.

However, performance seems to suffer too. Maybe I do more lookups than
necessary, I don't know. I should probably measure the cost of
revindex separately.

git rev-list --all --quiet on vanilla git:

real    0m3.645s
user    0m3.556s
sys     0m0.080s

commit cache without my patch:

real    0m0.723s
user    0m0.677s
sys     0m0.045s

and with my patch:

real    0m1.338s
user    0m1.259s
sys     0m0.075s


Another point, but not really important at this stage, I think we have
memory leak somewhere (lookup_commit??). It used up to 800 MB RES on
linux-2.6.git while generating the cache.

-- 8< --
diff --git a/cache.h b/cache.h
index 1f96f65..8048d5b 100644
--- a/cache.h
+++ b/cache.h
@@ -1069,6 +1069,7 @@ extern struct packed_git *add_packed_git(const char *, int, int);
 extern const unsigned char *nth_packed_object_sha1(struct packed_git *, uint32_t);
 extern off_t nth_packed_object_offset(const struct packed_git *, uint32_t);
 extern off_t find_pack_entry_one(const unsigned char *, struct packed_git *);
+extern int find_pack_entry_pos(const unsigned char *, struct packed_git *);
 extern int is_pack_valid(struct packed_git *);
 extern void *unpack_entry(struct packed_git *, off_t, enum object_type *, unsigned long *);
 extern unsigned long unpack_object_header_buffer(const unsigned char *buf, unsigned long len, enum object_type *type, unsigned long *sizep);
diff --git a/commit-metapack.c b/commit-metapack.c
index 2c19f48..55f7ea9 100644
--- a/commit-metapack.c
+++ b/commit-metapack.c
@@ -3,11 +3,12 @@
 #include "metapack.h"
 #include "commit.h"
 #include "sha1-lookup.h"
+#include "pack-revindex.h"
 
 struct commit_metapack {
 	struct metapack mp;
-	uint32_t nr;
-	unsigned char *index;
+	struct packed_git *pack;
+	uint32_t first, last;
 	unsigned char *data;
 	struct commit_metapack *next;
 };
@@ -16,7 +17,7 @@ static struct commit_metapack *commit_metapacks;
 static struct commit_metapack *alloc_commit_metapack(struct packed_git *pack)
 {
 	struct commit_metapack *it = xcalloc(1, sizeof(*it));
-	uint32_t version;
+	uint32_t version, nr;
 
 	if (metapack_init(&it->mp, pack, "commits", &version) < 0) {
 		free(it);
@@ -39,22 +40,25 @@ static struct commit_metapack *alloc_commit_metapack(struct packed_git *pack)
 		free(it);
 		return NULL;
 	}
-	memcpy(&it->nr, it->mp.data, 4);
-	it->nr = ntohl(it->nr);
+	memcpy(&it->first, it->mp.data, 4);
+	it->first = ntohl(it->first);
+	memcpy(&it->last, it->mp.data + 4, 4);
+	it->last = ntohl(it->last);
+	nr = it->last - it->first + 1;
+	it->pack = pack;
 
 	/*
-	 * We need 84 bytes for each entry: sha1(20), date(4), tree(20),
-	 * parents(40).
+	 * We need 16 bytes for each entry: date(4), tree index(4),
+	 * parent indexes(8).
 	 */
-	if (it->mp.len < (84 * it->nr + 4)) {
+	if (it->mp.len < (16 * nr + 8)) {
 		warning("commit metapack for '%s' is truncated", pack->pack_name);
 		metapack_close(&it->mp);
 		free(it);
 		return NULL;
 	}
 
-	it->index = it->mp.data + 4;
-	it->data = it->index + 20 * it->nr;
+	it->data = it->mp.data + 8;
 
 	return it;
 }
@@ -81,31 +85,61 @@ static void prepare_commit_metapacks(void)
 	initialized = 1;
 }
 
+static const unsigned char *idx_to_sha1(struct packed_git *p,
+					uint32_t nth)
+{
+	struct revindex_entry *revindex = get_revindex(p);
+	if (!revindex)
+		return NULL;
+	return nth_packed_object_sha1(p, revindex[nth].nr);
+}
+
 int commit_metapack(unsigned char *sha1,
 		    uint32_t *timestamp,
-		    unsigned char **tree,
-		    unsigned char **parent1,
-		    unsigned char **parent2)
+		    const unsigned char **tree,
+		    const unsigned char **parent1,
+		    const unsigned char **parent2)
 {
 	struct commit_metapack *p;
 
 	prepare_commit_metapacks();
 	for (p = commit_metapacks; p; p = p->next) {
 		unsigned char *data;
-		int pos = sha1_entry_pos(p->index, 20, 0, 0, p->nr, p->nr, sha1);
-		if (pos < 0)
+		uint32_t p1, p2;
+		struct revindex_entry *re, *base;
+		off_t off;
+		uint32_t pos;
+
+		base = get_revindex(p->pack);
+		off = find_pack_entry_one(sha1, p->pack);
+		if (!off)
+			continue;
+		re = find_pack_revindex(p->pack, off);
+		if (!re)
+			continue;
+		pos = re - base;
+		if (pos < p->first || pos > p->last)
 			continue;
 
 		/* timestamp(4) + tree(20) + parents(40) */
-		data = p->data + 64 * pos;
+		data = p->data + 16 * (pos - p->first);
 		*timestamp = *(uint32_t *)data;
 		*timestamp = ntohl(*timestamp);
+		if (!*timestamp)
+			return -1;
 		data += 4;
-		*tree = data;
-		data += 20;
-		*parent1 = data;
-		data += 20;
-		*parent2 = data;
+		*tree = idx_to_sha1(p->pack, ntohl(*(uint32_t*)data));
+		data += 4;
+		p1 = ntohl(*(uint32_t*)data);
+		*parent1 = idx_to_sha1(p->pack, p1);
+		data += 4;
+		p2 = ntohl(*(uint32_t*)data);
+		if (p1 == p2)
+			*parent2 = null_sha1;
+		else
+			*parent2 = idx_to_sha1(p->pack, p2);
+		if (!*tree || !*parent1 || !*parent2)
+			return -1;
 
 		return 0;
 	}
@@ -113,63 +147,114 @@ int commit_metapack(unsigned char *sha1,
 	return -1;
 }
 
-static void get_commits(struct metapack_writer *mw,
-			const unsigned char *sha1,
-			void *data)
+static int get_commits(struct metapack_writer *mw,
+		       uint32_t nth,
+		       int dry_run)
 {
-	struct commit_list ***tail = data;
+	const unsigned char *sha1 = nth_packed_object_sha1(mw->pack, nth);
 	enum object_type type = sha1_object_info(sha1, NULL);
 	struct commit *c;
+	struct revindex_entry *revindex, *ridx;
+	int pt, p1, p2 = -1;
 
-	if (type != OBJ_COMMIT)
-		return;
+	if (type != OBJ_COMMIT) {
+		if (dry_run)
+			return -1;
+		metapack_writer_add_uint32(mw, 0); /* date */
+		metapack_writer_add_uint32(mw, 0); /* tree */
+		metapack_writer_add_uint32(mw, 0); /* 1st parent */
+		metapack_writer_add_uint32(mw, 0); /* 2nd tree */
+		return 0;
+	}
 
 	c = lookup_commit(sha1);
 	if (!c || parse_commit(c))
 		die("unable to read commit %s", sha1_to_hex(sha1));
 
+	if (c->buffer) {
+		free(c->buffer);
+		c->buffer = NULL;
+	}
+
 	/*
 	 * Our fixed-size parent list cannot represent root commits, nor
 	 * octopus merges. Just skip those commits, as we can fallback
 	 * in those rare cases to reading the actual commit object.
 	 */
 	if (!c->parents ||
-	    (c->parents && c->parents->next && c->parents->next->next))
-		return;
+	    (c->parents && c->parents->next && c->parents->next->next) ||
+	    /* edge commits are out too */
+	    (pt = find_pack_entry_pos(c->tree->object.sha1, mw->pack)) == -1 ||
+	    (p1 = find_pack_entry_pos(c->parents->item->object.sha1, mw->pack)) == -1 ||
+	    (c->parents->next &&
+	     (p2 = find_pack_entry_pos(c->parents->next->item->object.sha1, mw->pack)) == -1) ||
+	    /*
+	     * we set the 2nd parent the same as 1st parent as an
+	     * indication that 2nd parent does not exist. Normal
+	     * commits should never have two same parents, but just in
+	     * case..
+	     */
+	    p1 == p2 ||
+	    /* zero date is reserved to say this is not a valid entry */
+	    c->date == 0) {
+		if (dry_run)
+			return -1;
+		metapack_writer_add_uint32(mw, 0); /* date */
+		metapack_writer_add_uint32(mw, 0); /* tree */
+		metapack_writer_add_uint32(mw, 0); /* 1st parent */
+		metapack_writer_add_uint32(mw, 0); /* 2nd tree */
+		return 0;
+	}
+
+	if (dry_run)
+		return 0;
+
+	revindex = get_revindex(mw->pack);
 
-	*tail = &commit_list_insert(c, *tail)->next;
+	metapack_writer_add_uint32(mw, c->date);
+	ridx = find_pack_revindex(mw->pack,
+				  nth_packed_object_offset(mw->pack, pt));
+	metapack_writer_add_uint32(mw, ridx - revindex);
+	ridx = find_pack_revindex(mw->pack,
+				  nth_packed_object_offset(mw->pack, p1));
+	metapack_writer_add_uint32(mw, ridx - revindex);
+	if (p2 != -1)
+		ridx = find_pack_revindex(mw->pack,
+					  nth_packed_object_offset(mw->pack, p2));
+	metapack_writer_add_uint32(mw, ridx - revindex);
+	return 0;
 }
 
 void commit_metapack_write(const char *idx)
 {
 	struct metapack_writer mw;
-	struct commit_list *commits = NULL, *p;
-	struct commit_list **tail = &commits;
-	uint32_t nr = 0;
+	uint32_t i, first = 0xffffffff, last = 0;
+	struct revindex_entry *revidx;
 
 	metapack_writer_init(&mw, idx, "commits", 1);
 
-	/* Figure out how many eligible commits we've got in this pack. */
-	metapack_writer_foreach(&mw, get_commits, &tail);
-	for (p = commits; p; p = p->next)
-		nr++;
-	metapack_writer_add_uint32(&mw, nr);
+	packed_git = mw.pack;
+	revidx = get_revindex(mw.pack);
 
-	/* Then write an index of commit sha1s */
-	for (p = commits; p; p = p->next)
-		metapack_writer_add(&mw, p->item->object.sha1, 20);
+	/*
+	 * Figure out how many eligible commits we've got in this pack.
+	 */
+	for (i = 0; i < mw.pack->num_objects; i++) {
+		int ret = get_commits(&mw, revidx[i].nr, 1);
+		if (ret == -1)	/* not cached */
+			continue;
+		if (i < first)
+			first = i;
+		if (i > last)
+			last = i;
+	}
+
+	metapack_writer_add_uint32(&mw, first);
+	metapack_writer_add_uint32(&mw, last);
 
 	/* Followed by the actual date/tree/parents data */
-	for (p = commits; p; p = p->next) {
-		struct commit *c = p->item;
-		metapack_writer_add_uint32(&mw, c->date);
-		metapack_writer_add(&mw, c->tree->object.sha1, 20);
-		metapack_writer_add(&mw, c->parents->item->object.sha1, 20);
-		metapack_writer_add(&mw,
-				    c->parents->next ?
-				    c->parents->next->item->object.sha1 :
-				    null_sha1, 20);
-	}
+	for (i = first; i <= last; i++)
+		get_commits(&mw, revidx[i].nr, 0);
 
 	metapack_writer_finish(&mw);
 }
diff --git a/commit-metapack.h b/commit-metapack.h
index 4684573..caf85be 100644
--- a/commit-metapack.h
+++ b/commit-metapack.h
@@ -3,9 +3,9 @@
 
 int commit_metapack(unsigned char *sha1,
 		    uint32_t *timestamp,
-		    unsigned char **tree,
-		    unsigned char **parent1,
-		    unsigned char **parent2);
+		    const unsigned char **tree,
+		    const unsigned char **parent1,
+		    const unsigned char **parent2);
 
 void commit_metapack_write(const char *idx_file);
 
diff --git a/pack-revindex.c b/pack-revindex.c
index 77a0465..d58dd02 100644
--- a/pack-revindex.c
+++ b/pack-revindex.c
@@ -111,12 +111,10 @@ static void create_pack_revindex(struct pack_revindex *rix)
 	qsort(rix->revindex, num_ent, sizeof(*rix->revindex), cmp_offset);
 }
 
-struct revindex_entry *find_pack_revindex(struct packed_git *p, off_t ofs)
+struct revindex_entry *get_revindex(struct packed_git *p)
 {
 	int num;
-	int lo, hi;
 	struct pack_revindex *rix;
-	struct revindex_entry *revindex;
 
 	if (!pack_revindex_hashsz)
 		init_pack_revindex();
@@ -127,7 +125,13 @@ struct revindex_entry *find_pack_revindex(struct packed_git *p, off_t ofs)
 	rix = &pack_revindex[num];
 	if (!rix->revindex)
 		create_pack_revindex(rix);
-	revindex = rix->revindex;
+	return rix->revindex;
+}
+
+struct revindex_entry *find_pack_revindex(struct packed_git *p, off_t ofs)
+{
+	int lo, hi;
+	struct revindex_entry *revindex = get_revindex(p);
 
 	lo = 0;
 	hi = p->num_objects + 1;
diff --git a/pack-revindex.h b/pack-revindex.h
index 8d5027a..cea85db 100644
--- a/pack-revindex.h
+++ b/pack-revindex.h
@@ -6,6 +6,7 @@ struct revindex_entry {
 	unsigned int nr;
 };
 
+struct revindex_entry *get_revindex(struct packed_git *p);
 struct revindex_entry *find_pack_revindex(struct packed_git *p, off_t ofs);
 void discard_revindex(void);
 
diff --git a/sha1_file.c b/sha1_file.c
index 40b2329..1acab8c 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -1978,8 +1978,8 @@ off_t nth_packed_object_offset(const struct packed_git *p, uint32_t n)
 	}
 }
 
-off_t find_pack_entry_one(const unsigned char *sha1,
-				  struct packed_git *p)
+int find_pack_entry_pos(const unsigned char *sha1,
+			struct packed_git *p)
 {
 	const uint32_t *level1_ofs = p->index_data;
 	const unsigned char *index = p->index_data;
@@ -1992,7 +1992,7 @@ off_t find_pack_entry_one(const unsigned char *sha1,
 
 	if (!index) {
 		if (open_pack_index(p))
-			return 0;
+			return -1;
 		level1_ofs = p->index_data;
 		index = p->index_data;
 	}
@@ -2019,9 +2019,7 @@ off_t find_pack_entry_one(const unsigned char *sha1,
 	if (use_lookup) {
 		int pos = sha1_entry_pos(index, stride, 0,
 					 lo, hi, p->num_objects, sha1);
-		if (pos < 0)
-			return 0;
-		return nth_packed_object_offset(p, pos);
+		return pos;
 	}
 
 	do {
@@ -2032,15 +2030,24 @@ off_t find_pack_entry_one(const unsigned char *sha1,
 			printf("lo %u hi %u rg %u mi %u\n",
 			       lo, hi, hi - lo, mi);
 		if (!cmp)
-			return nth_packed_object_offset(p, mi);
+			return mi;
 		if (cmp > 0)
 			hi = mi;
 		else
 			lo = mi+1;
 	} while (lo < hi);
-	return 0;
+	return -1;
 }
 
+off_t find_pack_entry_one(const unsigned char *sha1,
+				  struct packed_git *p)
+{
+	int pos = find_pack_entry_pos(sha1, p);
+	if (pos < 0)
+		return 0;
+	else
+		return nth_packed_object_offset(p, pos);
+}
 int is_pack_valid(struct packed_git *p)
 {
 	/* An already open pack is known to be valid. */
-- 
1.8.1.1.459.g5970e58

-- 8< --
--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]