Re: [PATCH 4/6] introduce a commit metapack

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wed, Jan 30, 2013 at 09:16:29PM +0700, Duy Nguyen wrote:
> Perhaps we could store abbrev sha-1 instead of full sha-1. Nice
> space/time trade-off.

Following the on-disk format experiment yesterday, I changed the
format to:

 - a list a _short_ SHA-1 of cached commits
 - a list of cache entries, each (5 uint32_t) consists of:
   - uint32_t for the index in .idx sha-1 table to get full SHA-1 of
     the commit
   - uint32_t for timestamp
   - uint32_t for tree, 1st and 2nd parents for the index in .idx
     table

The length of SHA-1 is chosen to be able to unambiguously identify any
cached commits. Full SHA-1 check is done after to catch false
positives. For linux-2.6, SHA-1 length is 6 bytes, git and many
moderate-sized projects are 4 bytes. So it's 26 bytes per commit for
linux-2.6.git, or 8MB .commits file. Not as good as revindex approach
(5.5MB) but way better than the current one (27MB). And still good
enough for caching 2 parents unconditionally.

Performance seems to improve a tiny bit, probably because of more
compact search space: 0.6s with my patch vs 0.7s without (vs 3.4s
without cache).

-- 8< --
diff --git a/cache.h b/cache.h
index 1f96f65..8048d5b 100644
--- a/cache.h
+++ b/cache.h
@@ -1069,6 +1069,7 @@ extern struct packed_git *add_packed_git(const char *, int, int);
 extern const unsigned char *nth_packed_object_sha1(struct packed_git *, uint32_t);
 extern off_t nth_packed_object_offset(const struct packed_git *, uint32_t);
 extern off_t find_pack_entry_one(const unsigned char *, struct packed_git *);
+extern int find_pack_entry_pos(const unsigned char *, struct packed_git *);
 extern int is_pack_valid(struct packed_git *);
 extern void *unpack_entry(struct packed_git *, off_t, enum object_type *, unsigned long *);
 extern unsigned long unpack_object_header_buffer(const unsigned char *buf, unsigned long len, enum object_type *type, unsigned long *sizep);
diff --git a/commit-metapack.c b/commit-metapack.c
index 2c19f48..c984b8e 100644
--- a/commit-metapack.c
+++ b/commit-metapack.c
@@ -4,11 +4,21 @@
 #include "commit.h"
 #include "sha1-lookup.h"
 
+struct commit_entry {
+	uint32_t commit;	/* nth_packed_object_sha1 to get own SHA-1 */
+	uint32_t timestamp;
+	uint32_t tree;		/* nth_packed_object_sha1 to get tree SHA-1 */
+	uint32_t parent1; /* nth_packed_object_sha1 to get 1st parent SHA-1 */
+	uint32_t parent2; /* nth_packed_object_sha1 to get 2nd parent SHA-1 */
+};
+
 struct commit_metapack {
 	struct metapack mp;
 	uint32_t nr;
+	uint32_t abbrev_len;
+	struct packed_git *pack;
 	unsigned char *index;
-	unsigned char *data;
+	struct commit_entry *data;
 	struct commit_metapack *next;
 };
 static struct commit_metapack *commit_metapacks;
@@ -41,20 +51,23 @@ static struct commit_metapack *alloc_commit_metapack(struct packed_git *pack)
 	}
 	memcpy(&it->nr, it->mp.data, 4);
 	it->nr = ntohl(it->nr);
+	memcpy(&it->abbrev_len, it->mp.data + 4, 4);
+	it->abbrev_len = ntohl(it->abbrev_len);
+	it->pack = pack;
 
 	/*
-	 * We need 84 bytes for each entry: sha1(20), date(4), tree(20),
-	 * parents(40).
+	 * We need 20+abbrev_len bytes for each entry: abbrev sha-1,
+	 * date(4), tree index(4), parent indexes(8).
 	 */
-	if (it->mp.len < (84 * it->nr + 4)) {
+	if (it->mp.len < ((sizeof(*it->data) + it->abbrev_len) * it->nr + 8)) {
 		warning("commit metapack for '%s' is truncated", pack->pack_name);
 		metapack_close(&it->mp);
 		free(it);
 		return NULL;
 	}
 
-	it->index = it->mp.data + 4;
-	it->data = it->index + 20 * it->nr;
+	it->index = it->mp.data + 8;
+	it->data = (struct commit_entry*)(it->index + it->abbrev_len * it->nr);
 
 	return it;
 }
@@ -83,29 +96,51 @@ static void prepare_commit_metapacks(void)
 
 int commit_metapack(unsigned char *sha1,
 		    uint32_t *timestamp,
-		    unsigned char **tree,
-		    unsigned char **parent1,
-		    unsigned char **parent2)
+		    const unsigned char **tree,
+		    const unsigned char **parent1,
+		    const unsigned char **parent2)
 {
 	struct commit_metapack *p;
 
 	prepare_commit_metapacks();
 	for (p = commit_metapacks; p; p = p->next) {
-		unsigned char *data;
-		int pos = sha1_entry_pos(p->index, 20, 0, 0, p->nr, p->nr, sha1);
+		struct commit_entry *data;
+		uint32_t p1, p2;
+		unsigned lo, hi, mi;
+		int pos;
+
+		/* sha1_entry_pos does not work with abbreviated sha-1 */
+		lo = 0;
+		hi = p->nr;
+		pos = -1;
+		do {
+			unsigned mi = (lo + hi) / 2;
+			int cmp = memcmp(p->index + mi * p->abbrev_len, sha1, p->abbrev_len);
+
+			if (!cmp) {
+				pos = mi;
+				break;
+			}
+			if (cmp > 0)
+				hi = mi;
+			else
+				lo = mi+1;
+		} while (lo < hi);
 		if (pos < 0)
 			continue;
 
-		/* timestamp(4) + tree(20) + parents(40) */
-		data = p->data + 64 * pos;
-		*timestamp = *(uint32_t *)data;
-		*timestamp = ntohl(*timestamp);
-		data += 4;
-		*tree = data;
-		data += 20;
-		*parent1 = data;
-		data += 20;
-		*parent2 = data;
+		data = p->data + pos;
+
+		/* full sha-1 check again */
+		if (hashcmp(nth_packed_object_sha1(p->pack, ntohl(data->commit)), sha1))
+			continue;
+
+		*timestamp = ntohl(data->timestamp);
+		*tree = nth_packed_object_sha1(p->pack, ntohl(data->tree));
+		p1 = ntohl(data->parent1);
+		*parent1 = nth_packed_object_sha1(p->pack, p1);
+		p2 = ntohl(data->parent2);
+		*parent2 = p1 == p2 ? null_sha1 : nth_packed_object_sha1(p->pack, p2);
 
 		return 0;
 	}
@@ -113,13 +148,20 @@ int commit_metapack(unsigned char *sha1,
 	return -1;
 }
 
+struct write_cb {
+	struct commit_list **tail;
+	int abbrev_len;
+	const unsigned char *last_sha1;
+};
+
 static void get_commits(struct metapack_writer *mw,
 			const unsigned char *sha1,
 			void *data)
 {
-	struct commit_list ***tail = data;
+	struct write_cb *write_cb = (struct write_cb *)data;
 	enum object_type type = sha1_object_info(sha1, NULL);
 	struct commit *c;
+	int p1, p2;
 
 	if (type != OBJ_COMMIT)
 		return;
@@ -128,47 +170,95 @@ static void get_commits(struct metapack_writer *mw,
 	if (!c || parse_commit(c))
 		die("unable to read commit %s", sha1_to_hex(sha1));
 
+	if (c->buffer) {
+		free(c->buffer);
+		c->buffer = NULL;
+	}
+
 	/*
 	 * Our fixed-size parent list cannot represent root commits, nor
 	 * octopus merges. Just skip those commits, as we can fallback
 	 * in those rare cases to reading the actual commit object.
 	 */
 	if (!c->parents ||
-	    (c->parents && c->parents->next && c->parents->next->next))
+	    (c->parents && c->parents->next && c->parents->next->next) ||
+	    /* edge commits are out too */
+	    find_pack_entry_pos(c->tree->object.sha1, mw->pack) == -1 ||
+	    (p1 = find_pack_entry_pos(c->parents->item->object.sha1, mw->pack)) == -1 ||
+	    (c->parents->next &&
+	     (p2 = find_pack_entry_pos(c->parents->next->item->object.sha1, mw->pack)) == -1) ||
+	    /*
+	     * we set the 2nd parent the same as 1st parent as an
+	     * indication that 2nd parent does not exist. Normal
+	     * commits should never have two same parents, but just in
+	     * case..
+	     */
+	    p1 == p2)
 		return;
 
-	*tail = &commit_list_insert(c, *tail)->next;
+	/*
+	 * Make sure we store the abbr sha-1 long enough to
+	 * unambiguously identify any cached commits in the pack.
+	 */
+	while (write_cb->abbrev_len < 20 &&
+	       write_cb->last_sha1 &&
+	       !memcmp(write_cb->last_sha1, sha1, write_cb->abbrev_len))
+		write_cb->abbrev_len++;
+	/*
+	 * A bit sensitive to metapack_writer_foreach. "sha1" must not
+	 * be changed even after this function exits.
+	 */
+	write_cb->last_sha1 = sha1;
+
+	write_cb->tail = &commit_list_insert(c, write_cb->tail)->next;
 }
 
 void commit_metapack_write(const char *idx)
 {
 	struct metapack_writer mw;
 	struct commit_list *commits = NULL, *p;
-	struct commit_list **tail = &commits;
+	struct write_cb write_cb;
 	uint32_t nr = 0;
 
 	metapack_writer_init(&mw, idx, "commits", 1);
 
+	write_cb.tail = &commits;
+	write_cb.abbrev_len = 1;
+	write_cb.last_sha1 = NULL;
+
 	/* Figure out how many eligible commits we've got in this pack. */
-	metapack_writer_foreach(&mw, get_commits, &tail);
+	metapack_writer_foreach(&mw, get_commits, &write_cb);
 	for (p = commits; p; p = p->next)
 		nr++;
+
 	metapack_writer_add_uint32(&mw, nr);
+	metapack_writer_add_uint32(&mw, write_cb.abbrev_len);
 
 	/* Then write an index of commit sha1s */
 	for (p = commits; p; p = p->next)
-		metapack_writer_add(&mw, p->item->object.sha1, 20);
+		metapack_writer_add(&mw, p->item->object.sha1, write_cb.abbrev_len);
 
 	/* Followed by the actual date/tree/parents data */
 	for (p = commits; p; p = p->next) {
 		struct commit *c = p->item;
+		int pos;
+
+		pos = find_pack_entry_pos(c->object.sha1, mw.pack);
+		metapack_writer_add_uint32(&mw, pos);
+
 		metapack_writer_add_uint32(&mw, c->date);
-		metapack_writer_add(&mw, c->tree->object.sha1, 20);
-		metapack_writer_add(&mw, c->parents->item->object.sha1, 20);
-		metapack_writer_add(&mw,
-				    c->parents->next ?
-				    c->parents->next->item->object.sha1 :
-				    null_sha1, 20);
+
+		pos = find_pack_entry_pos(c->tree->object.sha1, mw.pack);
+		metapack_writer_add_uint32(&mw, pos);
+
+		pos = find_pack_entry_pos(c->parents->item->object.sha1, mw.pack);
+		metapack_writer_add_uint32(&mw, pos);
+
+		if (c->parents->next) {
+			struct object *o = &c->parents->next->item->object;
+			pos = find_pack_entry_pos(o->sha1, mw.pack);
+		}
+		metapack_writer_add_uint32(&mw, pos);
 	}
 
 	metapack_writer_finish(&mw);
diff --git a/commit-metapack.h b/commit-metapack.h
index 4684573..caf85be 100644
--- a/commit-metapack.h
+++ b/commit-metapack.h
@@ -3,9 +3,9 @@
 
 int commit_metapack(unsigned char *sha1,
 		    uint32_t *timestamp,
-		    unsigned char **tree,
-		    unsigned char **parent1,
-		    unsigned char **parent2);
+		    const unsigned char **tree,
+		    const unsigned char **parent1,
+		    const unsigned char **parent2);
 
 void commit_metapack_write(const char *idx_file);
 
diff --git a/sha1_file.c b/sha1_file.c
index 40b2329..1acab8c 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -1978,8 +1978,8 @@ off_t nth_packed_object_offset(const struct packed_git *p, uint32_t n)
 	}
 }
 
-off_t find_pack_entry_one(const unsigned char *sha1,
-				  struct packed_git *p)
+int find_pack_entry_pos(const unsigned char *sha1,
+			struct packed_git *p)
 {
 	const uint32_t *level1_ofs = p->index_data;
 	const unsigned char *index = p->index_data;
@@ -1992,7 +1992,7 @@ off_t find_pack_entry_one(const unsigned char *sha1,
 
 	if (!index) {
 		if (open_pack_index(p))
-			return 0;
+			return -1;
 		level1_ofs = p->index_data;
 		index = p->index_data;
 	}
@@ -2019,9 +2019,7 @@ off_t find_pack_entry_one(const unsigned char *sha1,
 	if (use_lookup) {
 		int pos = sha1_entry_pos(index, stride, 0,
 					 lo, hi, p->num_objects, sha1);
-		if (pos < 0)
-			return 0;
-		return nth_packed_object_offset(p, pos);
+		return pos;
 	}
 
 	do {
@@ -2032,15 +2030,24 @@ off_t find_pack_entry_one(const unsigned char *sha1,
 			printf("lo %u hi %u rg %u mi %u\n",
 			       lo, hi, hi - lo, mi);
 		if (!cmp)
-			return nth_packed_object_offset(p, mi);
+			return mi;
 		if (cmp > 0)
 			hi = mi;
 		else
 			lo = mi+1;
 	} while (lo < hi);
-	return 0;
+	return -1;
 }
 
+off_t find_pack_entry_one(const unsigned char *sha1,
+				  struct packed_git *p)
+{
+	int pos = find_pack_entry_pos(sha1, p);
+	if (pos < 0)
+		return 0;
+	else
+		return nth_packed_object_offset(p, pos);
+}
 int is_pack_valid(struct packed_git *p)
 {
 	/* An already open pack is known to be valid. */
-- 8< --
--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]