[PATCH 2/3] Add commit cache to help speed up commit traversal

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This patch extracts tree, parent sha-1 and committer date of packed
commits that have exactly one parent out. So that revision traversal
code can avoid inflating commit for these sha-1. The assumption is
commits with one parent dominate.

Two new files are created per pack: one file contains sha-1 and dates.
The other file contains commit sha-1 mapping to the offset in the former
file.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@xxxxxxxxx>
---
 Makefile             |    3 +
 builtin/index-pack.c |  113 ++++++++++++++++++++++++++++++++++-
 cache.h              |    9 +++
 pack-write.c         |   11 +++-
 pack.h               |    1 +
 sha1_cache.c         |  161 ++++++++++++++++++++++++++++++++++++++++++++++++++
 sha1_cache.h         |    6 ++
 sha1_file.c          |   12 ++++-
 test-sha1-cache.c    |   19 ++++++
 9 files changed, 331 insertions(+), 4 deletions(-)
 create mode 100644 sha1_cache.c
 create mode 100644 sha1_cache.h
 create mode 100644 test-sha1-cache.c

diff --git a/Makefile b/Makefile
index 87fb30a..fced758 100644
--- a/Makefile
+++ b/Makefile
@@ -481,6 +481,7 @@ TEST_PROGRAMS_NEED_X += test-sha1
 TEST_PROGRAMS_NEED_X += test-sigchain
 TEST_PROGRAMS_NEED_X += test-subprocess
 TEST_PROGRAMS_NEED_X += test-svn-fe
+TEST_PROGRAMS_NEED_X += test-sha1-cache
 
 TEST_PROGRAMS = $(patsubst %,%$X,$(TEST_PROGRAMS_NEED_X))
 
@@ -606,6 +607,7 @@ LIB_H += run-command.h
 LIB_H += sequencer.h
 LIB_H += sha1-array.h
 LIB_H += sha1-lookup.h
+LIB_H += sha1_cache.h
 LIB_H += sideband.h
 LIB_H += sigchain.h
 LIB_H += strbuf.h
@@ -722,6 +724,7 @@ LIB_OBJS += setup.o
 LIB_OBJS += sequencer.o
 LIB_OBJS += sha1-array.o
 LIB_OBJS += sha1-lookup.o
+LIB_OBJS += sha1_cache.o
 LIB_OBJS += sha1_file.o
 LIB_OBJS += sha1_name.o
 LIB_OBJS += shallow.o
diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index dd1c5c9..67673ef 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -15,6 +15,7 @@ static const char index_pack_usage[] =
 
 struct object_entry {
 	struct pack_idx_entry idx;
+	off_t commit_offset;
 	unsigned long size;
 	unsigned int hdr_size;
 	enum object_type type;
@@ -63,6 +64,7 @@ static int nr_resolved_deltas;
 static int from_stdin;
 static int strict;
 static int verbose;
+static int write_cache = 1;
 
 static struct progress *progress;
 
@@ -75,6 +77,13 @@ static git_SHA_CTX input_ctx;
 static uint32_t input_crc32;
 static int input_fd, output_fd, pack_fd;
 
+static int cache_fd;
+static int cache_written;
+static uint32_t cache_nr;
+static char cache_filename[PATH_MAX];
+static const char *cache_idxname;
+static unsigned char cache_sha1[20];
+
 static int mark_link(struct object *obj, int type, void *data)
 {
 	if (!obj)
@@ -682,6 +691,58 @@ static int compare_delta_entry(const void *a, const void *b)
 				   objects[delta_b->obj_no].type);
 }
 
+static unsigned long parse_commit_date(const char *buf, const char *tail)
+{
+	const char *dateptr;
+
+	if (buf + 6 >= tail)
+		return 0;
+	if (memcmp(buf, "author", 6))
+		return 0;
+	while (buf < tail && *buf++ != '\n')
+		/* nada */;
+	if (buf + 9 >= tail)
+		return 0;
+	if (memcmp(buf, "committer", 9))
+		return 0;
+	while (buf < tail && *buf++ != '>')
+		/* nada */;
+	if (buf >= tail)
+		return 0;
+	dateptr = buf;
+	while (buf < tail && *buf++ != '\n')
+		/* nada */;
+	if (buf >= tail)
+		return 0;
+	/* dateptr < buf && buf[-1] == '\n', so strtoul will stop at buf-1 */
+	return strtoul(dateptr, NULL, 10);
+}
+
+static void write_commit_sha1(struct object_entry *obj, const char *data)
+{
+	unsigned char sha1[20];
+	if (!obj->commit_offset &&
+	    !memcmp(data, "tree ", 5) &&
+	    !memcmp(data + 45, "\nparent ", 8) &&
+	    memcmp(data + 45 + 48, "\nparent ", 8)) {
+		uint32_t date;
+		obj->commit_offset = cache_written;
+		cache_nr++;
+
+		get_sha1_hex(data + 5, sha1);
+		cache_written += xwrite(cache_fd, sha1, 20);
+		get_sha1_hex(data + 5 + 48, sha1);
+		cache_written += xwrite(cache_fd, sha1, 20);
+		date = parse_commit_date(data + 5 + 48 + 41, data + obj->size);
+		date = htonl(date);
+		cache_written += xwrite(cache_fd, &date, 4);
+	}
+#if 0
+	hashclr(sha1);
+	cache_written += xwrite(cache_fd, sha1, 20);
+#endif
+}
+
 /* Parse all objects and return the pack content SHA1 hash */
 static void parse_pack_objects(unsigned char *sha1)
 {
@@ -707,8 +768,13 @@ static void parse_pack_objects(unsigned char *sha1)
 			nr_deltas++;
 			delta->obj_no = i;
 			delta++;
-		} else
+		} else {
 			sha1_object(data, obj->size, obj->type, obj->idx.sha1);
+
+			/* We assume that commits are never deltified */
+			if (write_cache && obj->real_type == OBJ_COMMIT)
+				write_commit_sha1(obj, data);
+		}
 		free(data);
 		display_progress(progress, i+1);
 	}
@@ -919,6 +985,18 @@ static void final(const char *final_pack_name, const char *curr_pack_name,
 	} else if (from_stdin)
 		chmod(final_pack_name, 0444);
 
+	if (write_cache) {
+		snprintf(name, sizeof(name), "%s/pack/pack-%s.sha1",
+			 get_object_directory(), sha1_to_hex(sha1));
+		if (move_temp_to_file(cache_filename, name))
+			die("cannot store commit cache file");
+
+		snprintf(name, sizeof(name), "%s/pack/pack-%s.sidx",
+			 get_object_directory(), sha1_to_hex(sha1));
+		if (move_temp_to_file(cache_idxname, name))
+			die("cannot store commit cache file");
+	}
+
 	if (final_index_name != curr_index_name) {
 		if (!final_index_name) {
 			snprintf(name, sizeof(name), "%s/pack/pack-%s.idx",
@@ -1120,6 +1198,8 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix)
 				keep_msg = "";
 			} else if (!prefixcmp(arg, "--keep=")) {
 				keep_msg = arg + 7;
+			} else if (!strcmp(arg, "--no-cache")) {
+				write_cache = 0;
 			} else if (!prefixcmp(arg, "--pack_header=")) {
 				struct pack_header *hdr;
 				char *c;
@@ -1191,6 +1271,17 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix)
 	if (strict)
 		opts.flags |= WRITE_IDX_STRICT;
 
+	if (verify)
+		write_cache = 0;
+	if (write_cache) {
+		unsigned char sha1[20];
+		cache_fd = odb_mkstemp(cache_filename,
+				       sizeof(cache_filename),
+				       "pack/tmp_sha1_XXXXXX");
+		hashclr(sha1);
+		cache_written = xwrite(cache_fd, sha1, 20);
+	}
+
 	curr_pack = open_pack_file(pack_name);
 	parse_pack_header();
 	objects = xcalloc(nr_objects + 1, sizeof(struct object_entry));
@@ -1243,6 +1334,26 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix)
 	curr_index = write_idx_file(index_name, idx_objects, nr_objects, &opts, pack_sha1);
 	free(idx_objects);
 
+	if (write_cache) {
+		int nr;
+		struct pack_idx_entry **idx;
+
+		close(cache_fd);
+
+		idx = idx_objects = xmalloc((cache_nr) * sizeof(struct pack_idx_entry *));
+		for (nr = i = 0; i < nr_objects; i++) {
+			if (objects[i].commit_offset) {
+				objects[i].idx.offset = objects[i].commit_offset;
+				*idx++ = &objects[i].idx;
+				nr++;
+			}
+		}
+		assert(nr == cache_nr);
+		opts.flags |= WRITE_IDX_SHA1_CACHE;
+		cache_idxname = write_idx_file(NULL, idx_objects, cache_nr, &opts, cache_sha1);
+		free(idx_objects);
+	}
+
 	if (!verify)
 		final(pack_name, curr_pack,
 		      index_name, curr_index,
diff --git a/cache.h b/cache.h
index 9bd8c2d..fd3953f 100644
--- a/cache.h
+++ b/cache.h
@@ -972,6 +972,14 @@ struct pack_window {
 	unsigned int inuse_cnt;
 };
 
+struct sha1_cache {
+	const void *idx;
+	const void *data;
+	size_t idx_size;
+	size_t data_size;
+	uint32_t nr;
+};
+
 extern struct packed_git {
 	struct packed_git *next;
 	struct pack_window *windows;
@@ -984,6 +992,7 @@ extern struct packed_git {
 	int index_version;
 	time_t mtime;
 	int pack_fd;
+	struct sha1_cache commit_cache;
 	unsigned pack_local:1,
 		 pack_keep:1,
 		 do_not_close:1;
diff --git a/pack-write.c b/pack-write.c
index ca9e63b..6da977a 100644
--- a/pack-write.c
+++ b/pack-write.c
@@ -85,8 +85,11 @@ const char *write_idx_file(const char *index_name, struct pack_idx_entry **objec
 		f = sha1fd(fd, index_name);
 	}
 
-	/* if last object's offset is >= 2^31 we should use index V2 */
-	index_version = need_large_offset(last_obj_offset, opts) ? 2 : opts->version;
+	if (opts->flags & WRITE_IDX_SHA1_CACHE)
+		index_version = 2;
+	else
+		/* if last object's offset is >= 2^31 we should use index V2 */
+		index_version = need_large_offset(last_obj_offset, opts) ? 2 : opts->version;
 
 	/* index versions 2 and above need a header */
 	if (index_version >= 2) {
@@ -138,6 +141,9 @@ const char *write_idx_file(const char *index_name, struct pack_idx_entry **objec
 	if (index_version >= 2) {
 		unsigned int nr_large_offset = 0;
 
+		if (opts->flags & WRITE_IDX_SHA1_CACHE)
+			goto skip_crc32;
+
 		/* write the crc32 table */
 		list = sorted_by_sha;
 		for (i = 0; i < nr_objects; i++) {
@@ -146,6 +152,7 @@ const char *write_idx_file(const char *index_name, struct pack_idx_entry **objec
 			sha1write(f, &crc32_val, 4);
 		}
 
+skip_crc32:
 		/* write the 32-bit offset table */
 		list = sorted_by_sha;
 		for (i = 0; i < nr_objects; i++) {
diff --git a/pack.h b/pack.h
index aa6ee7d..f002b6a 100644
--- a/pack.h
+++ b/pack.h
@@ -40,6 +40,7 @@ struct pack_idx_option {
 	/* flag bits */
 #define WRITE_IDX_VERIFY 01 /* verify only, do not write the idx file */
 #define WRITE_IDX_STRICT 02
+#define WRITE_IDX_SHA1_CACHE 04
 
 	uint32_t version;
 	uint32_t off32_limit;
diff --git a/sha1_cache.c b/sha1_cache.c
new file mode 100644
index 0000000..3f244bb
--- /dev/null
+++ b/sha1_cache.c
@@ -0,0 +1,161 @@
+#include "cache.h"
+#include "pack.h"
+#include "sha1_cache.h"
+
+static int git_open_noatime(const char *name)
+{
+	static int sha1_file_open_flag = O_NOATIME;
+
+	for (;;) {
+		int fd = open(name, O_RDONLY | sha1_file_open_flag);
+		if (fd >= 0)
+			return fd;
+
+		/* Might the failure be due to O_NOATIME? */
+		if (errno != ENOENT && sha1_file_open_flag) {
+			sha1_file_open_flag = 0;
+			continue;
+		}
+
+		return -1;
+	}
+}
+
+/*
+ * Commit cache format is basically pack index v2 except that:
+ * - no crc32 table
+ * - no large offset support
+ * - offset table contains offset in _this_ file, in the commit
+ *   cache section
+ * - the commit cache section follows offset table, each entry
+ *   consists of tree sha1, parent sha1 if any, and terminated
+ *   by null sha-1.
+ */
+
+int open_sha1_cache(struct sha1_cache *cache,
+		    const char *data_path, const char *idx_path)
+{
+	void *idx_map, *data;
+	struct pack_idx_header *hdr;
+	size_t idx_size, data_size;
+	uint32_t nr, i, *index;
+	int fd = git_open_noatime(idx_path);
+	struct stat st;
+
+	if (fd < 0)
+		return -1;
+	if (fstat(fd, &st)) {
+		close(fd);
+		return -1;
+	}
+	idx_size = xsize_t(st.st_size);
+	if (idx_size < 4 * 256 + 20 + 20) {
+		close(fd);
+		return error("index file %s is too small", idx_path);
+	}
+	idx_map = xmmap(NULL, idx_size, PROT_READ, MAP_PRIVATE, fd, 0);
+	close(fd);
+
+	fd = git_open_noatime(data_path);
+	if (fd < 0)
+		return -1;
+	if (fstat(fd, &st)) {
+		close(fd);
+		return -1;
+	}
+	data_size = xsize_t(st.st_size);
+	data = xmmap(NULL, data_size, PROT_READ, MAP_PRIVATE, fd, 0);
+	close(fd);
+
+	hdr = idx_map;
+	if (hdr->idx_signature != htonl(PACK_IDX_SIGNATURE))
+		return error("not a commit cache file %s", idx_path);
+	if (ntohl(hdr->idx_version) != 2)
+		return error("wrong version %s %d", idx_path, ntohl(hdr->idx_version));
+
+	nr = 0;
+	index = idx_map;
+	index += 2;  /* skip index header */
+	for (i = 0; i < 256; i++) {
+		uint32_t n = ntohl(index[i]);
+		if (n < nr) {
+			munmap(idx_map, idx_size);
+			return error("non-monotonic index %s", idx_path);
+		}
+		nr = n;
+	}
+
+	cache->idx = idx_map;
+	cache->idx_size = idx_size;
+	cache->nr = nr;
+	cache->data = data;
+	cache->data_size = data_size;
+	return 0;
+}
+
+static const void *object_offset(const struct sha1_cache *cache, uint32_t n)
+{
+	const unsigned char *index = cache->idx;
+	uint32_t off;
+	index += 4 * 256;
+	index += 8 + cache->nr * 20;
+	off = ntohl(*((uint32_t *)(index + 4 * n)));
+	return (const char *)cache->data + off;
+}
+
+const void *find_sha1_in_cache(const unsigned char *sha1)
+{
+	const uint32_t *level1_ofs;
+	const unsigned char *index;
+	unsigned hi, lo, stride;
+
+	struct packed_git *p = find_sha1_pack(sha1, packed_git);
+	if (!p)
+		return 0;
+
+	open_pack_index(p);
+	if (!p->commit_cache.nr)
+		return 0;
+
+	level1_ofs = p->commit_cache.idx;
+	index = p->commit_cache.idx;
+
+	/* skip header */
+	level1_ofs += 2;
+	index += 8;
+
+	/* fanout table */
+	index += 4 * 256;
+	hi = ntohl(level1_ofs[*sha1]);
+	lo = ((*sha1 == 0x0) ? 0 : ntohl(level1_ofs[*sha1 - 1]));
+	stride = 20;
+
+	do {
+		unsigned mi = (lo + hi) / 2;
+		int cmp = hashcmp(index + mi * stride, sha1);
+
+		if (!cmp)
+			return object_offset(&p->commit_cache, mi);
+		if (cmp > 0)
+			hi = mi;
+		else
+			lo = mi+1;
+	} while (lo < hi);
+	return NULL;
+}
+
+int has_commit_cache(const unsigned char *sha1,
+		     unsigned char *tree,
+		     unsigned char *parent,
+		     unsigned long *date)
+{
+	const unsigned char *data;
+	data = find_sha1_in_cache(sha1);
+	if (!data)
+		return 0;
+
+	hashcpy(tree, data);
+	hashcpy(parent, data + 20);
+	*date = ntohl(*((uint32_t*)(data + 40)));
+	return 1;
+}
diff --git a/sha1_cache.h b/sha1_cache.h
new file mode 100644
index 0000000..59823cf
--- /dev/null
+++ b/sha1_cache.h
@@ -0,0 +1,6 @@
+extern int open_sha1_cache(struct sha1_cache *cache,
+			   const char *data_path, const char *idx_path);
+extern const void *find_sha1_in_cache(const unsigned char *sha1);
+extern int has_commit_cache(const unsigned char *sha1,
+			    unsigned char *tree, unsigned char *parent,
+			    unsigned long *date);
diff --git a/sha1_file.c b/sha1_file.c
index 88f2151..fc2f460 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -19,6 +19,7 @@
 #include "pack-revindex.h"
 #include "sha1-lookup.h"
 #include "bulk-checkin.h"
+#include "sha1_cache.h"
 
 #ifndef O_NOATIME
 #if defined(__linux__) && (defined(__i386__) || defined(__PPC__))
@@ -578,14 +579,23 @@ static int check_packed_git_idx(const char *path,  struct packed_git *p)
 int open_pack_index(struct packed_git *p)
 {
 	char *idx_name;
+	int baselen = strlen(p->pack_name) - strlen(".pack");
 	int ret;
 
 	if (p->index_data)
 		return 0;
 
 	idx_name = xstrdup(p->pack_name);
-	strcpy(idx_name + strlen(idx_name) - strlen(".pack"), ".idx");
+	strcpy(idx_name + baselen, ".idx");
 	ret = check_packed_git_idx(idx_name, p);
+	if (!ret) {
+		char *cache_name;
+		cache_name = xstrdup(p->pack_name);
+		strcpy(idx_name + baselen, ".sidx");
+		strcpy(cache_name + baselen, ".sha1");
+		open_sha1_cache(&p->commit_cache, cache_name, idx_name);
+		free(cache_name);
+	}
 	free(idx_name);
 	return ret;
 }
diff --git a/test-sha1-cache.c b/test-sha1-cache.c
new file mode 100644
index 0000000..7b7f32c
--- /dev/null
+++ b/test-sha1-cache.c
@@ -0,0 +1,19 @@
+#include "cache.h"
+#include "sha1_cache.h"
+
+int main(int argc, char **argv)
+{
+	unsigned char sha1[20];
+	unsigned char tree[20];
+	unsigned char parent[20];
+	unsigned long date;
+
+	setup_git_directory();
+	prepare_packed_git();
+	get_sha1_hex(argv[1], sha1);
+	if (!has_commit_cache(sha1, tree, parent, &date))
+		return 1;
+	printf("tree %s\nparent %s\ndate %ld\n",
+	       sha1_to_hex(tree), sha1_to_hex(parent), date);
+	return 0;
+}
-- 
1.7.3.1.256.g2539c.dirty

--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]