Re: [PATCH] cherry: cache patch-ids to avoid repeating work

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Add cached-sha1-map.[ch] implementing a persistent hash map from sha1 to
sha1.  The map is read with mmap, and completely rewritten if any entries
change.  It would be good to add incremental update to handle the usual case
where only a few entries change.

This structure is used by patch-ids.c to cache the mapping from commit to
patch-id into $GIT_DIR/patch-id-cache.  In the one case I've tested so far,
this speeds up the second invocation of git-cherry by two orders of
magnitude.  The caching can be disabled by setting cherry.cachepatchids to
false.

Original code cannibalized from Johannes Schindelin's notes-index structure.

Signed-off-by: Geoffrey Irving <irving@xxxxxxx>
---

Here's an updated version that avoids infinite loops and adds a sha1
checksum of the header.  It is still vastly more likely that this code
will return incorrect results due to disk corruption than that the old
version would infinite loop.  If we want to be even more paranoid, we
could add a checksum for every 511 entries, but I'm hoping that isn't
required. :)

Your version of the infinite loop avoidance didn't quite work, since
I'm already using every 32 bit return value in find_helper.

I also fixed the 4/3 check to not overflow.

 Documentation/config.txt |    5 +
 Makefile                 |    2 +
 builtin-log.c            |   12 ++
 cached-sha1-map.c        |  293 ++++++++++++++++++++++++++++++++++++++++++++++
 cached-sha1-map.h        |   45 +++++++
 patch-ids.c              |   26 ++++-
 patch-ids.h              |    2 +
 7 files changed, 384 insertions(+), 1 deletions(-)
 create mode 100644 cached-sha1-map.c
 create mode 100644 cached-sha1-map.h

diff --git a/Documentation/config.txt b/Documentation/config.txt
index 838794d..02b8113 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -468,6 +468,11 @@ browser.<tool>.path::
 	browse HTML help (see '-w' option in linkgit:git-help[1]) or a
 	working repository in gitweb (see linkgit:git-instaweb[1]).

+cherry.cachepatchids::
+	If true, linkgit:git-cherry[1] will store a cache of computed patch-ids
+	in $GIT_DIR/patch-id-cache in order to make repeated invocations faster.
+	Defaults to true.
+
 clean.requireForce::
 	A boolean to make git-clean do nothing unless given -f
 	or -n.   Defaults to true.
diff --git a/Makefile b/Makefile
index 4796565..f7360e1 100644
--- a/Makefile
+++ b/Makefile
@@ -356,6 +356,7 @@ LIB_H += pack-refs.h
 LIB_H += pack-revindex.h
 LIB_H += parse-options.h
 LIB_H += patch-ids.h
+LIB_H += cached-sha1-map.h
 LIB_H += path-list.h
 LIB_H += pkt-line.h
 LIB_H += progress.h
@@ -436,6 +437,7 @@ LIB_OBJS += pager.o
 LIB_OBJS += parse-options.o
 LIB_OBJS += patch-delta.o
 LIB_OBJS += patch-ids.o
+LIB_OBJS += cached-sha1-map.o
 LIB_OBJS += path-list.o
 LIB_OBJS += path.o
 LIB_OBJS += pkt-line.o
diff --git a/builtin-log.c b/builtin-log.c
index 430d876..fbfefbd 100644
--- a/builtin-log.c
+++ b/builtin-log.c
@@ -1081,6 +1081,16 @@ static int add_pending_commit(const char *arg,
struct rev_info *revs, int flags)
 	return -1;
 }

+static int git_cherry_config(const char *var, const char *value, void *cb)
+{
+	/* only cherry.cachepatchids is handled; other keys are ignored */
+	if (strcmp(var, "cherry.cachepatchids") != 0)
+		return 0;
+
+	cache_patch_ids = git_config_bool(var, value);
+	return 0;
+}
+
 static const char cherry_usage[] =
 "git-cherry [-v] <upstream> [<head>] [<limit>]";
 int cmd_cherry(int argc, const char **argv, const char *prefix)
@@ -1094,6 +1104,8 @@ int cmd_cherry(int argc, const char **argv,
const char *prefix)
 	const char *limit = NULL;
 	int verbose = 0;

+	git_config(git_cherry_config, NULL);
+
 	if (argc > 1 && !strcmp(argv[1], "-v")) {
 		verbose = 1;
 		argc--;
diff --git a/cached-sha1-map.c b/cached-sha1-map.c
new file mode 100644
index 0000000..9cf7252
--- /dev/null
+++ b/cached-sha1-map.c
@@ -0,0 +1,293 @@
+#include "cached-sha1-map.h"
+
+union cached_sha1_map_header {
+	struct {
+		char signature[4]; /* CS1M, compared as 4 raw bytes */
+		uint32_t version; /* file format version, network byte order */
+		uint32_t count; /* number of stored entries, network byte order */
+		uint32_t size; /* number of table slots, network byte order */
+		uint32_t pad; /* pad to 20 bytes; written as zero */
+	} u;
+	/* pad header out to 40 bytes.  As a consistency check, pad.value
+	 * stores the sha1 of pad.key, i.e. of the 20 bytes overlaying u. */
+	struct cached_sha1_entry pad;
+};
+
+static const char *signature = "CS1M"; /* expected first four bytes of the file */
+static const uint32_t version = 1; /* the only format version read or written here */
+
+/*
+ * Replace the table of `cache` with a freshly allocated, zero-filled one
+ * holding `size` slots.  Whatever `cache->entries` pointed to before is
+ * NOT released here.
+ *
+ * On success the map is marked dirty and 0 is returned.  If the allocation
+ * fails a warning is printed, the map is left initialized with size 0 and
+ * a clean dirty flag, and -1 is returned.
+ */
+static int init_empty_map(struct cached_sha1_map *cache, uint32_t size)
+{
+	struct cached_sha1_entry *table;
+
+	table = calloc(size, sizeof(struct cached_sha1_entry));
+
+	cache->entries = table;
+	cache->count = 0;
+	cache->initialized = 1;
+	cache->mmapped = 0;
+
+	if (table) {
+		cache->size = size;
+		cache->dirty = 1;
+		return 0;
+	}
+
+	warning("failed to allocate empty map of size %"PRIu32" for %s",
+		size, git_path(cache->filename));
+	cache->size = 0;
+	cache->dirty = 0;
+	return -1;
+}
+
+/*
+ * Double the capacity of the map and reinsert every entry into the new
+ * table.
+ *
+ * If the size cannot be doubled without overflowing 32 bits, or the larger
+ * table cannot be allocated, the old table is released and the map is
+ * reset to an empty 64-slot table.
+ *
+ * Returns 0 on success; after a reset, returns whatever init_empty_map()
+ * returned.
+ */
+static int grow_map(struct cached_sha1_map *cache)
+{
+	struct cached_sha1_map new_cache;
+	uint32_t i;
+
+	if (cache->size * 2 == 0) {
+		warning("%s overflowed, so resetting to empty",
+			git_path(cache->filename));
+		goto reset;
+	}
+
+	/* allocate cache with twice the size */
+	new_cache.filename = cache->filename;
+	if (init_empty_map(&new_cache, cache->size * 2)) {
+		warning("failed to grow %s to size %"PRIu32,
+			git_path(cache->filename), cache->size * 2);
+		goto reset;
+	}
+
+	/* reinsert all entries */
+	for (i = 0; i < cache->size; i++)
+		if (!is_null_sha1(cache->entries[i].key))
+			set_cached_sha1_entry(&new_cache,
+				cache->entries[i].key, cache->entries[i].value);
+
+	/* finish */
+	free_cached_sha1_map(cache);
+	*cache = new_cache;
+	return 0;
+
+reset:
+	/* release the old table first; init_empty_map() would just drop it */
+	free_cached_sha1_map(cache);
+	return init_empty_map(cache, 64);
+}
+
+/*
+ * Load the map from the file named by cache->filename (relative to
+ * GIT_DIR) the first time the map is used; later calls only report
+ * whether the map is usable.
+ *
+ * The header is checked for signature, version, a power-of-two size,
+ * count < size and a matching sha1, and the file must be at least as long
+ * as the header claims.  The file is then mapped with MAP_PRIVATE, so
+ * changes made in memory are not written through to it.
+ *
+ * Any errors that occur result in the cache being initialized to empty.
+ *
+ * Returns 0 if the map is usable, -1 if not even an empty map could be
+ * allocated.
+ */
+static int init_cached_sha1_map(struct cached_sha1_map *cache)
+{
+	int fd;
+	union cached_sha1_map_header header;
+	const char *filename;
+	size_t map_size;
+	uintmax_t file_size;
+	struct stat st;
+	SHA_CTX ctx;
+
+	if (cache->initialized)
+		return cache->size ? 0 : -1;
+
+	filename = git_path(cache->filename);
+	fd = open(filename, O_RDONLY);
+	if (fd < 0) {
+		if (errno != ENOENT)
+			warning("failed to read '%s': %s", filename,
+				strerror(errno));
+		goto empty;
+	}
+
+	if (read_in_full(fd, &header, sizeof(header)) != sizeof(header)) {
+		warning("cannot read %s header", filename);
+		goto empty;
+	}
+
+	if (memcmp(header.u.signature, signature, 4)) {
+		warning("%s has invalid header", filename);
+		goto empty;
+	}
+
+	if (ntohl(header.u.version) != version) {
+		warning("%s has unrecognized version %"PRIu32, filename,
+			ntohl(header.u.version));
+		goto empty;
+	}
+
+	cache->count = ntohl(header.u.count);
+	cache->size = ntohl(header.u.size);
+
+	if (cache->size & (cache->size-1)) {
+		warning("%s is corrupt: size %"PRIu32" is not a power of two",
+			filename, cache->size);
+		goto empty;
+	}
+
+	/* this also rejects size == 0, since count is unsigned */
+	if (cache->count >= cache->size) {
+		warning("%s is corrupt: count %"PRIu32" >= size %"PRIu32,
+			filename, cache->count, cache->size);
+		goto empty;
+	}
+
+	SHA1_Init(&ctx);
+	SHA1_Update(&ctx, header.pad.key, 20);
+	SHA1_Final(header.pad.key, &ctx); /* reuse pad.key to store its sha1 */
+	if (hashcmp(header.pad.key, header.pad.value)) {
+		warning("%s header has invalid sha1", filename);
+		goto empty;
+	}
+
+	/*
+	 * Header plus `size` slots.  Computed in uintmax_t so that a huge
+	 * size cannot wrap around where size_t is only 32 bits wide.
+	 */
+	file_size = (uintmax_t)sizeof(struct cached_sha1_entry) *
+		((uintmax_t)cache->size + 1);
+	map_size = (size_t)file_size;
+
+	/* touching mapped pages beyond the end of a short file would fault */
+	if (map_size != file_size || fstat(fd, &st) < 0 || st.st_size < 0 ||
+	    (uintmax_t)st.st_size < file_size) {
+		warning("%s is corrupt: too short for size %"PRIu32,
+			filename, cache->size);
+		goto empty;
+	}
+
+	/* mmap entire file so that file / memory blocks are aligned */
+	cache->entries = mmap(NULL, map_size,
+		PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+	if (cache->entries == MAP_FAILED) {
+		/* this is just a cache, so don't free pack memory and retry */
+		warning("%s mmap failed: %s", filename, strerror(errno));
+		goto empty;
+	}
+	close(fd); /* the mapping stays valid without the descriptor */
+
+	cache->entries += 1; /* skip header */
+	cache->dirty = 0;
+	cache->initialized = 1;
+	cache->mmapped = 1;
+	return 0;
+
+empty:
+	if (fd >= 0)
+		close(fd);
+	return init_empty_map(cache, 64);
+}
+
+/*
+ * Write the map to the file named by cache->filename (relative to
+ * GIT_DIR) if it has been initialized and has unsaved changes.
+ *
+ * The whole file is rewritten through a lock file: the 40-byte header
+ * (with its sha1 checksum) followed by every slot of the table.
+ *
+ * Returns 0 on success or when there is nothing to write, and -1 after
+ * printing a warning if the file could not be locked, written or
+ * committed.
+ */
+int write_cached_sha1_map(struct cached_sha1_map *cache)
+{
+	/*
+	 * Not an automatic variable: the lockfile API keeps referring to
+	 * this structure after we return (see api-lockfile).
+	 */
+	static struct lock_file update_lock;
+	union cached_sha1_map_header header;
+	int fd;
+	size_t map_size;
+	const char *filename;
+	SHA_CTX ctx;
+
+	if (!cache->initialized || !cache->dirty)
+		return 0;
+
+	filename = git_path(cache->filename);
+	fd = hold_lock_file_for_update(&update_lock, filename, 0);
+	if (fd < 0) {
+		warning("could not construct %s", filename);
+		return -1;
+	}
+
+	/* initialize header */
+	memcpy(header.u.signature, signature, 4);
+	header.u.version = htonl(version);
+	header.u.count = htonl(cache->count);
+	header.u.size = htonl(cache->size);
+	header.u.pad = 0; /* make header deterministic */
+
+	/* compute header sha1 */
+	SHA1_Init(&ctx);
+	SHA1_Update(&ctx, header.pad.key, 20);
+	SHA1_Final(header.pad.value, &ctx);
+
+	map_size = sizeof(struct cached_sha1_entry) * cache->size;
+	if (write_in_full(fd, &header, sizeof(header)) != sizeof(header)
+		|| write_in_full(fd, cache->entries, map_size) != map_size) {
+		warning("could not write %s", filename);
+		/* do not leave a half-written lock file behind */
+		rollback_lock_file(&update_lock);
+		return -1;
+	}
+
+	if (commit_lock_file(&update_lock) < 0) {
+		warning("could not write %s", filename);
+		return -1;
+	}
+
+	cache->dirty = 0;
+	return 0;
+}
+
+/*
+ * Release the table held by the map: munmap() for a table that was mapped
+ * from the file, free() for one that was allocated.  Nothing is written
+ * to disk.
+ *
+ * Afterwards the structure is back in its uninitialized state (only
+ * `filename` is kept), so a stale `entries` pointer cannot be used or
+ * released a second time.
+ */
+void free_cached_sha1_map(struct cached_sha1_map *cache)
+{
+	if (!cache->initialized)
+		return;
+
+	if (cache->mmapped)
+		munmap(cache->entries - 1,
+			sizeof(struct cached_sha1_entry) * (cache->size + 1));
+	else
+		free(cache->entries);
+
+	cache->entries = NULL;
+	cache->count = 0;
+	cache->size = 0;
+	cache->initialized = 0;
+	cache->mmapped = 0;
+	cache->dirty = 0;
+}
+
+/*
+ * Hash a sha1 to a 32-bit value: its first four bytes read as a
+ * big-endian number.  find_helper() masks the result with size-1; since
+ * size is a power of two stored in 32 bits, size-1 <= INT32_MAX and the
+ * masked index is never negative.
+ *
+ * The bytes are combined one at a time rather than loaded through a
+ * uint32_t pointer, because the key is supplied by the caller and need
+ * not be 4-byte aligned.
+ */
+static int32_t get_hash_index(const unsigned char *sha1)
+{
+	uint32_t prefix;
+
+	prefix = ((uint32_t)sha1[0] << 24) |
+		 ((uint32_t)sha1[1] << 16) |
+		 ((uint32_t)sha1[2] << 8) |
+		 (uint32_t)sha1[3];
+	return prefix;
+}
+
+/*
+ * Returns the index if the entry exists, and the complemented index of
+ * the next free entry otherwise.  If the hash is full (this happens only
+ * if the file is corrupt), sets count = size and returns the complement
+ * of the last slot probed, which is a nonfree entry.
+ *
+ * Probing is linear, starting at the slot selected by the key's hash.
+ * `full` is the slot just before the starting one, i.e. the last slot
+ * visited before the probe would wrap around; reaching it without a
+ * match or a free slot ends the search.
+ */
+static int32_t find_helper(struct cached_sha1_map *cache,
+	const unsigned char *key)
+{
+	int32_t i, mask, full;
+
+	mask = cache->size - 1;
+	i = get_hash_index(key) & mask;
+	full = (i-1) & mask;
+
+	for (; ; i = (i+1) & mask) {
+		if (!hashcmp(key, cache->entries[i].key))
+			return i;
+		if (is_null_sha1(cache->entries[i].key))
+			return ~i;
+		if (i == full) {
+			/* every slot is occupied and none holds the key */
+			cache->count = cache->size; /* fix count */
+			return ~i;
+		}
+	}
+}
+
+/*
+ * Look up `key` and, if it is present, copy the sha1 stored for it into
+ * `value`.  Returns 0 on a hit, and -1 on a miss or when the map could
+ * not be initialized.
+ */
+int get_cached_sha1_entry(struct cached_sha1_map *cache,
+	const unsigned char *key, unsigned char *value)
+{
+	int32_t slot;
+
+	if (init_cached_sha1_map(cache) != 0)
+		return -1;
+
+	slot = find_helper(cache, key);
+	if (slot >= 0) {
+		/* entry found, return value */
+		hashcpy(value, cache->entries[slot].value);
+		return 0;
+	}
+	return -1;
+}
+
+/*
+ * Store `value` under `key`, adding a new entry or overwriting the
+ * existing one.  The map is marked dirty only if something actually
+ * changed, and it is grown once count reaches size/4*3.
+ *
+ * Returns -1 if the map could not be initialized, the result of
+ * grow_map() if the table had to grow, and 0 otherwise.
+ */
+int set_cached_sha1_entry(struct cached_sha1_map *cache,
+	const unsigned char *key, const unsigned char *value)
+{
+	int32_t slot;
+	struct cached_sha1_entry *entry;
+
+	if (init_cached_sha1_map(cache) != 0)
+		return -1;
+
+	slot = find_helper(cache, key);
+	entry = &cache->entries[slot < 0 ? ~slot : slot];
+
+	if (slot < 0) {
+		/* key not present: fill the slot find_helper() picked */
+		hashcpy(entry->key, key);
+		hashcpy(entry->value, value);
+		cache->count++;
+		cache->dirty = 1;
+	} else if (hashcmp(value, entry->value) != 0) {
+		/* key present with a different value: overwrite it */
+		hashcpy(entry->value, value);
+		cache->dirty = 1;
+	}
+
+	if (cache->count < cache->size/4*3)
+		return 0;
+	return grow_map(cache);
+}
diff --git a/cached-sha1-map.h b/cached-sha1-map.h
new file mode 100644
index 0000000..296c17c
--- /dev/null
+++ b/cached-sha1-map.h
@@ -0,0 +1,45 @@
+#ifndef CACHED_SHA1_MAP_H
+#define CACHED_SHA1_MAP_H
+
+#include "cache.h"
+
+/*
+ * A cached-sha1-map is a file storing a hash map from sha1 to sha1.
+ *
+ * The file is mmap'ed, updated in memory during operation, and flushed
+ * back to disk by write_cached_sha1_map().  Currently the entire file is
+ * rewritten for any change.  This could be a significant bottleneck for
+ * common uses, so it would be good to fix this later if possible.
+ *
+ * The performance of a hash map depends highly on a good hashing
+ * algorithm, to avoid collisions.  Lucky us!  SHA-1 is a pretty good
+ * hashing algorithm.
+ */
+
+struct cached_sha1_entry {
+	unsigned char key[20]; /* sha1 looked up; a null sha1 marks a free slot */
+	unsigned char value[20]; /* sha1 stored for the key */
+};
+
+struct cached_sha1_map {
+	const char *filename; /* relative to GIT_DIR */
+
+	/* rest is for internal use */
+	uint32_t count, size; /* stored entries, table slots (a power of two) */
+	unsigned int initialized : 1; /* loading or creating the table was attempted */
+	unsigned int dirty : 1; /* table has changes not yet written to the file */
+	unsigned int mmapped : 1; /* entries came from mmap rather than calloc */
+	struct cached_sha1_entry *entries; /* table; if mmapped, mmap'ed memory + 1 */
+};
+
+extern int get_cached_sha1_entry(struct cached_sha1_map *cache,
+	const unsigned char *key,unsigned char *value);
+
+extern int set_cached_sha1_entry(struct cached_sha1_map *cache,
+	const unsigned char *key, const unsigned char *value);
+
+extern int write_cached_sha1_map(struct cached_sha1_map *cache);
+
+extern void free_cached_sha1_map(struct cached_sha1_map *cache);
+
+#endif
diff --git a/patch-ids.c b/patch-ids.c
index 3be5d31..663ffee 100644
--- a/patch-ids.c
+++ b/patch-ids.c
@@ -2,17 +2,36 @@
 #include "diff.h"
 #include "commit.h"
 #include "patch-ids.h"
+#include "cached-sha1-map.h"
+
+int cache_patch_ids = 1;
+static struct cached_sha1_map patch_id_cache;

 static int commit_patch_id(struct commit *commit, struct diff_options *options,
 		    unsigned char *sha1)
 {
+	int ret;
+
+	/* pull patch-id out of the cache if possible */
+	patch_id_cache.filename = "patch-id-cache";
+	if (cache_patch_ids && !get_cached_sha1_entry(&patch_id_cache,
+			commit->object.sha1, sha1))
+		return 0;
+
 	if (commit->parents)
 		diff_tree_sha1(commit->parents->item->object.sha1,
 		               commit->object.sha1, "", options);
 	else
 		diff_root_tree_sha1(commit->object.sha1, "", options);
 	diffcore_std(options);
-	return diff_flush_patch_id(options, sha1);
+	ret = diff_flush_patch_id(options, sha1);
+	if (ret)
+		return ret;
+
+	/* record commit, patch-id pair in cache */
+	if (cache_patch_ids)
+		set_cached_sha1_entry(&patch_id_cache, commit->object.sha1, sha1);
+	return 0;
 }

 static uint32_t take2(const unsigned char *id)
@@ -136,6 +155,11 @@ int free_patch_ids(struct patch_ids *ids)
 		next = patches->next;
 		free(patches);
 	}
+
+	/* write cached patch-ids and ignore any errors that arise
+	 * (e.g. if the repository is write protected) */
+	if (cache_patch_ids)
+		write_cached_sha1_map(&patch_id_cache);
 	return 0;
 }

diff --git a/patch-ids.h b/patch-ids.h
index c8c7ca1..c0ebdc1 100644
--- a/patch-ids.h
+++ b/patch-ids.h
@@ -18,4 +18,6 @@ int free_patch_ids(struct patch_ids *);
 struct patch_id *add_commit_patch_id(struct commit *, struct patch_ids *);
 struct patch_id *has_commit_patch_id(struct commit *, struct patch_ids *);

+extern int cache_patch_ids;
+
 #endif /* PATCH_IDS_H */
-- 
1.5.6.2.256.g33ad.dirty
--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]

  Powered by Linux