[PATCH 8/9] revision.c: use bloom filters to speed up path based revision walks

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Garima Singh <garima.singh@xxxxxxxxxxxxx>

If bloom filters have been written to the commit-graph file, revision walk will
use them to speed up revision walks for a particular path.
Note: The current implementation does this in the case of single pathspec
case only.

We load the bloom filters during the prepare_revision_walk step when dealing
with a single pathspec. While comparing trees in rev_compare_trees(), if the
bloom filter says that the file is not different between the two trees, we
don't need to compute the expensive diff. This is where we get our performance
gains.

Performance Gains:
We tested the performance of `git log --path` on the git repo, the linux and
some internal large repos, with a variety of paths of varying depths.

On the git and linux repos:
we observed a 2x to 5x speed up.

On a large internal repo with files seated 6-10 levels deep in the tree:
we observed 10x to 20x speed ups, with some paths going up to 28 times faster.

RFC Notes:
I plan to collect the folloowing statistics around this usage of bloom filters
and trace them out using trace2.
- number of bloom filter queries,
- number of "No" responses (file hasn't changed)
- number of "Maybe" responses (file may have changed)
- number of "Commit not parsed" cases (commit had too many changes to have a
  bloom filter written out, currently our limit is 512 diffs)

Helped-by: Derrick Stolee <dstolee@xxxxxxxxxxxxx
Helped-by: SZEDER Gábor <szeder.dev@xxxxxxxxx>
Helped-by: Jonathan Tan <jonathantanmy@xxxxxxxxxx>
Signed-off-by: Garima Singh <garima.singh@xxxxxxxxxxxxx>
---
 bloom.c              | 20 ++++++++++++
 bloom.h              |  4 +++
 revision.c           | 67 +++++++++++++++++++++++++++++++++++++--
 revision.h           |  5 +++
 t/t4216-log-bloom.sh | 74 ++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 168 insertions(+), 2 deletions(-)
 create mode 100755 t/t4216-log-bloom.sh

diff --git a/bloom.c b/bloom.c
index 86b1005802..0c7505d3d6 100644
--- a/bloom.c
+++ b/bloom.c
@@ -235,3 +235,23 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
 
 	return filter;
 }
+
+int bloom_filter_contains(struct bloom_filter *filter,
+			  struct bloom_key *key,
+			  struct bloom_filter_settings *settings)
+{
+	int i;
+	uint64_t mod = filter->len * BITS_PER_BLOCK;
+
+	if (!mod)
+		return 1;
+
+	for (i = 0; i < settings->num_hashes; i++) {
+		uint64_t hash_mod = key->hashes[i] % mod;
+		uint64_t block_pos = hash_mod / BITS_PER_BLOCK;
+		if (!(filter->data[block_pos] & get_bitmask(hash_mod)))
+			return 0;
+	}
+
+	return 1;
+}
diff --git a/bloom.h b/bloom.h
index 101d689bbd..9bdacd0a8e 100644
--- a/bloom.h
+++ b/bloom.h
@@ -44,4 +44,8 @@ void fill_bloom_key(const char *data,
 		    struct bloom_key *key,
 		    struct bloom_filter_settings *settings);
 
+int bloom_filter_contains(struct bloom_filter *filter,
+			  struct bloom_key *key,
+			  struct bloom_filter_settings *settings);
+
 #endif
diff --git a/revision.c b/revision.c
index 39a25e7a5d..01f5330740 100644
--- a/revision.c
+++ b/revision.c
@@ -29,6 +29,7 @@
 #include "prio-queue.h"
 #include "hashmap.h"
 #include "utf8.h"
+#include "bloom.h"
 
 volatile show_early_output_fn_t show_early_output;
 
@@ -624,11 +625,34 @@ static void file_change(struct diff_options *options,
 	options->flags.has_changes = 1;
 }
 
+static int check_maybe_different_in_bloom_filter(struct rev_info *revs,
+						 struct commit *commit,
+						 struct bloom_key *key,
+						 struct bloom_filter_settings *settings)
+{
+	struct bloom_filter *filter;
+
+	if (!revs->repo->objects->commit_graph)
+		return -1;
+	if (commit->generation == GENERATION_NUMBER_INFINITY)
+		return -1;
+	if (!key || !settings)
+		return -1;
+
+	filter = get_bloom_filter(revs->repo, commit, 0);
+
+	if (!filter || !filter->len)
+		return 1;
+
+	return bloom_filter_contains(filter, key, settings);
+}
+
 static int rev_compare_tree(struct rev_info *revs,
-			    struct commit *parent, struct commit *commit)
+			    struct commit *parent, struct commit *commit, int nth_parent)
 {
 	struct tree *t1 = get_commit_tree(parent);
 	struct tree *t2 = get_commit_tree(commit);
+	int bloom_ret = 1;
 
 	if (!t1)
 		return REV_TREE_NEW;
@@ -653,6 +677,16 @@ static int rev_compare_tree(struct rev_info *revs,
 			return REV_TREE_SAME;
 	}
 
+	if (revs->pruning.pathspec.nr == 1 && !nth_parent) {
+		bloom_ret = check_maybe_different_in_bloom_filter(revs,
+								  commit,
+								  revs->bloom_key,
+								  revs->bloom_filter_settings);
+
+		if (bloom_ret == 0)
+			return REV_TREE_SAME;
+	}
+
 	tree_difference = REV_TREE_SAME;
 	revs->pruning.flags.has_changes = 0;
 	if (diff_tree_oid(&t1->object.oid, &t2->object.oid, "",
@@ -855,7 +889,7 @@ static void try_to_simplify_commit(struct rev_info *revs, struct commit *commit)
 			die("cannot simplify commit %s (because of %s)",
 			    oid_to_hex(&commit->object.oid),
 			    oid_to_hex(&p->object.oid));
-		switch (rev_compare_tree(revs, p, commit)) {
+		switch (rev_compare_tree(revs, p, commit, nth_parent)) {
 		case REV_TREE_SAME:
 			if (!revs->simplify_history || !relevant_commit(p)) {
 				/* Even if a merge with an uninteresting
@@ -3342,6 +3376,33 @@ static void expand_topo_walk(struct rev_info *revs, struct commit *commit)
 	}
 }
 
+static void prepare_to_use_bloom_filter(struct rev_info *revs)
+{
+	struct pathspec_item *pi;
+	const char *path;
+	size_t len;
+
+	if (!revs->commits)
+	    return;
+
+	parse_commit(revs->commits->item);
+
+	if (!revs->repo->objects->commit_graph)
+		return;
+
+	revs->bloom_filter_settings = revs->repo->objects->commit_graph->settings;
+	if (!revs->bloom_filter_settings)
+		return;
+
+	pi = &revs->pruning.pathspec.items[0];
+	path = pi->match;
+	len = strlen(path);
+
+	load_bloom_filters();
+	revs->bloom_key = xmalloc(sizeof(struct bloom_key));
+	fill_bloom_key(path, len, revs->bloom_key, revs->bloom_filter_settings);
+}
+
 int prepare_revision_walk(struct rev_info *revs)
 {
 	int i;
@@ -3391,6 +3452,8 @@ int prepare_revision_walk(struct rev_info *revs)
 		simplify_merges(revs);
 	if (revs->children.name)
 		set_children(revs);
+	if (revs->pruning.pathspec.nr == 1)
+	    prepare_to_use_bloom_filter(revs);
 	return 0;
 }
 
diff --git a/revision.h b/revision.h
index a1a804bd3d..65dc11e8f1 100644
--- a/revision.h
+++ b/revision.h
@@ -56,6 +56,8 @@ struct repository;
 struct rev_info;
 struct string_list;
 struct saved_parents;
+struct bloom_key;
+struct bloom_filter_settings;
 define_shared_commit_slab(revision_sources, char *);
 
 struct rev_cmdline_info {
@@ -291,6 +293,9 @@ struct rev_info {
 	struct revision_sources *sources;
 
 	struct topo_walk_info *topo_walk_info;
+
+	struct bloom_key *bloom_key;
+	struct bloom_filter_settings *bloom_filter_settings;
 };
 
 int ref_excluded(struct string_list *, const char *path);
diff --git a/t/t4216-log-bloom.sh b/t/t4216-log-bloom.sh
new file mode 100755
index 0000000000..d42f077998
--- /dev/null
+++ b/t/t4216-log-bloom.sh
@@ -0,0 +1,74 @@
+#!/bin/sh
+
+test_description='git log for a path with bloom filters'
+. ./test-lib.sh
+
+test_expect_success 'setup repo' '
+	git init &&
+	git config core.commitGraph true &&
+	git config gc.writeCommitGraph false &&
+	infodir=".git/objects/info" &&
+	graphdir="$infodir/commit-graphs" &&
+	test_oid_init
+'
+
+test_expect_success 'create 9 commits and repack' '
+	test_commit c1 file1 &&
+	test_commit c2 file2 &&
+	test_commit c3 file3 &&
+	test_commit c4 file1 &&
+	test_commit c5 file2 &&
+	test_commit c6 file3 &&
+	test_commit c7 file1 &&
+	test_commit c8 file2 &&
+	test_commit c9 file3
+'
+
+printf "c7\nc4\nc1" > expect_file1
+
+test_expect_success 'log without bloom filters' '
+	git log --pretty="format:%s"  -- file1 > actual &&
+	test_cmp expect_file1 actual
+'
+
+printf "c8\nc7\nc5\nc4\nc2\nc1" > expect_file1_file2
+
+test_expect_success 'multi-path log without bloom filters' '
+	git log --pretty="format:%s"  -- file1 file2 > actual &&
+	test_cmp expect_file1_file2 actual
+'
+
+graph_read_expect() {
+	OPTIONAL=""
+	NUM_CHUNKS=5
+	if test ! -z $2
+	then
+		OPTIONAL=" $2"
+		NUM_CHUNKS=$((3 + $(echo "$2" | wc -w)))
+	fi
+	cat >expect <<- EOF
+	header: 43475048 1 1 $NUM_CHUNKS 0
+	num_commits: $1
+	chunks: oid_fanout oid_lookup commit_metadata bloom_indexes bloom_data$OPTIONAL
+	EOF
+	test-tool read-graph >output &&
+	test_cmp expect output
+}
+
+test_expect_success 'write commit graph with bloom filters' '
+	git commit-graph write --reachable --changed-paths &&
+	test_path_is_file $infodir/commit-graph &&
+	graph_read_expect "9"
+'
+
+test_expect_success 'log using bloom filters' '
+	git log --pretty="format:%s" -- file1 > actual &&
+	test_cmp expect_file1 actual
+'
+
+test_expect_success 'multi-path log using bloom filters' '
+	git log --pretty="format:%s"  -- file1 file2 > actual &&
+	test_cmp expect_file1_file2 actual
+'
+
+test_done
-- 
gitgitgadget




[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]

  Powered by Linux