Re: [PATCH v4 05/15] diff: halt tree-diff early after max_changes

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Mon, Apr 06, 2020 at 04:59:45PM +0000, Derrick Stolee via GitGitGadget wrote:
> From: Derrick Stolee <dstolee@xxxxxxxxxxxxx>
> 
> When computing the changed-paths bloom filters for the commit-graph,
> we limit the size of the filter by restricting the number of paths
> in the diff. Instead of computing a large diff and then ignoring the
> result, it is better to halt the diff computation early.
> 
> Create a new "max_changes" option in struct diff_options. If non-zero,
> then halt the diff computation after discovering strictly more changed
> paths. This includes paths corresponding to trees that change.
> 
> Use this max_changes option in the bloom filter calculations. This
> reduces the time taken to compute the filters for the Linux kernel
> repo from 2m50s to 2m35s. On a large internal repository with ~500
> commits that perform tree-wide changes, the time reduced from
> 6m15s to 3m48s.
> 
> Signed-off-by: Derrick Stolee <dstolee@xxxxxxxxxxxxx>
> Signed-off-by: Garima Singh <garima.singh@xxxxxxxxxxxxx>
> ---
>  bloom.c     | 4 +++-
>  diff.h      | 5 +++++
>  tree-diff.c | 6 ++++++
>  3 files changed, 14 insertions(+), 1 deletion(-)
> 
> diff --git a/bloom.c b/bloom.c
> index 881a9841ede..a16eee92331 100644
> --- a/bloom.c
> +++ b/bloom.c
> @@ -133,6 +133,7 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
>  	struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS;
>  	int i;
>  	struct diff_options diffopt;
> +	int max_changes = 512;
>  
>  	if (bloom_filters.slab_size == 0)
>  		return NULL;
> @@ -141,6 +142,7 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
>  
>  	repo_diff_setup(r, &diffopt);
>  	diffopt.flags.recursive = 1;
> +	diffopt.max_changes = max_changes;
>  	diff_setup_done(&diffopt);
>  
>  	if (c->parents)
> @@ -149,7 +151,7 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
>  		diff_tree_oid(NULL, &c->object.oid, "", &diffopt);
>  	diffcore_std(&diffopt);
>  
> -	if (diff_queued_diff.nr <= 512) {
> +	if (diff_queued_diff.nr <= max_changes) {
>  		struct hashmap pathmap;
>  		struct pathmap_hash_entry *e;
>  		struct hashmap_iter iter;
> diff --git a/diff.h b/diff.h
> index 6febe7e3656..9443dc1b003 100644
> --- a/diff.h
> +++ b/diff.h
> @@ -285,6 +285,11 @@ struct diff_options {
>  	/* Number of hexdigits to abbreviate raw format output to. */
>  	int abbrev;
>  
> +	/* If non-zero, then stop computing after this many changes. */
> +	int max_changes;
> +	/* For internal use only. */
> +	int num_changes;

"For internal use only", understood.

> +
>  	int ita_invisible_in_index;
>  /* white-space error highlighting */
>  #define WSEH_NEW (1<<12)
> diff --git a/tree-diff.c b/tree-diff.c
> index 33ded7f8b3e..f3d303c6e54 100644
> --- a/tree-diff.c
> +++ b/tree-diff.c
> @@ -434,6 +434,9 @@ static struct combine_diff_path *ll_diff_tree_paths(
>  		if (diff_can_quit_early(opt))
>  			break;
>  
> +		if (opt->max_changes && opt->num_changes > opt->max_changes)
> +			break;
> +
>  		if (opt->pathspec.nr) {
>  			skip_uninteresting(&t, base, opt);
>  			for (i = 0; i < nparent; i++)
> @@ -518,6 +521,7 @@ static struct combine_diff_path *ll_diff_tree_paths(
>  
>  			/* t↓ */
>  			update_tree_entry(&t);
> +			opt->num_changes++;
>  		}
>  
>  		/* t > p[imin] */
> @@ -535,6 +539,7 @@ static struct combine_diff_path *ll_diff_tree_paths(
>  		skip_emit_tp:
>  			/* ∀ pi=p[imin]  pi↓ */
>  			update_tp_entries(tp, nparent);
> +			opt->num_changes++;
>  		}
>  	}

This counter is basically broken, its value is wrong for over 98% of
commits, and, worse, its value remains 0 for over 85% of commits in
the repositories I usually use to test modified path Bloom filters.
Consequently, a relatively large number of commits modifying more than
512 paths get Bloom filters.

The makeshift tests in the patch below demonstrate these issues as
most of them fail, most notably those two tests that demonstrate that
modifying existing paths are not counted at all.


  ---  >8  ---

diff --git a/bloom.c b/bloom.c
index 9b86aa3f59..3db0fde734 100644
--- a/bloom.c
+++ b/bloom.c
@@ -203,7 +203,7 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
 	repo_diff_setup(r, &diffopt);
 	diffopt.flags.recursive = 1;
 	diffopt.detect_rename = 0;
-	diffopt.max_changes = max_changes;
+	diffopt.max_changes = 0;
 	diff_setup_done(&diffopt);
 
 	/* ensure commit is parsed so we have parent information */
@@ -214,6 +214,7 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
 	else
 		diff_tree_oid(NULL, &c->object.oid, "", &diffopt);
 	diffcore_std(&diffopt);
+	printf("%s  %d\n", oid_to_hex(&c->object.oid), diffopt.num_changes);
 
 	if (diffopt.num_changes <= max_changes) {
 		struct hashmap pathmap;
diff --git a/t/t9999-test.sh b/t/t9999-test.sh
new file mode 100755
index 0000000000..8d2bd9f03f
--- /dev/null
+++ b/t/t9999-test.sh
@@ -0,0 +1,142 @@
+#!/bin/sh
+
+test_description='test'
+
+. ./test-lib.sh
+
+test_expect_success 'setup' '
+	test_tick &&
+
+	echo 1 >file &&
+	mkdir -p dir/subdir &&
+	echo 1 >dir/subdir/file1 &&
+	echo 1 >dir/subdir/file2 &&
+	git add file dir &&
+	git commit -m setup &&
+
+	echo 2 >file &&
+	git commit -a -m "modify one path in root" &&
+	mod_one_path=$(git rev-parse HEAD) &&
+
+	echo 2 >dir/subdir/file1 &&
+	echo 2 >dir/subdir/file2 &&
+	git commit -a -m "modify two file two dirs deep" &&
+	mod_four_paths=$(git rev-parse HEAD) &&
+
+	>new-file &&
+	git add new-file &&
+	git commit -m "add new file in root" &&
+	new_file_in_root=$(git rev-parse HEAD) &&
+
+	git rm new-file &&
+	git commit -m "delete file in root" &&
+	delete_file_in_root=$(git rev-parse HEAD) &&
+
+	>dir/new-file &&
+	git add dir/new-file &&
+	git commit -m "add new file in dir" &&
+	new_file_in_dir=$(git rev-parse HEAD) &&
+
+	git rm dir/new-file &&
+	git commit -m "delete file in dir" &&
+	delete_file_in_dir=$(git rev-parse HEAD) &&
+
+	echo 1 >d-f &&
+	git add d-f &&
+	git commit -m foo &&
+	git rm d-f &&
+	mkdir d-f &&
+	echo 2 >d-f/file &&
+	git add d-f &&
+	git commit -m "replace file with dir" &&
+	file_to_dir=$(git rev-parse HEAD) &&
+
+	>d-f.c &&
+	git add d-f.c &&
+	git commit -m "add a file that sorts between d-f and d-f/" &&
+	git rm -r d-f &&
+	echo 3 >d-f &&
+	git add d-f &&
+	git commit -m "replace dir with file" &&
+	dir_to_file=$(git rev-parse HEAD) &&
+
+	bin_sha1=$(git rev-parse HEAD:dir/subdir | hex2oct) &&
+	# leading zero in mode: the content of the tree remains the same,
+	# but its oid does change!
+	printf "040000 subdir\0$bin_sha1" >rawtree &&
+	tree1=$(git hash-object -t tree -w rawtree) &&
+	git cat-file -p HEAD^{tree} >out &&
+	tree2=$(sed -e "s/$(git rev-parse HEAD:dir/)/$tree1/" out |git mktree) &&
+	different_but_same_tree=$(git commit-tree \
+		-m "leading zeros in mode" \
+		-p $(git rev-parse HEAD) $tree2) &&
+	git update-ref HEAD $different_but_same_tree &&
+
+	git commit-graph write --reachable --changed-paths >out &&
+	cat out  # debug
+'
+
+test_expect_success 'modify one path in root' '
+	git diff --name-status $mod_one_path^ $mod_one_path &&
+	echo "$mod_one_path  1" >expect &&
+	grep "$mod_one_path" out >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'modify two file two dirs deep' '
+	git diff --name-status $mod_four_paths^ $mod_four_paths &&
+	echo "$mod_four_paths  4" >expect &&
+	grep "$mod_four_paths" out >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'add new file in root' '
+	git diff --name-status $new_file_in_root^ $new_file_in_root &&
+	echo "$new_file_in_root  1" >expect &&
+	grep "$new_file_in_root" out >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'delete file in root' '
+	git diff --name-status $delete_file_in_root^ $delete_file_in_root &&
+	echo "$delete_file_in_root  1" >expect &&
+	grep "$delete_file_in_root" out >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'add new file in dir' '
+	git diff --name-status $new_file_in_dir^ $new_file_in_dir &&
+	echo "$new_file_in_dir  2" >expect &&
+	grep "$new_file_in_dir" out >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'delete file in dir' '
+	git diff --name-status $delete_file_in_dir^ $delete_file_in_dir &&
+	echo "$delete_file_in_dir  2" >expect &&
+	grep "$delete_file_in_dir" out >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'replace file with dir' '
+	git diff --name-status $file_to_dir^ $file_to_dir &&
+	echo "$file_to_dir  2" >expect &&
+	grep "$file_to_dir" out >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'replace dir with file' '
+	git diff --name-status $dir_to_file^ $dir_to_file &&
+	echo "$dir_to_file  2" >expect &&
+	grep "$dir_to_file" out >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'leading zeros in mode' '
+	git diff --name-status $different_but_same_tree^ $different_but_same_tree &&
+	echo "$different_but_same_tree  0" >expect &&
+	grep "$different_but_same_tree" out >actual &&
+	test_cmp expect actual
+'
+
+test_done

  ---  >8  ---


> @@ -552,6 +557,7 @@ struct combine_diff_path *diff_tree_paths(
>  	const struct object_id **parents_oid, int nparent,
>  	struct strbuf *base, struct diff_options *opt)
>  {
> +	opt->num_changes = 0;
>  	p = ll_diff_tree_paths(p, oid, parents_oid, nparent, base, opt);
>  
>  	/*
> -- 
> gitgitgadget
> 



[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]

  Powered by Linux