Re: [PATCH v4 05/15] diff: halt tree-diff early after max_changes

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 8/4/2020 10:47 AM, SZEDER Gábor wrote:
> On Mon, Apr 06, 2020 at 04:59:45PM +0000, Derrick Stolee via GitGitGadget wrote:
> This counter is basically broken, its value is wrong for over 98% of
> commits, and, worse, its value remains 0 for over 85% of commits in
> the repositories I usually use to test modified path Bloom filters.
> Consequently, a relatively large number of commits modifying more than
> 512 paths get Bloom filters.

Thanks for finding this! The counter is only really tested in one
place, and that test only considers _file adds_, which is a problem.

If I understand this correctly, the bug is a performance-only bug
(since this is a performance-only feature), but it is an important
one to fix.

There is certainly some dark magic happening in this tree-diff logic,
so instead of trying to get an accurate count we should just use the
magic global diff_queued_diff to track the current list of file changes.

Note: diff_queued_diff does not track the directory changes, so it
is an under-count for the total changes to track in the Bloom filter.
This is later corrected by the block that adds these leading directory
changes.

> The makeshift tests in the patch below demonstrate these issues as
> most of them fail, most notably those two tests that demonstrate that
> modifying existing paths are not counted at all.

I adapted your diff along with ripping out 'num_changes' in favor
of diff_queued_diff.nr. This required modifying some of your expected
values in the test script (losing the leading directories in the
count).

I'll work with Taylor to create a fix, and include proper testing
of the logic here. We'll stick it in the v2 of his max-changed-paths
series [1]. He already has some helpful logging that can help create
tests that ensure this logic is performing as expected.

We plan to have that fix available by later today or early tomorrow.
Will you be available to help validate it?

[1] https://lore.kernel.org/git/cover.1596480582.git.me@xxxxxxxxxxxx/

Thanks,
-Stolee

  --- >8 ---

diff --git a/bloom.c b/bloom.c
index 1a573226e7..b8d6cb9240 100644
--- a/bloom.c
+++ b/bloom.c
@@ -218,8 +218,9 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
 	else
 		diff_tree_oid(NULL, &c->object.oid, "", &diffopt);
 	diffcore_std(&diffopt);
+	printf("%s  %d\n", oid_to_hex(&c->object.oid), diff_queued_diff.nr);
 
-	if (diffopt.num_changes <= max_changes) {
+	if (diff_queued_diff.nr <= max_changes) {
 		struct hashmap pathmap;
 		struct pathmap_hash_entry *e;
 		struct hashmap_iter iter;
diff --git a/diff.h b/diff.h
index e0c0af6286b..1d32b718857 100644
--- a/diff.h
+++ b/diff.h
@@ -287,8 +287,6 @@ struct diff_options {
 
 	/* If non-zero, then stop computing after this many changes. */
 	int max_changes;
-	/* For internal use only. */
-	int num_changes;
 
 	int ita_invisible_in_index;
 /* white-space error highlighting */
diff --git a/t/t9999-test.sh b/t/t9999-test.sh
new file mode 100755
index 00000000000..1f35aa8e2c5
--- /dev/null
+++ b/t/t9999-test.sh
@@ -0,0 +1,142 @@
+#!/bin/sh
+
+test_description='test'
+
+. ./test-lib.sh
+
+test_expect_success 'setup' '
+	test_tick &&
+
+	echo 1 >file &&
+	mkdir -p dir/subdir &&
+	echo 1 >dir/subdir/file1 &&
+	echo 1 >dir/subdir/file2 &&
+	git add file dir &&
+	git commit -m setup &&
+
+	echo 2 >file &&
+	git commit -a -m "modify one path in root" &&
+	mod_one_path=$(git rev-parse HEAD) &&
+
+	echo 2 >dir/subdir/file1 &&
+	echo 2 >dir/subdir/file2 &&
+	git commit -a -m "modify two file two dirs deep" &&
+	mod_four_paths=$(git rev-parse HEAD) &&
+
+	>new-file &&
+	git add new-file &&
+	git commit -m "add new file in root" &&
+	new_file_in_root=$(git rev-parse HEAD) &&
+
+	git rm new-file &&
+	git commit -m "delete file in root" &&
+	delete_file_in_root=$(git rev-parse HEAD) &&
+
+	>dir/new-file &&
+	git add dir/new-file &&
+	git commit -m "add new file in dir" &&
+	new_file_in_dir=$(git rev-parse HEAD) &&
+
+	git rm dir/new-file &&
+	git commit -m "delete file in dir" &&
+	delete_file_in_dir=$(git rev-parse HEAD) &&
+
+	echo 1 >d-f &&
+	git add d-f &&
+	git commit -m foo &&
+	git rm d-f &&
+	mkdir d-f &&
+	echo 2 >d-f/file &&
+	git add d-f &&
+	git commit -m "replace file with dir" &&
+	file_to_dir=$(git rev-parse HEAD) &&
+
+	>d-f.c &&
+	git add d-f.c &&
+	git commit -m "add a file that sorts between d-f and d-f/" &&
+	git rm -r d-f &&
+	echo 3 >d-f &&
+	git add d-f &&
+	git commit -m "replace dir with file" &&
+	dir_to_file=$(git rev-parse HEAD) &&
+
+	bin_sha1=$(git rev-parse HEAD:dir/subdir | hex2oct) &&
+	# leading zero in mode: the content of the tree remains the same,
+	# but its oid does change!
+	printf "040000 subdir\0$bin_sha1" >rawtree &&
+	tree1=$(git hash-object -t tree -w rawtree) &&
+	git cat-file -p HEAD^{tree} >out &&
+	tree2=$(sed -e "s/$(git rev-parse HEAD:dir/)/$tree1/" out |git mktree) &&
+	different_but_same_tree=$(git commit-tree \
+		-m "leading zeros in mode" \
+		-p $(git rev-parse HEAD) $tree2) &&
+	git update-ref HEAD $different_but_same_tree &&
+
+	git commit-graph write --reachable --changed-paths >out &&
+	cat out  # debug
+'
+
+test_expect_success 'modify one path in root' '
+	git diff --name-status $mod_one_path^ $mod_one_path &&
+	echo "$mod_one_path  1" >expect &&
+	grep "$mod_one_path" out >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'modify two file two dirs deep' '
+	git diff --name-status $mod_four_paths^ $mod_four_paths &&
+	echo "$mod_four_paths  2" >expect &&
+	grep "$mod_four_paths" out >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'add new file in root' '
+	git diff --name-status $new_file_in_root^ $new_file_in_root &&
+	echo "$new_file_in_root  1" >expect &&
+	grep "$new_file_in_root" out >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'delete file in root' '
+	git diff --name-status $delete_file_in_root^ $delete_file_in_root &&
+	echo "$delete_file_in_root  1" >expect &&
+	grep "$delete_file_in_root" out >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'add new file in dir' '
+	git diff --name-status $new_file_in_dir^ $new_file_in_dir &&
+	echo "$new_file_in_dir  1" >expect &&
+	grep "$new_file_in_dir" out >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'delete file in dir' '
+	git diff --name-status $delete_file_in_dir^ $delete_file_in_dir &&
+	echo "$delete_file_in_dir  1" >expect &&
+	grep "$delete_file_in_dir" out >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'replace file with dir' '
+	git diff --name-status $file_to_dir^ $file_to_dir &&
+	echo "$file_to_dir  2" >expect &&
+	grep "$file_to_dir" out >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'replace dir with file' '
+	git diff --name-status $dir_to_file^ $dir_to_file &&
+	echo "$dir_to_file  2" >expect &&
+	grep "$dir_to_file" out >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'leading zeros in mode' '
+	git diff --name-status $different_but_same_tree^ $different_but_same_tree &&
+	echo "$different_but_same_tree  0" >expect &&
+	grep "$different_but_same_tree" out >actual &&
+	test_cmp expect actual
+'
+
+test_done
diff --git a/tree-diff.c b/tree-diff.c
index 6ebad1a46f3..7cebbb327e2 100644
--- a/tree-diff.c
+++ b/tree-diff.c
@@ -434,7 +434,7 @@ static struct combine_diff_path *ll_diff_tree_paths(
 		if (diff_can_quit_early(opt))
 			break;
 
-		if (opt->max_changes && opt->num_changes > opt->max_changes)
+		if (opt->max_changes && diff_queued_diff.nr > opt->max_changes)
 			break;
 
 		if (opt->pathspec.nr) {
@@ -521,7 +521,6 @@ static struct combine_diff_path *ll_diff_tree_paths(
 
 			/* t↓ */
 			update_tree_entry(&t);
-			opt->num_changes++;
 		}
 
 		/* t > p[imin] */
@@ -539,7 +538,6 @@ static struct combine_diff_path *ll_diff_tree_paths(
 		skip_emit_tp:
 			/* ∀ pi=p[imin]  pi↓ */
 			update_tp_entries(tp, nparent);
-			opt->num_changes++;
 		}
 	}
 
@@ -557,7 +555,6 @@ struct combine_diff_path *diff_tree_paths(
 	const struct object_id **parents_oid, int nparent,
 	struct strbuf *base, struct diff_options *opt)
 {
-	opt->num_changes = 0;
 	p = ll_diff_tree_paths(p, oid, parents_oid, nparent, base, opt);
 
 	/*



[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]

  Powered by Linux