Re: [PATCH v4 05/15] diff: halt tree-diff early after max_changes

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, Aug 04, 2020 at 12:25:45PM -0400, Derrick Stolee wrote:
> On 8/4/2020 10:47 AM, SZEDER Gábor wrote:
> > On Mon, Apr 06, 2020 at 04:59:45PM +0000, Derrick Stolee via GitGitGadget wrote:
> > This counter is basically broken, its value is wrong for over 98% of
> > commits, and, worse, its value remains 0 for over 85% of commits in
> > the repositories I usually use to test modified path Bloom filters.
> > Consequently, a relatively large number of commits modifying more than
> > 512 paths get Bloom filters.
> 
> Thanks for finding this! The counter is only really tested in one
> place, and that test only considers _file adds_, which is a problem.
> 
> If I understand this correctly, the bug is a performance-only bug
> (since this is a performance-only feature), but it is an important
> one to fix.

Or a performance-only feature in a performance-only feature, because
those additional modified path Bloom filters can improve the runtime
of pathspec-limited revision walks (assuming that the false positive
rate is low enough).

> There is certainly some dark magic happening in this tree-diff logic,
> so instead of trying to get an accurate count we should just use the
> magic global diff_queued_diff to track the current list of file changes.
> 
> Note: diff_queued_diff does not track the directory changes, so it
> is an under-count for the total changes to track in the Bloom filter.
> This is later corrected by the block that adds these leading directory
> changes.
> 
> > The makeshift tests in the patch below demonstrate these issues as
> > most of them fail, most notably those two tests that demonstrate that
> > modifying existing paths are not counted at all.
> 
> I adapted your diff along with ripping out 'num_changes' in favor
> of diff_queued_diff.nr. This required modifying some of your expected
> values in the test script (losing the leading directories in the
> count).
> 
> I'll work with Taylor to create a fix, and include proper testing
> of the logic here. We'll stick it in the v2 of his max-changed-paths
> series [1]. He already has some helpful logging that can help create
> tests that ensure this logic is performing as expected.

Don't forget to include a check of the hashmap's size, to make sure.

FWIW, the patch below does result in the correct count (read: the same
as in my implemenation) for all but 4 commits in those repositories I
use for testing, without adding any memory allocations and extra
strcmp() calls.

  ---  >8  ---

diff --git a/cache.h b/cache.h
index 0f0485ecfe..3fc7e1b427 100644
--- a/cache.h
+++ b/cache.h
@@ -1574,6 +1574,7 @@ int repo_interpret_branch_name(struct repository *r,
 int validate_headref(const char *ref);
 
 int base_name_compare(const char *name1, int len1, int mode1, const char *name2, int len2, int mode2);
+int base_name_compare_df(const char *name1, int len1, int mode1, const char *name2, int len2, int mode2, int *df);
 int df_name_compare(const char *name1, int len1, int mode1, const char *name2, int len2, int mode2);
 int name_compare(const char *name1, size_t len1, const char *name2, size_t len2);
 int cache_name_stage_compare(const char *name1, int len1, int stage1, const char *name2, int len2, int stage2);
diff --git a/read-cache.c b/read-cache.c
index aa427c5c17..041af19e60 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -460,13 +460,16 @@ int ie_modified(struct index_state *istate,
 	return 0;
 }
 
-int base_name_compare(const char *name1, int len1, int mode1,
-		      const char *name2, int len2, int mode2)
+int base_name_compare_df(const char *name1, int len1, int mode1,
+			 const char *name2, int len2, int mode2,
+			 int *df)
 {
 	unsigned char c1, c2;
 	int len = len1 < len2 ? len1 : len2;
 	int cmp;
 
+	*df = 0;
+
 	cmp = memcmp(name1, name2, len);
 	if (cmp)
 		return cmp;
@@ -476,7 +479,21 @@ int base_name_compare(const char *name1, int len1, int mode1,
 		c1 = '/';
 	if (!c2 && S_ISDIR(mode2))
 		c2 = '/';
-	return (c1 < c2) ? -1 : (c1 > c2) ? 1 : 0;
+	if (c1 == c2)
+		return 0;	/* TODO: is this even possible? */
+	if ((c1 == '/' && !c2) ||
+	    (!c1 && c2 == '/'))
+		*df = 1;
+	return (c1 < c2) ? -1 : 1;
+}
+
+int base_name_compare(const char *name1, int len1, int mode1,
+		      const char *name2, int len2, int mode2)
+{
+	int unused;
+	return base_name_compare_df(name1, len1, mode1,
+				    name2, len2, mode2,
+				    &unused);
 }
 
 /*
diff --git a/t/t9999-test.sh b/t/t9999-test.sh
index 8d2bd9f03f..4f08590b45 100755
--- a/t/t9999-test.sh
+++ b/t/t9999-test.sh
@@ -125,7 +125,7 @@ test_expect_success 'replace file with dir' '
 	test_cmp expect actual
 '
 
-test_expect_success 'replace dir with file' '
+test_expect_failure 'replace dir with file' '
 	git diff --name-status $dir_to_file^ $dir_to_file &&
 	echo "$dir_to_file  2" >expect &&
 	grep "$dir_to_file" out >actual &&
diff --git a/tree-diff.c b/tree-diff.c
index f3d303c6e5..e27f9c805e 100644
--- a/tree-diff.c
+++ b/tree-diff.c
@@ -46,11 +46,14 @@ static int ll_diff_tree_oid(const struct object_id *old_oid,
  *      Due to this convention, if trees are scanned in sorted order, all
  *      non-empty descriptors will be processed first.
  */
-static int tree_entry_pathcmp(struct tree_desc *t1, struct tree_desc *t2)
+static int tree_entry_pathcmp(struct tree_desc *t1, struct tree_desc *t2,
+			      int *df)
 {
 	struct name_entry *e1, *e2;
 	int cmp;
 
+	*df = 0;
+
 	/* empty descriptors sort after valid tree entries */
 	if (!t1->size)
 		return t2->size ? 1 : 0;
@@ -59,8 +62,9 @@ static int tree_entry_pathcmp(struct tree_desc *t1, struct tree_desc *t2)
 
 	e1 = &t1->entry;
 	e2 = &t2->entry;
-	cmp = base_name_compare(e1->path, tree_entry_len(e1), e1->mode,
-				e2->path, tree_entry_len(e2), e2->mode);
+	cmp = base_name_compare_df(e1->path, tree_entry_len(e1), e1->mode,
+				   e2->path, tree_entry_len(e2), e2->mode,
+				   df);
 	return cmp;
 }
 
@@ -410,7 +414,7 @@ static struct combine_diff_path *ll_diff_tree_paths(
 {
 	struct tree_desc t, *tp;
 	void *ttree, **tptree;
-	int i;
+	int i, df;
 
 	FAST_ARRAY_ALLOC(tp, nparent);
 	FAST_ARRAY_ALLOC(tptree, nparent);
@@ -463,7 +467,7 @@ static struct combine_diff_path *ll_diff_tree_paths(
 		tp[0].entry.mode &= ~S_IFXMIN_NEQ;
 
 		for (i = 1; i < nparent; ++i) {
-			cmp = tree_entry_pathcmp(&tp[i], &tp[imin]);
+			cmp = tree_entry_pathcmp(&tp[i], &tp[imin], &df);
 			if (cmp < 0) {
 				imin = i;
 				tp[i].entry.mode &= ~S_IFXMIN_NEQ;
@@ -483,10 +487,12 @@ static struct combine_diff_path *ll_diff_tree_paths(
 
 
 		/* compare t vs p[imin] */
-		cmp = tree_entry_pathcmp(&t, &tp[imin]);
+		cmp = tree_entry_pathcmp(&t, &tp[imin], &df);
 
 		/* t = p[imin] */
 		if (cmp == 0) {
+			int prev_num_changes = opt->num_changes;
+
 			/* are either pi > p[imin] or diff(t,pi) != ø ? */
 			if (!opt->flags.find_copies_harder) {
 				for (i = 0; i < nparent; ++i) {
@@ -506,6 +512,9 @@ static struct combine_diff_path *ll_diff_tree_paths(
 			/* D += {δ(t,pi) if pi=p[imin];  "+a" if pi > p[imin]} */
 			p = emit_path(p, base, opt, nparent,
 					&t, tp, imin);
+			if (!(opt->num_changes == prev_num_changes &&
+			      S_ISDIR(t.entry.mode)))
+				opt->num_changes++;
 
 		skip_emit_t_tp:
 			/* t↓,  ∀ pi=p[imin]  pi↓ */
@@ -518,10 +527,11 @@ static struct combine_diff_path *ll_diff_tree_paths(
 			/* D += "+t" */
 			p = emit_path(p, base, opt, nparent,
 					&t, /*tp=*/NULL, -1);
+			if (!df)
+				opt->num_changes++;
 
 			/* t↓ */
 			update_tree_entry(&t);
-			opt->num_changes++;
 		}
 
 		/* t > p[imin] */
@@ -535,11 +545,12 @@ static struct combine_diff_path *ll_diff_tree_paths(
 
 			p = emit_path(p, base, opt, nparent,
 					/*t=*/NULL, tp, imin);
+			if (!df)
+				opt->num_changes++;
 
 		skip_emit_tp:
 			/* ∀ pi=p[imin]  pi↓ */
 			update_tp_entries(tp, nparent);
-			opt->num_changes++;
 		}
 	}
 
  ---  >8  ---


Having said that, the best (i.e faster and accurate) solution to this
issue is probably:

  - Update the callchain between diff_tree_oid() and the diff callback
    functions to allow the callbacks to break diffing with a non-zero
    error code.

  - Fill Bloom filters using the approach presented in:

      https://public-inbox.org/git/20200529085038.26008-21-szeder.dev@xxxxxxxxx/

    but modify the callbacks to return non-zero when too many paths
    have been processed.

  - Drop this counter entirely, as there are no other users.

> We plan to have that fix available by later today or early tomorrow.
> Will you be available to help validate it?
> 
> [1] https://lore.kernel.org/git/cover.1596480582.git.me@xxxxxxxxxxxx/
> 
> Thanks,
> -Stolee
> 
>   --- >8 ---
> 
> diff --git a/bloom.c b/bloom.c
> index 1a573226e7..b8d6cb9240 100644
> --- a/bloom.c
> +++ b/bloom.c
> @@ -218,8 +218,9 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
>  	else
>  		diff_tree_oid(NULL, &c->object.oid, "", &diffopt);
>  	diffcore_std(&diffopt);
> +	printf("%s  %d\n", oid_to_hex(&c->object.oid), diff_queued_diff.nr);
>  
> -	if (diffopt.num_changes <= max_changes) {
> +	if (diff_queued_diff.nr <= max_changes) {
>  		struct hashmap pathmap;
>  		struct pathmap_hash_entry *e;
>  		struct hashmap_iter iter;
> diff --git a/diff.h b/diff.h
> index e0c0af6286b..1d32b718857 100644
> --- a/diff.h
> +++ b/diff.h
> @@ -287,8 +287,6 @@ struct diff_options {
>  
>  	/* If non-zero, then stop computing after this many changes. */
>  	int max_changes;
> -	/* For internal use only. */
> -	int num_changes;
>  
>  	int ita_invisible_in_index;
>  /* white-space error highlighting */
> diff --git a/t/t9999-test.sh b/t/t9999-test.sh
> new file mode 100755
> index 00000000000..1f35aa8e2c5
> --- /dev/null
> +++ b/t/t9999-test.sh
> @@ -0,0 +1,142 @@
> +#!/bin/sh
> +
> +test_description='test'
> +
> +. ./test-lib.sh
> +
> +test_expect_success 'setup' '
> +	test_tick &&
> +
> +	echo 1 >file &&
> +	mkdir -p dir/subdir &&
> +	echo 1 >dir/subdir/file1 &&
> +	echo 1 >dir/subdir/file2 &&
> +	git add file dir &&
> +	git commit -m setup &&
> +
> +	echo 2 >file &&
> +	git commit -a -m "modify one path in root" &&
> +	mod_one_path=$(git rev-parse HEAD) &&
> +
> +	echo 2 >dir/subdir/file1 &&
> +	echo 2 >dir/subdir/file2 &&
> +	git commit -a -m "modify two file two dirs deep" &&
> +	mod_four_paths=$(git rev-parse HEAD) &&
> +
> +	>new-file &&
> +	git add new-file &&
> +	git commit -m "add new file in root" &&
> +	new_file_in_root=$(git rev-parse HEAD) &&
> +
> +	git rm new-file &&
> +	git commit -m "delete file in root" &&
> +	delete_file_in_root=$(git rev-parse HEAD) &&
> +
> +	>dir/new-file &&
> +	git add dir/new-file &&
> +	git commit -m "add new file in dir" &&
> +	new_file_in_dir=$(git rev-parse HEAD) &&
> +
> +	git rm dir/new-file &&
> +	git commit -m "delete file in dir" &&
> +	delete_file_in_dir=$(git rev-parse HEAD) &&
> +
> +	echo 1 >d-f &&
> +	git add d-f &&
> +	git commit -m foo &&
> +	git rm d-f &&
> +	mkdir d-f &&
> +	echo 2 >d-f/file &&
> +	git add d-f &&
> +	git commit -m "replace file with dir" &&
> +	file_to_dir=$(git rev-parse HEAD) &&
> +
> +	>d-f.c &&
> +	git add d-f.c &&
> +	git commit -m "add a file that sorts between d-f and d-f/" &&
> +	git rm -r d-f &&
> +	echo 3 >d-f &&
> +	git add d-f &&
> +	git commit -m "replace dir with file" &&
> +	dir_to_file=$(git rev-parse HEAD) &&
> +
> +	bin_sha1=$(git rev-parse HEAD:dir/subdir | hex2oct) &&
> +	# leading zero in mode: the content of the tree remains the same,
> +	# but its oid does change!
> +	printf "040000 subdir\0$bin_sha1" >rawtree &&
> +	tree1=$(git hash-object -t tree -w rawtree) &&
> +	git cat-file -p HEAD^{tree} >out &&
> +	tree2=$(sed -e "s/$(git rev-parse HEAD:dir/)/$tree1/" out |git mktree) &&
> +	different_but_same_tree=$(git commit-tree \
> +		-m "leading zeros in mode" \
> +		-p $(git rev-parse HEAD) $tree2) &&
> +	git update-ref HEAD $different_but_same_tree &&
> +
> +	git commit-graph write --reachable --changed-paths >out &&
> +	cat out  # debug
> +'
> +
> +test_expect_success 'modify one path in root' '
> +	git diff --name-status $mod_one_path^ $mod_one_path &&
> +	echo "$mod_one_path  1" >expect &&
> +	grep "$mod_one_path" out >actual &&
> +	test_cmp expect actual
> +'
> +
> +test_expect_success 'modify two file two dirs deep' '
> +	git diff --name-status $mod_four_paths^ $mod_four_paths &&
> +	echo "$mod_four_paths  2" >expect &&
> +	grep "$mod_four_paths" out >actual &&
> +	test_cmp expect actual
> +'
> +
> +test_expect_success 'add new file in root' '
> +	git diff --name-status $new_file_in_root^ $new_file_in_root &&
> +	echo "$new_file_in_root  1" >expect &&
> +	grep "$new_file_in_root" out >actual &&
> +	test_cmp expect actual
> +'
> +
> +test_expect_success 'delete file in root' '
> +	git diff --name-status $delete_file_in_root^ $delete_file_in_root &&
> +	echo "$delete_file_in_root  1" >expect &&
> +	grep "$delete_file_in_root" out >actual &&
> +	test_cmp expect actual
> +'
> +
> +test_expect_success 'add new file in dir' '
> +	git diff --name-status $new_file_in_dir^ $new_file_in_dir &&
> +	echo "$new_file_in_dir  1" >expect &&
> +	grep "$new_file_in_dir" out >actual &&
> +	test_cmp expect actual
> +'
> +
> +test_expect_success 'delete file in dir' '
> +	git diff --name-status $delete_file_in_dir^ $delete_file_in_dir &&
> +	echo "$delete_file_in_dir  1" >expect &&
> +	grep "$delete_file_in_dir" out >actual &&
> +	test_cmp expect actual
> +'
> +
> +test_expect_success 'replace file with dir' '
> +	git diff --name-status $file_to_dir^ $file_to_dir &&
> +	echo "$file_to_dir  2" >expect &&
> +	grep "$file_to_dir" out >actual &&
> +	test_cmp expect actual
> +'
> +
> +test_expect_success 'replace dir with file' '
> +	git diff --name-status $dir_to_file^ $dir_to_file &&
> +	echo "$dir_to_file  2" >expect &&
> +	grep "$dir_to_file" out >actual &&
> +	test_cmp expect actual
> +'
> +
> +test_expect_success 'leading zeros in mode' '
> +	git diff --name-status $different_but_same_tree^ $different_but_same_tree &&
> +	echo "$different_but_same_tree  0" >expect &&
> +	grep "$different_but_same_tree" out >actual &&
> +	test_cmp expect actual
> +'
> +
> +test_done
> diff --git a/tree-diff.c b/tree-diff.c
> index 6ebad1a46f3..7cebbb327e2 100644
> --- a/tree-diff.c
> +++ b/tree-diff.c
> @@ -434,7 +434,7 @@ static struct combine_diff_path *ll_diff_tree_paths(
>  		if (diff_can_quit_early(opt))
>  			break;
>  
> -		if (opt->max_changes && opt->num_changes > opt->max_changes)
> +		if (opt->max_changes && diff_queued_diff.nr > opt->max_changes)
>  			break;
>  
>  		if (opt->pathspec.nr) {
> @@ -521,7 +521,6 @@ static struct combine_diff_path *ll_diff_tree_paths(
>  
>  			/* t↓ */
>  			update_tree_entry(&t);
> -			opt->num_changes++;
>  		}
>  
>  		/* t > p[imin] */
> @@ -539,7 +538,6 @@ static struct combine_diff_path *ll_diff_tree_paths(
>  		skip_emit_tp:
>  			/* ∀ pi=p[imin]  pi↓ */
>  			update_tp_entries(tp, nparent);
> -			opt->num_changes++;
>  		}
>  	}
>  
> @@ -557,7 +555,6 @@ struct combine_diff_path *diff_tree_paths(
>  	const struct object_id **parents_oid, int nparent,
>  	struct strbuf *base, struct diff_options *opt)
>  {
> -	opt->num_changes = 0;
>  	p = ll_diff_tree_paths(p, oid, parents_oid, nparent, base, opt);
>  
>  	/*



[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]

  Powered by Linux