On Tue, Aug 04, 2020 at 12:25:45PM -0400, Derrick Stolee wrote: > On 8/4/2020 10:47 AM, SZEDER Gábor wrote: > > On Mon, Apr 06, 2020 at 04:59:45PM +0000, Derrick Stolee via GitGitGadget wrote: > > This counter is basically broken, its value is wrong for over 98% of > > commits, and, worse, its value remains 0 for over 85% of commits in > > the repositories I usually use to test modified path Bloom filters. > > Consequently, a relatively large number of commits modifying more than > > 512 paths get Bloom filters. > > Thanks for finding this! The counter is only really tested in one > place, and that test only considers _file adds_, which is a problem. > > If I understand this correctly, the bug is a performance-only bug > (since this is a performance-only feature), but it is an important > one to fix. Or a performance-only feature in a performance-only feature, because those additional modified path Bloom filters can improve the runtime of pathspec-limited revision walks (assuming that the false positive rate is low enough). > There is certainly some dark magic happening in this tree-diff logic, > so instead of trying to get an accurate count we should just use the > magic global diff_queued_diff to track the current list of file changes. > > Note: diff_queued_diff does not track the directory changes, so it > is an under-count for the total changes to track in the Bloom filter. > This is later corrected by the block that adds these leading directory > changes. > > > The makeshift tests in the patch below demonstrate these issues as > > most of them fail, most notably those two tests that demonstrate that > > modifying existing paths are not counted at all. > > I adapted your diff along with ripping out 'num_changes' in favor > of diff_queued_diff.nr. This required modifying some of your expected > values in the test script (losing the leading directories in the > count). > > I'll work with Taylor to create a fix, and include proper testing > of the logic here. 
We'll stick it in the v2 of his max-changed-paths > series [1]. He already has some helpful logging that can help create > tests that ensure this logic is performing as expected. Don't forget to include a check of the hashmap's size, to make sure. FWIW, the patch below does result in the correct count (read: the same as in my implementation) for all but 4 commits in those repositories I use for testing, without adding any memory allocations and extra strcmp() calls. --- >8 --- diff --git a/cache.h b/cache.h index 0f0485ecfe..3fc7e1b427 100644 --- a/cache.h +++ b/cache.h @@ -1574,6 +1574,7 @@ int repo_interpret_branch_name(struct repository *r, int validate_headref(const char *ref); int base_name_compare(const char *name1, int len1, int mode1, const char *name2, int len2, int mode2); +int base_name_compare_df(const char *name1, int len1, int mode1, const char *name2, int len2, int mode2, int *df); int df_name_compare(const char *name1, int len1, int mode1, const char *name2, int len2, int mode2); int name_compare(const char *name1, size_t len1, const char *name2, size_t len2); int cache_name_stage_compare(const char *name1, int len1, int stage1, const char *name2, int len2, int stage2); diff --git a/read-cache.c b/read-cache.c index aa427c5c17..041af19e60 100644 --- a/read-cache.c +++ b/read-cache.c @@ -460,13 +460,16 @@ int ie_modified(struct index_state *istate, return 0; } -int base_name_compare(const char *name1, int len1, int mode1, - const char *name2, int len2, int mode2) +int base_name_compare_df(const char *name1, int len1, int mode1, + const char *name2, int len2, int mode2, + int *df) { unsigned char c1, c2; int len = len1 < len2 ? len1 : len2; int cmp; + *df = 0; + cmp = memcmp(name1, name2, len); if (cmp) return cmp; @@ -476,7 +479,21 @@ int base_name_compare(const char *name1, int len1, int mode1, c1 = '/'; if (!c2 && S_ISDIR(mode2)) c2 = '/'; - return (c1 < c2) ? -1 : (c1 > c2) ? 1 : 0; + if (c1 == c2) + return 0; /* TODO: is this even possible? 
*/ + if ((c1 == '/' && !c2) || + (!c1 && c2 == '/')) + *df = 1; + return (c1 < c2) ? -1 : 1; +} + +int base_name_compare(const char *name1, int len1, int mode1, + const char *name2, int len2, int mode2) +{ + int unused; + return base_name_compare_df(name1, len1, mode1, + name2, len2, mode2, + &unused); } /* diff --git a/t/t9999-test.sh b/t/t9999-test.sh index 8d2bd9f03f..4f08590b45 100755 --- a/t/t9999-test.sh +++ b/t/t9999-test.sh @@ -125,7 +125,7 @@ test_expect_success 'replace file with dir' ' test_cmp expect actual ' -test_expect_success 'replace dir with file' ' +test_expect_failure 'replace dir with file' ' git diff --name-status $dir_to_file^ $dir_to_file && echo "$dir_to_file 2" >expect && grep "$dir_to_file" out >actual && diff --git a/tree-diff.c b/tree-diff.c index f3d303c6e5..e27f9c805e 100644 --- a/tree-diff.c +++ b/tree-diff.c @@ -46,11 +46,14 @@ static int ll_diff_tree_oid(const struct object_id *old_oid, * Due to this convention, if trees are scanned in sorted order, all * non-empty descriptors will be processed first. */ -static int tree_entry_pathcmp(struct tree_desc *t1, struct tree_desc *t2) +static int tree_entry_pathcmp(struct tree_desc *t1, struct tree_desc *t2, + int *df) { struct name_entry *e1, *e2; int cmp; + *df = 0; + /* empty descriptors sort after valid tree entries */ if (!t1->size) return t2->size ? 
1 : 0; @@ -59,8 +62,9 @@ static int tree_entry_pathcmp(struct tree_desc *t1, struct tree_desc *t2) e1 = &t1->entry; e2 = &t2->entry; - cmp = base_name_compare(e1->path, tree_entry_len(e1), e1->mode, - e2->path, tree_entry_len(e2), e2->mode); + cmp = base_name_compare_df(e1->path, tree_entry_len(e1), e1->mode, + e2->path, tree_entry_len(e2), e2->mode, + df); return cmp; } @@ -410,7 +414,7 @@ static struct combine_diff_path *ll_diff_tree_paths( { struct tree_desc t, *tp; void *ttree, **tptree; - int i; + int i, df; FAST_ARRAY_ALLOC(tp, nparent); FAST_ARRAY_ALLOC(tptree, nparent); @@ -463,7 +467,7 @@ static struct combine_diff_path *ll_diff_tree_paths( tp[0].entry.mode &= ~S_IFXMIN_NEQ; for (i = 1; i < nparent; ++i) { - cmp = tree_entry_pathcmp(&tp[i], &tp[imin]); + cmp = tree_entry_pathcmp(&tp[i], &tp[imin], &df); if (cmp < 0) { imin = i; tp[i].entry.mode &= ~S_IFXMIN_NEQ; @@ -483,10 +487,12 @@ static struct combine_diff_path *ll_diff_tree_paths( /* compare t vs p[imin] */ - cmp = tree_entry_pathcmp(&t, &tp[imin]); + cmp = tree_entry_pathcmp(&t, &tp[imin], &df); /* t = p[imin] */ if (cmp == 0) { + int prev_num_changes = opt->num_changes; + /* are either pi > p[imin] or diff(t,pi) != ø ? 
*/ if (!opt->flags.find_copies_harder) { for (i = 0; i < nparent; ++i) { @@ -506,6 +512,9 @@ static struct combine_diff_path *ll_diff_tree_paths( /* D += {δ(t,pi) if pi=p[imin]; "+a" if pi > p[imin]} */ p = emit_path(p, base, opt, nparent, &t, tp, imin); + if (!(opt->num_changes == prev_num_changes && + S_ISDIR(t.entry.mode))) + opt->num_changes++; skip_emit_t_tp: /* t↓, ∀ pi=p[imin] pi↓ */ @@ -518,10 +527,11 @@ static struct combine_diff_path *ll_diff_tree_paths( /* D += "+t" */ p = emit_path(p, base, opt, nparent, &t, /*tp=*/NULL, -1); + if (!df) + opt->num_changes++; /* t↓ */ update_tree_entry(&t); - opt->num_changes++; } /* t > p[imin] */ @@ -535,11 +545,12 @@ static struct combine_diff_path *ll_diff_tree_paths( p = emit_path(p, base, opt, nparent, /*t=*/NULL, tp, imin); + if (!df) + opt->num_changes++; skip_emit_tp: /* ∀ pi=p[imin] pi↓ */ update_tp_entries(tp, nparent); - opt->num_changes++; } } --- >8 --- Having said that, the best (i.e. faster and more accurate) solution to this issue is probably: - Update the callchain between diff_tree_oid() and the diff callback functions to allow the callbacks to break diffing with a non-zero error code. - Fill Bloom filters using the approach presented in: https://public-inbox.org/git/20200529085038.26008-21-szeder.dev@xxxxxxxxx/ but modify the callbacks to return non-zero when too many paths have been processed. - Drop this counter entirely, as there are no other users. > We plan to have that fix available by later today or early tomorrow. > Will you be available to help validate it? 
> > [1] https://lore.kernel.org/git/cover.1596480582.git.me@xxxxxxxxxxxx/ > > Thanks, > -Stolee > > --- >8 --- > > diff --git a/bloom.c b/bloom.c > index 1a573226e7..b8d6cb9240 100644 > --- a/bloom.c > +++ b/bloom.c > @@ -218,8 +218,9 @@ struct bloom_filter *get_bloom_filter(struct repository *r, > else > diff_tree_oid(NULL, &c->object.oid, "", &diffopt); > diffcore_std(&diffopt); > + printf("%s %d\n", oid_to_hex(&c->object.oid), diff_queued_diff.nr); > > - if (diffopt.num_changes <= max_changes) { > + if (diff_queued_diff.nr <= max_changes) { > struct hashmap pathmap; > struct pathmap_hash_entry *e; > struct hashmap_iter iter; > diff --git a/diff.h b/diff.h > index e0c0af6286b..1d32b718857 100644 > --- a/diff.h > +++ b/diff.h > @@ -287,8 +287,6 @@ struct diff_options { > > /* If non-zero, then stop computing after this many changes. */ > int max_changes; > - /* For internal use only. */ > - int num_changes; > > int ita_invisible_in_index; > /* white-space error highlighting */ > diff --git a/t/t9999-test.sh b/t/t9999-test.sh > new file mode 100755 > index 00000000000..1f35aa8e2c5 > --- /dev/null > +++ b/t/t9999-test.sh > @@ -0,0 +1,142 @@ > +#!/bin/sh > + > +test_description='test' > + > +. 
./test-lib.sh > + > +test_expect_success 'setup' ' > + test_tick && > + > + echo 1 >file && > + mkdir -p dir/subdir && > + echo 1 >dir/subdir/file1 && > + echo 1 >dir/subdir/file2 && > + git add file dir && > + git commit -m setup && > + > + echo 2 >file && > + git commit -a -m "modify one path in root" && > + mod_one_path=$(git rev-parse HEAD) && > + > + echo 2 >dir/subdir/file1 && > + echo 2 >dir/subdir/file2 && > + git commit -a -m "modify two file two dirs deep" && > + mod_four_paths=$(git rev-parse HEAD) && > + > + >new-file && > + git add new-file && > + git commit -m "add new file in root" && > + new_file_in_root=$(git rev-parse HEAD) && > + > + git rm new-file && > + git commit -m "delete file in root" && > + delete_file_in_root=$(git rev-parse HEAD) && > + > + >dir/new-file && > + git add dir/new-file && > + git commit -m "add new file in dir" && > + new_file_in_dir=$(git rev-parse HEAD) && > + > + git rm dir/new-file && > + git commit -m "delete file in dir" && > + delete_file_in_dir=$(git rev-parse HEAD) && > + > + echo 1 >d-f && > + git add d-f && > + git commit -m foo && > + git rm d-f && > + mkdir d-f && > + echo 2 >d-f/file && > + git add d-f && > + git commit -m "replace file with dir" && > + file_to_dir=$(git rev-parse HEAD) && > + > + >d-f.c && > + git add d-f.c && > + git commit -m "add a file that sorts between d-f and d-f/" && > + git rm -r d-f && > + echo 3 >d-f && > + git add d-f && > + git commit -m "replace dir with file" && > + dir_to_file=$(git rev-parse HEAD) && > + > + bin_sha1=$(git rev-parse HEAD:dir/subdir | hex2oct) && > + # leading zero in mode: the content of the tree remains the same, > + # but its oid does change! 
> + printf "040000 subdir\0$bin_sha1" >rawtree && > + tree1=$(git hash-object -t tree -w rawtree) && > + git cat-file -p HEAD^{tree} >out && > + tree2=$(sed -e "s/$(git rev-parse HEAD:dir/)/$tree1/" out |git mktree) && > + different_but_same_tree=$(git commit-tree \ > + -m "leading zeros in mode" \ > + -p $(git rev-parse HEAD) $tree2) && > + git update-ref HEAD $different_but_same_tree && > + > + git commit-graph write --reachable --changed-paths >out && > + cat out # debug > +' > + > +test_expect_success 'modify one path in root' ' > + git diff --name-status $mod_one_path^ $mod_one_path && > + echo "$mod_one_path 1" >expect && > + grep "$mod_one_path" out >actual && > + test_cmp expect actual > +' > + > +test_expect_success 'modify two file two dirs deep' ' > + git diff --name-status $mod_four_paths^ $mod_four_paths && > + echo "$mod_four_paths 2" >expect && > + grep "$mod_four_paths" out >actual && > + test_cmp expect actual > +' > + > +test_expect_success 'add new file in root' ' > + git diff --name-status $new_file_in_root^ $new_file_in_root && > + echo "$new_file_in_root 1" >expect && > + grep "$new_file_in_root" out >actual && > + test_cmp expect actual > +' > + > +test_expect_success 'delete file in root' ' > + git diff --name-status $delete_file_in_root^ $delete_file_in_root && > + echo "$delete_file_in_root 1" >expect && > + grep "$delete_file_in_root" out >actual && > + test_cmp expect actual > +' > + > +test_expect_success 'add new file in dir' ' > + git diff --name-status $new_file_in_dir^ $new_file_in_dir && > + echo "$new_file_in_dir 1" >expect && > + grep "$new_file_in_dir" out >actual && > + test_cmp expect actual > +' > + > +test_expect_success 'delete file in dir' ' > + git diff --name-status $delete_file_in_dir^ $delete_file_in_dir && > + echo "$delete_file_in_dir 1" >expect && > + grep "$delete_file_in_dir" out >actual && > + test_cmp expect actual > +' > + > +test_expect_success 'replace file with dir' ' > + git diff --name-status $file_to_dir^ 
$file_to_dir && > + echo "$file_to_dir 2" >expect && > + grep "$file_to_dir" out >actual && > + test_cmp expect actual > +' > + > +test_expect_success 'replace dir with file' ' > + git diff --name-status $dir_to_file^ $dir_to_file && > + echo "$dir_to_file 2" >expect && > + grep "$dir_to_file" out >actual && > + test_cmp expect actual > +' > + > +test_expect_success 'leading zeros in mode' ' > + git diff --name-status $different_but_same_tree^ $different_but_same_tree && > + echo "$different_but_same_tree 0" >expect && > + grep "$different_but_same_tree" out >actual && > + test_cmp expect actual > +' > + > +test_done > diff --git a/tree-diff.c b/tree-diff.c > index 6ebad1a46f3..7cebbb327e2 100644 > --- a/tree-diff.c > +++ b/tree-diff.c > @@ -434,7 +434,7 @@ static struct combine_diff_path *ll_diff_tree_paths( > if (diff_can_quit_early(opt)) > break; > > - if (opt->max_changes && opt->num_changes > opt->max_changes) > + if (opt->max_changes && diff_queued_diff.nr > opt->max_changes) > break; > > if (opt->pathspec.nr) { > @@ -521,7 +521,6 @@ static struct combine_diff_path *ll_diff_tree_paths( > > /* t↓ */ > update_tree_entry(&t); > - opt->num_changes++; > } > > /* t > p[imin] */ > @@ -539,7 +538,6 @@ static struct combine_diff_path *ll_diff_tree_paths( > skip_emit_tp: > /* ∀ pi=p[imin] pi↓ */ > update_tp_entries(tp, nparent); > - opt->num_changes++; > } > } > > @@ -557,7 +555,6 @@ struct combine_diff_path *diff_tree_paths( > const struct object_id **parents_oid, int nparent, > struct strbuf *base, struct diff_options *opt) > { > - opt->num_changes = 0; > p = ll_diff_tree_paths(p, oid, parents_oid, nparent, base, opt); > > /*