Teach the patience diff to attempt preventing user-specified lines from appearing as a deletion or addition in the end result. The end user can use this by specifying "--anchor=<text>" one or more times when using Git commands like "diff" and "show". Signed-off-by: Jonathan Tan <jonathantanmy@xxxxxxxxxx> --- Actual patch instead of RFC. One thing that might help is to warn if --anchor is used without --patience, but I couldn't find a good place to put that warning. Let me know if you know of a good place. Replying to Stefan's and Junio's comments: > The solution you provide is a good thing to experiment with, but > longer term, I would want to have huge record of configs in which > humans selected the best diff, such that we can use that data > to reason about better automatic diff generation. > The diff heuristic was based on a lot of human generated data, > that was generated by Michael at the time. I wonder if we want to > permanently store the anchor so the data collection will happen > automatically over time. I think machine learning is beyond the scope of this patch :-) > or rather: "c is not moved, we don't care how the diff actually looks > like", > so maybe > ! grep "+c" diff I think it's less error-prone to show "a" moving. With this, if the command somehow prints nothing, the test would still pass. > While this is RFC, I should not expect comments, though it would > be nice to have them in the final series. ;-) Added comments :-) > This is a natural extension of the idea the patience algorithm is > built upon. If this were a cumulative command line option that can > be given to specify multiple lines and can be used across the diff > family, it would make a welcome addition, I would think. Specifying multiple lines - done. By diff family, do you mean "diff", "show", and others? If yes, that has also been done in this patch. --- Documentation/diff-options.txt | 9 +++++++ diff.c | 8 ++++++ diff.h | 4 +++ t/t4033-diff-patience.sh | 59 ++++++++++++++++++++++++++++++++++++++++++ xdiff/xdiff.h | 4 +++ xdiff/xpatience.c | 42 ++++++++++++++++++++++++++---- 6 files changed, 121 insertions(+), 5 deletions(-) diff --git a/Documentation/diff-options.txt b/Documentation/diff-options.txt index dd0dba5b1..b17d62696 100644 --- a/Documentation/diff-options.txt +++ b/Documentation/diff-options.txt @@ -100,6 +100,15 @@ For instance, if you configured diff.algorithm variable to a non-default value and want to use the default one, then you have to use `--diff-algorithm=default` option. +--anchor=<text>:: + This option may be specified more than once. ++ +If the "patience diff" algorithm is used, Git attempts to prevent lines +starting with this text from appearing as a deletion or addition in the +output. ++ +If another diff algorithm is used, this has no effect. + --stat[=<width>[,<name-width>[,<count>]]]:: Generate a diffstat. By default, as much space as necessary will be used for the filename part, and the rest for the graph diff --git a/diff.c b/diff.c index 0763e8926..29a7176ee 100644 --- a/diff.c +++ b/diff.c @@ -3210,6 +3210,8 @@ static void builtin_diff(const char *name_a, ecbdata.opt = o; ecbdata.header = header.len ? &header : NULL; xpp.flags = o->xdl_opts; + xpp.anchors = o->anchors; + xpp.anchors_nr = o->anchors_nr; xecfg.ctxlen = o->context; xecfg.interhunkctxlen = o->interhunkcontext; xecfg.flags = XDL_EMIT_FUNCNAMES; @@ -3302,6 +3304,8 @@ static void builtin_diffstat(const char *name_a, const char *name_b, memset(&xpp, 0, sizeof(xpp)); memset(&xecfg, 0, sizeof(xecfg)); xpp.flags = o->xdl_opts; + xpp.anchors = o->anchors; + xpp.anchors_nr = o->anchors_nr; xecfg.ctxlen = o->context; xecfg.interhunkctxlen = o->interhunkcontext; if (xdi_diff_outf(&mf1, &mf2, diffstat_consume, diffstat, @@ -4608,6 +4612,10 @@ int diff_opt_parse(struct diff_options *options, options->xdl_opts &= ~XDF_DIFF_ALGORITHM_MASK; options->xdl_opts |= value; return argcount; + } else if (skip_prefix(arg, "--anchor=", &arg)) { + ALLOC_GROW(options->anchors, options->anchors_nr + 1, + options->anchors_alloc); + options->anchors[options->anchors_nr++] = xstrdup(arg); } /* flags options */ diff --git a/diff.h b/diff.h index 0fb18dd73..7cf276f07 100644 --- a/diff.h +++ b/diff.h @@ -166,6 +166,10 @@ struct diff_options { const char *stat_sep; long xdl_opts; + /* see Documentation/diff-options.txt */ + char **anchors; + size_t anchors_nr, anchors_alloc; + int stat_width; int stat_name_width; int stat_graph_width; diff --git a/t/t4033-diff-patience.sh b/t/t4033-diff-patience.sh index 113304dc5..2d00d1056 100755 --- a/t/t4033-diff-patience.sh +++ b/t/t4033-diff-patience.sh @@ -13,6 +13,65 @@ test_expect_success '--ignore-space-at-eol with a single appended character' ' grep "^+.*X" diff ' +test_expect_success '--anchor' ' + printf "a\nb\nc\n" >pre && + printf "c\na\nb\n" >post && + + # without anchor, c is moved + test_expect_code 1 git diff --no-index --patience pre post >diff && + grep "^+c" diff && + + # with anchor, a is moved + test_expect_code 1 git diff --no-index --patience --anchor=c pre post >diff && + grep "^+a" diff +' + +test_expect_success '--anchor multiple' ' + printf "a\nb\nc\nd\ne\nf\n" >pre && + printf "c\na\nb\nf\nd\ne\n" >post && + + # with 1 anchor, c is not moved, but f is moved + test_expect_code 1 git diff --no-index --patience --anchor=c pre post >diff && + grep "^+a" diff && # a is moved instead of c + grep "^+f" diff && + + # with 2 anchors, c and f are not moved + test_expect_code 1 git diff --no-index --patience --anchor=c --anchor=f pre post >diff && + grep "^+a" diff && + grep "^+d" diff # d is moved instead of f +' + +test_expect_success '--anchor with nonexistent line has no effect' ' + printf "a\nb\nc\n" >pre && + printf "c\na\nb\n" >post && + + test_expect_code 1 git diff --no-index --patience --anchor=x pre post >diff && + grep "^+c" diff +' + +test_expect_success 'diff still produced with impossible multiple --anchor' ' + printf "a\nb\nc\n" >pre && + printf "c\na\nb\n" >post && + + test_expect_code 1 git diff --no-index --patience --anchor=a --anchor=c pre post >diff && + mv post expected_post && + git apply diff && + diff expected_post post # make sure that the diff is correct +' + +test_expect_success '--anchor works with other commands like "git show"' ' + printf "a\nb\nc\n" >file && + git add file && + git commit -m foo && + printf "c\na\nb\n" >file && + git add file && + git commit -m foo && + + # with anchor, a is moved + git show --patience --anchor=c >diff && + grep "^+a" diff +' + test_diff_frobnitz "patience" test_diff_unique "patience" diff --git a/xdiff/xdiff.h b/xdiff/xdiff.h index 785ecb089..e6217e1d7 100644 --- a/xdiff/xdiff.h +++ b/xdiff/xdiff.h @@ -80,6 +80,10 @@ typedef struct s_mmbuffer { typedef struct s_xpparam { unsigned long flags; + + /* See Documentation/diff-options.txt. */ + char **anchors; + size_t anchors_nr; } xpparam_t; typedef struct s_xdemitcb { diff --git a/xdiff/xpatience.c b/xdiff/xpatience.c index a44e77632..f55c9fb11 100644 --- a/xdiff/xpatience.c +++ b/xdiff/xpatience.c @@ -62,6 +62,12 @@ struct hashmap { * initially, "next" reflects only the order in file1. */ struct entry *next, *previous; + + /* + * If 1, this entry can serve as the anchor. See + * Documentation/diff-options.txt for more information. + */ + unsigned anchor : 1; } *entries, *first, *last; /* were common records found? */ unsigned long has_matches; @@ -70,8 +76,19 @@ struct hashmap { xpparam_t const *xpp; }; +static int is_anchor(xpparam_t const *xpp, const char *line) +{ + int i; + for (i = 0; i < xpp->anchors_nr; i++) { + if (!strncmp(line, xpp->anchors[i], strlen(xpp->anchors[i]))) + return 1; + } + return 0; +} + /* The argument "pass" is 1 for the first file, 2 for the second. */ -static void insert_record(int line, struct hashmap *map, int pass) +static void insert_record(xpparam_t const *xpp, int line, struct hashmap *map, + int pass) { xrecord_t **records = pass == 1 ? map->env->xdf1.recs : map->env->xdf2.recs; @@ -110,6 +127,7 @@ static void insert_record(int line, struct hashmap *map, int pass) return; map->entries[index].line1 = line; map->entries[index].hash = record->ha; + map->entries[index].anchor = is_anchor(xpp, map->env->xdf1.recs[line - 1]->ptr); if (!map->first) map->first = map->entries + index; if (map->last) { @@ -147,11 +165,11 @@ static int fill_hashmap(mmfile_t *file1, mmfile_t *file2, /* First, fill with entries from the first file */ while (count1--) - insert_record(line1++, result, 1); + insert_record(xpp, line1++, result, 1); /* Then search for matches in the second file */ while (count2--) - insert_record(line2++, result, 2); + insert_record(xpp, line2++, result, 2); return 0; } @@ -192,14 +210,28 @@ static struct entry *find_longest_common_sequence(struct hashmap *map) int longest = 0, i; struct entry *entry; + /* + * If not -1, this entry in sequence must never be overridden. + * Therefore, overriding entries before this has no effect, so + * do not do that either. + */ + int anchor_i = -1; + for (entry = map->first; entry; entry = entry->next) { if (!entry->line2 || entry->line2 == NON_UNIQUE) continue; i = binary_search(sequence, longest, entry); entry->previous = i < 0 ? NULL : sequence[i]; - sequence[++i] = entry; - if (i == longest) + ++i; + if (i <= anchor_i) + continue; + sequence[i] = entry; + if (entry->anchor) { + anchor_i = i; + longest = anchor_i + 1; + } else if (i == longest) { longest++; + } } /* No common unique lines were found */ -- 2.15.0.448.gf294e3d99a-goog