From: Torsten Bögershausen <tboegi@xxxxxx> When blobs are encoded in UTF-16, `git diff` will treat them as binary. Make it possible to show a user-readable diff encoded in UTF-8. This allows running git diff and feeding the output into a web server. Improve Git to look at the "encoding" attribute and to reencode the content into UTF-8 before running the diff itself. Signed-off-by: Torsten Bögershausen <tboegi@xxxxxx> --- Documentation/diff-options.txt | 4 ++ Documentation/gitattributes.txt | 9 +++++ convert.c | 40 +++++++++++++++++++ convert.h | 2 + diff.c | 38 ++++++++++++++++-- diff.h | 1 + diffcore.h | 3 ++ t/t4066-diff-encoding.sh | 86 +++++++++++++++++++++++++++++++++++++++++ 8 files changed, 180 insertions(+), 3 deletions(-) create mode 100755 t/t4066-diff-encoding.sh diff --git a/Documentation/diff-options.txt b/Documentation/diff-options.txt index 9d1586b956..bf2f115f11 100644 --- a/Documentation/diff-options.txt +++ b/Documentation/diff-options.txt @@ -629,6 +629,10 @@ endif::git-format-patch[] linkgit:git-log[1], but not for linkgit:git-format-patch[1] or diff plumbing commands. +--UTF-8:: + Git converts the content into UTF-8 before running the diff when the + "encoding" attribute is defined. See linkgit:gitattributes[5]. + --ignore-submodules[=<when>]:: Ignore changes to submodules in the diff generation. <when> can be either "none", "untracked", "dirty" or "all", which is the default. diff --git a/Documentation/gitattributes.txt b/Documentation/gitattributes.txt index 30687de81a..753a7c39b7 100644 --- a/Documentation/gitattributes.txt +++ b/Documentation/gitattributes.txt @@ -881,6 +881,15 @@ advantages to choosing this method: 3. Caching. Textconv caching can speed up repeated diffs, such as those you might trigger by running `git log -p`. +Running diff on UTF-16 encoded files +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Git can convert UTF-16 encoded files into UTF-8 before they are fed +into the diff machinery: `diff --UTF-8 file.xxx`. 
+ +------------------------ +file.xxx encoding=UTF-16 +------------------------ Marking files as binary ^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/convert.c b/convert.c index 5efcc3b73b..45577ce504 100644 --- a/convert.c +++ b/convert.c @@ -7,6 +7,7 @@ #include "sigchain.h" #include "pkt-line.h" #include "sub-process.h" +#include "utf8.h" /* * convert.c - convert a file when checking it out and checking it in. @@ -734,6 +735,34 @@ static struct convert_driver { int required; } *user_convert, **user_convert_tail; +const char *get_encoding_attr(const char *path) +{ + static struct attr_check *check; + if (!check) + check = attr_check_initl("encoding", NULL); + if (!git_check_attr(path, check)) { + struct attr_check_item *ccheck = check->items; + const char *value; + value = ccheck->value; + if (ATTR_UNSET(value)) + return NULL; + return value; + } + return NULL; +} + +static int reencode_into_strbuf(const char *path, const char *src, size_t len, + struct strbuf *dst, const char *encoding) +{ + int outsz = 0; + char *buf; + buf = reencode_string_len(src, (int)len, "UTF-8", encoding, &outsz); + if (!buf) + return 0; + strbuf_attach(dst, buf, outsz, outsz); + return SAFE_CRLF_REENCODE; +} + static int apply_filter(const char *path, const char *src, size_t len, int fd, struct strbuf *dst, struct convert_driver *drv, const unsigned int wanted_capability, @@ -1136,6 +1165,17 @@ int convert_to_git(const struct index_state *istate, convert_attrs(&ca, path); + if (checksafe & SAFE_CRLF_REENCODE) { + const char *encoding = get_encoding_attr(path); + if (encoding) { + ret |= reencode_into_strbuf(path, src, len, dst, + encoding); + if (ret && dst) { + src = dst->buf; + len = dst->len; + } + } + } ret |= apply_filter(path, src, len, -1, dst, ca.drv, CAP_CLEAN, NULL); if (!ret && ca.drv && ca.drv->required) die("%s: clean filter '%s' failed", path, ca.drv->name); diff --git a/convert.h b/convert.h index 532af00423..0b093715c9 100644 --- a/convert.h +++ b/convert.h @@ -13,6 +13,7 @@ 
struct index_state; #define SAFE_CRLF_WARN (1<<1) #define SAFE_CRLF_RENORMALIZE (1<<2) #define SAFE_CRLF_KEEP_CRLF (1<<3) +#define SAFE_CRLF_REENCODE (1<<4) extern int safe_crlf; @@ -60,6 +61,7 @@ extern const char *get_cached_convert_stats_ascii(const struct index_state *ista const char *path); extern const char *get_wt_convert_stats_ascii(const char *path); extern const char *get_convert_attr_ascii(const char *path); +extern const char *get_encoding_attr(const char *path); /* returns 1 if *dst was used */ extern int convert_to_git(const struct index_state *istate, diff --git a/diff.c b/diff.c index 5e3aaea6e0..07480a465c 100644 --- a/diff.c +++ b/diff.c @@ -3191,6 +3191,12 @@ static void builtin_diff(const char *name_a, header.buf, header.len, 0); strbuf_reset(&header); } + if (one && one->reencoded_to_utf8) + strbuf_addf(&header, "a is converted to UTF-8 from %s\n", + get_encoding_attr(one->path)); + if (two && two->reencoded_to_utf8) + strbuf_addf(&header, "b is converted to UTF-8 from %s\n", + get_encoding_attr(two->path)); mf1.size = fill_textconv(textconv_one, one, &mf1.ptr); mf2.size = fill_textconv(textconv_two, two, &mf2.ptr); @@ -3520,6 +3526,7 @@ int diff_populate_filespec(struct diff_filespec *s, unsigned int flags) { int size_only = flags & CHECK_SIZE_ONLY; int err = 0; + int ret = 0; /* * demote FAIL to WARN to allow inspecting the situation * instead of refusing. @@ -3527,7 +3534,8 @@ int diff_populate_filespec(struct diff_filespec *s, unsigned int flags) int checksafe = (safe_crlf == SAFE_CRLF_FAIL ? 
SAFE_CRLF_WARN : safe_crlf); - + if (s->reencode_to_utf8) + checksafe |= SAFE_CRLF_REENCODE; if (!DIFF_FILE_VALID(s)) die("internal error: asking to populate invalid file."); if (S_ISDIR(s->mode)) @@ -3603,17 +3611,22 @@ int diff_populate_filespec(struct diff_filespec *s, unsigned int flags) /* * Convert from working tree format to canonical git format */ - if (convert_to_git(&the_index, s->path, s->data, s->size, &buf, checksafe)) { + ret = convert_to_git(&the_index, s->path, s->data, s->size, &buf, checksafe); + + if (ret) { size_t size = 0; munmap(s->data, s->size); s->should_munmap = 0; s->data = strbuf_detach(&buf, &size); s->size = size; s->should_free = 1; + if (ret & SAFE_CRLF_REENCODE) + s->reencoded_to_utf8 = 1; } } else { enum object_type type; + const char *encoding = NULL; if (size_only || (flags & CHECK_BINARY)) { type = sha1_object_info(s->oid.hash, &s->size); if (type < 0) @@ -3629,6 +3642,20 @@ int diff_populate_filespec(struct diff_filespec *s, unsigned int flags) s->data = read_sha1_file(s->oid.hash, &type, &s->size); if (!s->data) die("unable to read %s", oid_to_hex(&s->oid)); + if (s->reencode_to_utf8) + encoding = get_encoding_attr(s->path); + if (encoding) { + int outsz = 0; + char *buf; + buf = reencode_string_len(s->data, (int)s->size, + "UTF-8", encoding, &outsz); + if (buf) { + free(s->data); + s->data = buf; + s->size = outsz; + s->reencoded_to_utf8 = 1; + } + } s->should_free = 1; } return 0; @@ -4627,7 +4654,9 @@ int diff_opt_parse(struct diff_options *options, enable_patch_output(&options->output_format); options->flags.binary = 1; } - else if (!strcmp(arg, "--full-index")) + else if (!strcmp(arg, "--UTF-8")) { + options->flags.reencode_to_utf8 = 1; + } else if (!strcmp(arg, "--full-index")) options->flags.full_index = 1; else if (!strcmp(arg, "-a") || !strcmp(arg, "--text")) options->flags.text = 1; @@ -5695,6 +5724,8 @@ static int diff_filespec_is_identical(struct diff_filespec *one, static int 
diff_filespec_check_stat_unmatch(struct diff_filepair *p) { + p->one->reencode_to_utf8 = p->reencode_to_utf8; + p->two->reencode_to_utf8 = p->reencode_to_utf8; if (p->done_skip_stat_unmatch) return p->skip_stat_unmatch_result; @@ -5735,6 +5766,7 @@ static void diffcore_skip_stat_unmatch(struct diff_options *diffopt) for (i = 0; i < q->nr; i++) { struct diff_filepair *p = q->queue[i]; + p->reencode_to_utf8 = diffopt->flags.reencode_to_utf8; if (diff_filespec_check_stat_unmatch(p)) diff_q(&outq, p); else { diff --git a/diff.h b/diff.h index 7cf276f077..d2137bab58 100644 --- a/diff.h +++ b/diff.h @@ -65,6 +65,7 @@ struct diff_flags { unsigned recursive:1; unsigned tree_in_recursive:1; unsigned binary:1; + unsigned reencode_to_utf8:1; unsigned text:1; unsigned full_index:1; unsigned silent_on_remove:1; diff --git a/diffcore.h b/diffcore.h index a30da161da..2e84730778 100644 --- a/diffcore.h +++ b/diffcore.h @@ -47,6 +47,8 @@ struct diff_filespec { unsigned has_more_entries : 1; /* only appear in combined diff */ /* data should be considered "binary"; -1 means "don't know yet" */ signed int is_binary : 2; + unsigned reencode_to_utf8 : 1; + unsigned reencoded_to_utf8 : 1; struct userdiff_driver *driver; }; @@ -72,6 +74,7 @@ struct diff_filepair { unsigned is_unmerged : 1; unsigned done_skip_stat_unmatch : 1; unsigned skip_stat_unmatch_result : 1; + unsigned reencode_to_utf8 : 1; }; #define DIFF_PAIR_UNMERGED(p) ((p)->is_unmerged) diff --git a/t/t4066-diff-encoding.sh b/t/t4066-diff-encoding.sh new file mode 100755 index 0000000000..9b89253877 --- /dev/null +++ b/t/t4066-diff-encoding.sh @@ -0,0 +1,86 @@ +#!/bin/sh + +test_description='git diff with encoding attribute' + +. 
./test-lib.sh + +printf '\303\244rger\n\303\266se\n\303\274bel\n' | + iconv -f UTF-8 -t UTF-16 >UTF-16 +printf '\303\266se\n\303\274bel\n\303\245gren\n' | + iconv -f UTF-8 -t UTF-16 >file2 + +test_expect_success 'setup' ' + cp UTF-16 file && + git add file && + git commit -m "add file in UTF-16" && + test_tick && + echo "file encoding=UTF-16" >.gitattributes +' + +test_expect_success 'diff --UTF-8 against local change' ' + cp file2 file && + test_tick && + cat >expect <<-\EOF && + diff --git a/file b/file + index 26acf09..06d06e4 100644 + a is converted to UTF-8 from UTF-16 + b is converted to UTF-8 from UTF-16 + --- a/file + +++ b/file + @@ -1,3 +1,3 @@ + -ärger + öse + übel + +ågren +EOF + git diff --UTF-8 file >actual && + test_cmp expect actual +' + +test_expect_success 'diff against local change' ' + cp file2 file && + test_tick && + cat >expect <<-\EOF && + diff --git a/file b/file + index 26acf09..06d06e4 100644 + Binary files a/file and b/file differ +EOF + git diff file >actual && + test_cmp expect actual +' + +test_expect_success 'commit local change' ' + git add file && + git commit -m "add file V2 in UTF-16" && + test_tick +' + +test_expect_success 'diff --UTF-8 HEAD against HEAD^' ' + cat >expect <<-\EOF && + diff --git a/file b/file + index 26acf09..06d06e4 100644 + a is converted to UTF-8 from UTF-16 + b is converted to UTF-8 from UTF-16 + --- a/file + +++ b/file + @@ -1,3 +1,3 @@ + -ärger + öse + übel + +ågren +EOF + git diff --UTF-8 HEAD^ HEAD -- file >actual && + test_cmp expect actual +' + +test_expect_success 'diff HEAD against HEAD^' ' + cat >expect <<-\EOF && + diff --git a/file b/file + index 26acf09..06d06e4 100644 + Binary files a/file and b/file differ +EOF + git diff HEAD^ HEAD -- file >actual && + test_cmp expect actual +' + +test_done -- 2.15.1.271.g1a4e40aa5d