On Fri, Feb 16, 2018 at 03:42:35PM +0100, Lars Schneider wrote: [] > > Agreed. However, people using ShiftJIS are not my target audience. > My target audience are: > > (1) People that have to encode their text files in UTF-16 (for > whatever reason - usually because of legacy processes or tools). > > (2) People that want to see textual diffs of their UTF-16 encoded > files in their Git tools without special adjustments (on the > command line, on the web, etc). > > That was my primary motivation. The fact that w-t-e supports any > other encoding too is just a nice side effect. I don't foresee people > using other w-t-encodings other than UTF-16 in my organization. > > I have the suspicion that the feature could be useful for the Git > community at large. Consider this Stack Overflow question: > https://stackoverflow.com/questions/777949/can-i-make-git-recognize-a-utf-16-file-as-text > > This question was viewed 42k times and there is no good solution. > I believe w-t-e could be a good solution. > If it was only about a diff of UTF-16 files, I may suggest a patch. I simply copy-paste it here for review, if someone thinks that it may be useful, I can send it as a real patch/RFC. git show HEAD commit 9f7d43f29eaf6017b7b16261ce91d8ef182cf415 Author: Torsten Bögershausen <tboegi@xxxxxx> Date: Fri Feb 2 15:35:23 2018 +0100 Auto diff of UTF-16 files in UTF-8 When an UTF-16 file is commited and later changed, `git diff` shows "Binary files XX and YY differ". When the user wants a diff in UTF-8, a textconv needs to be specified in .gitattributes and the textconv must be configured. A more user-friendly diff can be produced for UTF-16 if - the user did not use `git diff --binary` - the blob is identified as binary - the blob has an UTF-16 BOM - the blob can be converted into UTF-8 Enhance the diff machinery to auto-detect UTF-16 blobs and show them as UTF-8, unless the user specifies `git diff --binary` which creates a binary diff. diff --git a/diff.c b/diff.c index fb22b19f09..51831ee94d 100644 --- a/diff.c +++ b/diff.c @@ -3192,6 +3192,10 @@ static void builtin_diff(const char *name_a, strbuf_reset(&header); } + if (one && one->reencoded_from_utf16) + strbuf_addf(&header, "a is converted to UTF-8 from UTF-16\n"); + if (two && two->reencoded_from_utf16) + strbuf_addf(&header, "b is converted to UTF-8 from UTF-16\n"); mf1.size = fill_textconv(textconv_one, one, &mf1.ptr); mf2.size = fill_textconv(textconv_two, two, &mf2.ptr); @@ -3611,8 +3615,25 @@ int diff_populate_filespec(struct diff_filespec *s, unsigned int flags) s->size = size; s->should_free = 1; } - } - else { + if (!s->binary && buffer_is_binary(s->data, s->size) && + buffer_has_utf16_bom(s->data, s->size)) { + int outsz = 0; + char *outbuf; + outbuf = reencode_string_len(s->data, (int)s->size, + "UTF-8", "UTF-16", &outsz); + if (outbuf) { + if (s->should_free) + free(s->data); + if (s->should_munmap) + munmap(s->data, s->size); + s->should_munmap = 0; + s->data = outbuf; + s->size = outsz; + s->reencoded_from_utf16 = 1; + s->should_free = 1; + } + } + } else { enum object_type type; if (size_only || (flags & CHECK_BINARY)) { type = sha1_object_info(s->oid.hash, &s->size); @@ -3629,6 +3650,19 @@ int diff_populate_filespec(struct diff_filespec *s, unsigned int flags) s->data = read_sha1_file(s->oid.hash, &type, &s->size); if (!s->data) die("unable to read %s", oid_to_hex(&s->oid)); + if (!s->binary && buffer_is_binary(s->data, s->size) && + buffer_has_utf16_bom(s->data, s->size)) { + int outsz = 0; + char *buf; + buf = reencode_string_len(s->data, (int)s->size, + "UTF-8", "UTF-16", &outsz); + if (buf) { + free(s->data); + s->data = buf; + s->size = outsz; + s->reencoded_from_utf16 = 1; + } + } s->should_free = 1; } return 0; @@ -5695,6 +5729,10 @@ static int diff_filespec_is_identical(struct diff_filespec *one, static int diff_filespec_check_stat_unmatch(struct diff_filepair *p) { + if (p->binary) { + p->one->binary = 1; + p->two->binary = 1; + } if (p->done_skip_stat_unmatch) return p->skip_stat_unmatch_result; @@ -5735,6 +5773,7 @@ static void diffcore_skip_stat_unmatch(struct diff_options *diffopt) for (i = 0; i < q->nr; i++) { struct diff_filepair *p = q->queue[i]; + p->binary = diffopt->flags.binary; if (diff_filespec_check_stat_unmatch(p)) diff_q(&outq, p); else { diff --git a/diffcore.h b/diffcore.h index a30da161da..3cd97bb93b 100644 --- a/diffcore.h +++ b/diffcore.h @@ -47,6 +47,8 @@ struct diff_filespec { unsigned has_more_entries : 1; /* only appear in combined diff */ /* data should be considered "binary"; -1 means "don't know yet" */ signed int is_binary : 2; + unsigned binary : 1; + unsigned reencoded_from_utf16 : 1; struct userdiff_driver *driver; }; @@ -72,6 +74,7 @@ struct diff_filepair { unsigned is_unmerged : 1; unsigned done_skip_stat_unmatch : 1; unsigned skip_stat_unmatch_result : 1; + unsigned binary : 1; }; #define DIFF_PAIR_UNMERGED(p) ((p)->is_unmerged) diff --git a/t/t4066-diff-encoding.sh b/t/t4066-diff-encoding.sh new file mode 100755 index 0000000000..9bb3c70ada --- /dev/null +++ b/t/t4066-diff-encoding.sh @@ -0,0 +1,98 @@ +#!/bin/sh + +test_description='git diff with encoding attribute' + +. ./test-lib.sh + +printf '\303\244rger\n\303\266se\n\303\274bel\n' | + iconv -f UTF-8 -t UTF-16 >UTF-16 +printf '\303\266se\n\303\274bel\n\303\245st\n' | + iconv -f UTF-8 -t UTF-16 >file2 + +test_expect_success 'setup' ' + cp UTF-16 file && + git add file && + git commit -m "add file in UTF-16" && + test_tick && + echo "file encoding=UTF-16" >.gitattributes +' + +test_expect_success 'diff against local change' ' + cp file2 file && + test_tick && + cat >expect <<-\EOF && + diff --git a/file b/file + index 26acf09..e98d27a 100644 + a is converted to UTF-8 from UTF-16 + b is converted to UTF-8 from UTF-16 + --- a/file + +++ b/file + @@ -1,3 +1,3 @@ + -ärger + öse + übel + +åst +EOF + git diff file >actual && + test_cmp expect actual +' + +test_expect_success 'diff --binary against local change' ' + cp file2 file && + test_tick && + cat >expect <<-\EOF && + diff --git a/file b/file + index 26acf09b0aad19fb22566956d1a39cb4e2a3b420..e98d27acfb90cfcfc84fcc5173baa4aa7828290f 100644 + GIT binary patch + literal 28 + ecmezW?;ArgLn;Fo!ykquAe{qbJq3!C0BHb{ln3Pi + + literal 32 + icmezW?+HT@Lpnn$kmO?c#!w7oaWVX1NCMJ1Ko$VA_z0~4 + +EOF + git diff --binary file >actual && + test_cmp expect actual +' + +test_expect_success 'commit local change' ' + git add file && + git commit -m "add file V2 in UTF-16" && + test_tick +' + +test_expect_success 'diff HEAD against HEAD^' ' + cat >expect <<-\EOF && + diff --git a/file b/file + index 26acf09..e98d27a 100644 + a is converted to UTF-8 from UTF-16 + b is converted to UTF-8 from UTF-16 + --- a/file + +++ b/file + @@ -1,3 +1,3 @@ + -ärger + öse + übel + +åst +EOF + git diff HEAD^ HEAD -- file >actual && + test_cmp expect actual +' + +test_expect_success 'diff --binary HEAD against HEAD^' ' + cat >expect <<-\EOF && + diff --git a/file b/file + index 26acf09b0aad19fb22566956d1a39cb4e2a3b420..e98d27acfb90cfcfc84fcc5173baa4aa7828290f 100644 + GIT binary patch + literal 28 + ecmezW?;ArgLn;Fo!ykquAe{qbJq3!C0BHb{ln3Pi + + literal 32 + icmezW?+HT@Lpnn$kmO?c#!w7oaWVX1NCMJ1Ko$VA_z0~4 + +EOF + git diff --binary HEAD^ HEAD -- file >actual && + test_cmp expect actual +' + +test_done diff --git a/utf8.h b/utf8.h index 6bbcf31a83..a2184d0300 100644 --- a/utf8.h +++ b/utf8.h @@ -16,6 +16,17 @@ int utf8_fprintf(FILE *, const char *, ...); extern const char utf8_bom[]; extern int skip_utf8_bom(char **, size_t); +static inline int buffer_has_utf16_bom(const void *buf, size_t len) { + const unsigned char *text = (unsigned char *)buf; + if (!text || len < 2) + return 0; + if (text[0] == 0xff && text[1] == 0xfe) + return 1; + if (text[0] == 0xfe && text[1] == 0xff) + return 1; + return 0; +} + void strbuf_add_wrapped_text(struct strbuf *buf, const char *text, int indent, int indent2, int width); void strbuf_add_wrapped_bytes(struct strbuf *buf, const char *data, int len,