From: Lars Schneider <larsxschneider@xxxxxxxxx> Hi, Patches 1-6,9 are preparation and helper functions. Patch 4 is new. Patches 7,8,10 are the actual change. This series depends on Torsten's 8462ff43e4 (convert_to_git(): safe_crlf/checksafe becomes int conv_flags, 2018-01-13) which is already in master. Changes since v11: * die if working-tree-encoding (w-t-e) is configured with a true/false (=undefined!) value (Junio) * improve same_encoding to detect all alternatives for UTF encodings (new commit, Junio) * squash in "advise canonical UTF encoding names" and remove commit (Junio) * fix erroneous # in comment (Junio) * force segv for non-UTF encodings in validate_encoding() (Junio) Thanks, Lars RFC: https://public-inbox.org/git/BDB9B884-6D17-4BE3-A83C-F67E2AFA2B46@xxxxxxxxx/ v1: https://public-inbox.org/git/20171211155023.1405-1-lars.schneider@xxxxxxxxxxxx/ v2: https://public-inbox.org/git/20171229152222.39680-1-lars.schneider@xxxxxxxxxxxx/ v3: https://public-inbox.org/git/20180106004808.77513-1-lars.schneider@xxxxxxxxxxxx/ v4: https://public-inbox.org/git/20180120152418.52859-1-lars.schneider@xxxxxxxxxxxx/ v5: https://public-inbox.org/git/20180129201855.9182-1-tboegi@xxxxxx/ v6: https://public-inbox.org/git/20180209132830.55385-1-lars.schneider@xxxxxxxxxxxx/ v7: https://public-inbox.org/git/20180215152711.158-1-lars.schneider@xxxxxxxxxxxx/ v8: https://public-inbox.org/git/20180224162801.98860-1-lars.schneider@xxxxxxxxxxxx/ v9: https://public-inbox.org/git/20180304201418.60958-1-lars.schneider@xxxxxxxxxxxx/ v10: https://public-inbox.org/git/20180307173026.30058-1-lars.schneider@xxxxxxxxxxxx/ v11: https://public-inbox.org/git/20180309173536.62012-1-lars.schneider@xxxxxxxxxxxx/ Base Ref: Web-Diff: https://github.com/larsxschneider/git/commit/0daedbbd76 Checkout: git fetch https://github.com/larsxschneider/git encoding-v12 && git checkout 0daedbbd76 ### Interdiff (v11..v12): diff --git a/convert.c b/convert.c index c2d24882c1..2a002af66d 100644 --- a/convert.c +++ b/convert.c @@ -280,13 
+280,13 @@ static int validate_encoding(const char *path, const char *enc, /* * This advice is shown for UTF-??BE and UTF-??LE encodings. * We cut off the last two characters of the encoding name - # to generate the encoding name suitable for BOMs. + * to generate the encoding name suitable for BOMs. */ const char *advise_msg = _( "The file '%s' contains a byte order " "mark (BOM). Please use UTF-%s as " "working-tree-encoding."); - const char *stripped = ""; + const char *stripped = NULL; char *upper = xstrdup_toupper(enc); upper[strlen(upper)-2] = '\0'; if (!skip_prefix(upper, "UTF-", &stripped)) @@ -307,7 +307,7 @@ static int validate_encoding(const char *path, const char *enc, "mark (BOM). Please use UTF-%sBE or UTF-%sLE " "(depending on the byte order) as " "working-tree-encoding."); - const char *stripped = ""; + const char *stripped = NULL; char *upper = xstrdup_toupper(enc); if (!skip_prefix(upper, "UTF-", &stripped)) skip_prefix(stripped, "UTF", &stripped); @@ -1222,12 +1222,11 @@ static const char *git_path_check_encoding(struct attr_check_item *check) return NULL; if (ATTR_TRUE(value) || ATTR_FALSE(value)) { - error(_("working-tree-encoding attribute requires a value")); - return NULL; + die(_("working-tree-encoding attribute requires a value")); } /* Don't encode to the default encoding */ - if (is_encoding_utf8(value) && is_encoding_utf8(default_encoding)) + if (same_encoding(value, default_encoding)) return NULL; return value; diff --git a/t/t0028-working-tree-encoding.sh b/t/t0028-working-tree-encoding.sh index 07089bba2e..884f0878b1 100755 --- a/t/t0028-working-tree-encoding.sh +++ b/t/t0028-working-tree-encoding.sh @@ -149,25 +149,23 @@ done test_expect_success 'check unsupported encodings' ' test_when_finished "git reset --hard HEAD" && - echo "*.set text working-tree-encoding" >>.gitattributes && + echo "*.set text working-tree-encoding" >.gitattributes && printf "set" >t.set && - git add t.set 2>err.out && - test_i18ngrep "error: 
working-tree-encoding attribute requires a value" err.out && + test_must_fail git add t.set 2>err.out && + test_i18ngrep "working-tree-encoding attribute requires a value" err.out && - echo "*.unset text -working-tree-encoding" >>.gitattributes && + echo "*.unset text -working-tree-encoding" >.gitattributes && printf "unset" >t.unset && - git add t.unset 2>err.out && - test_i18ngrep "error: working-tree-encoding attribute requires a value" err.out && + git add t.unset && - echo "*.empty text working-tree-encoding=" >>.gitattributes && + echo "*.empty text working-tree-encoding=" >.gitattributes && printf "empty" >t.empty && - git add t.empty 2>err.out && - test_i18ngrep "error: working-tree-encoding attribute requires a value" err.out && + git add t.empty && - echo "*.garbage text working-tree-encoding=garbage" >>.gitattributes && + echo "*.garbage text working-tree-encoding=garbage" >.gitattributes && printf "garbage" >t.garbage && test_must_fail git add t.garbage 2>err.out && - test_i18ngrep "fatal: failed to encode" err.out + test_i18ngrep "failed to encode" err.out ' test_expect_success 'error if encoding round trip is not the same during refresh' ' diff --git a/utf8.c b/utf8.c index 81c6678df1..2d8821d36e 100644 --- a/utf8.c +++ b/utf8.c @@ -401,11 +401,27 @@ void strbuf_utf8_replace(struct strbuf *sb_src, int pos, int width, strbuf_release(&sb_dst); } +/* + * Returns true (1) if the src encoding name matches the dst encoding + * name directly or one of its alternative names. E.g. UTF-16BE is the + * same as UTF16BE. + */ +static int same_utf_encoding(const char *src, const char *dst) +{ + if (istarts_with(src, "utf") && istarts_with(dst, "utf")) { + /* src[3] or dst[3] might be '\0' */ + int i = (src[3] == '-' ? 4 : 3); + int j = (dst[3] == '-' ? 
4 : 3); + return !strcasecmp(src+i, dst+j); + } + return 0; +} + int is_encoding_utf8(const char *name) { if (!name) return 1; - if (!strcasecmp(name, "utf-8") || !strcasecmp(name, "utf8")) + if (same_utf_encoding("utf-8", name)) return 1; return 0; } @@ -414,6 +430,8 @@ int same_encoding(const char *src, const char *dst) { if (is_encoding_utf8(src) && is_encoding_utf8(dst)) return 1; + if (same_utf_encoding(src, dst)) + return 1; return !strcasecmp(src, dst); } @@ -552,13 +570,13 @@ static const char utf32_le_bom[] = {0xFF, 0xFE, 0x00, 0x00}; int has_prohibited_utf_bom(const char *enc, const char *data, size_t len) { return ( - (!strcasecmp(enc, "UTF-16BE") || !strcasecmp(enc, "UTF-16LE") || - !strcasecmp(enc, "UTF16BE") || !strcasecmp(enc, "UTF16LE")) && + (same_utf_encoding("UTF-16BE", enc) || + same_utf_encoding("UTF-16LE", enc)) && (has_bom_prefix(data, len, utf16_be_bom, sizeof(utf16_be_bom)) || has_bom_prefix(data, len, utf16_le_bom, sizeof(utf16_le_bom))) ) || ( - (!strcasecmp(enc, "UTF-32BE") || !strcasecmp(enc, "UTF-32LE") || - !strcasecmp(enc, "UTF32BE") || !strcasecmp(enc, "UTF32LE")) && + (same_utf_encoding("UTF-32BE", enc) || + same_utf_encoding("UTF-32LE", enc)) && (has_bom_prefix(data, len, utf32_be_bom, sizeof(utf32_be_bom)) || has_bom_prefix(data, len, utf32_le_bom, sizeof(utf32_le_bom))) ); @@ -567,11 +585,11 @@ int has_prohibited_utf_bom(const char *enc, const char *data, size_t len) int is_missing_required_utf_bom(const char *enc, const char *data, size_t len) { return ( - (!strcasecmp(enc, "UTF-16") || !strcasecmp(enc, "UTF16")) && + (same_utf_encoding(enc, "UTF-16")) && !(has_bom_prefix(data, len, utf16_be_bom, sizeof(utf16_be_bom)) || has_bom_prefix(data, len, utf16_le_bom, sizeof(utf16_le_bom))) ) || ( - (!strcasecmp(enc, "UTF-32") || !strcasecmp(enc, "UTF32")) && + (same_utf_encoding(enc, "UTF-32")) && !(has_bom_prefix(data, len, utf32_be_bom, sizeof(utf32_be_bom)) || has_bom_prefix(data, len, utf32_le_bom, sizeof(utf32_le_bom))) ); ### 
Patches Lars Schneider (10): strbuf: remove unnecessary NUL assignment in xstrdup_tolower() strbuf: add xstrdup_toupper() strbuf: add a case insensitive starts_with() utf8: teach same_encoding() alternative UTF encoding names utf8: add function to detect prohibited UTF-16/32 BOM utf8: add function to detect a missing UTF-16/32 BOM convert: add 'working-tree-encoding' attribute convert: check for detectable errors in UTF encodings convert: add tracing for 'working-tree-encoding' attribute convert: add round trip check based on 'core.checkRoundtripEncoding' Documentation/config.txt | 6 + Documentation/gitattributes.txt | 88 +++++++++++++ config.c | 5 + convert.c | 276 ++++++++++++++++++++++++++++++++++++++- convert.h | 2 + environment.c | 1 + git-compat-util.h | 1 + sha1_file.c | 2 +- strbuf.c | 22 +++- strbuf.h | 1 + t/t0028-working-tree-encoding.sh | 245 ++++++++++++++++++++++++++++++++++ utf8.c | 59 ++++++++- utf8.h | 28 ++++ 13 files changed, 732 insertions(+), 4 deletions(-) create mode 100755 t/t0028-working-tree-encoding.sh base-commit: 8a2f0888555ce46ac87452b194dec5cb66fb1417 -- 2.16.2