From: Lars Schneider <larsxschneider@xxxxxxxxx> Hi, Patches 1-5,9 are preparation and helper functions. Patches 6-8,10 are the actual change. Patch 8 is new. This series depends on Torsten's 8462ff43e4 (convert_to_git(): safe_crlf/checksafe becomes int conv_flags, 2018-01-13) which is already in master. Changes since v10: * rename startscase_with() to istarts_with() (Duy) * validate_encoding() advises the canonical form of the UTF encoding name to the user (Junio) --> I added this as a separate commit that could be dropped if desired by the reviewers. * fix documentation for roundtrip check (Junio) * use isspace() to check whitespace/tab delimiter in core.checkRoundtripEncoding (Junio) * remove dead code in roundtrip check (Junio) * fix invalid # in comment (Eric) * detect UTF8 and UTF-8 as default encoding (Eric) * make asterisk stick to the variable, not type (Junio) * print an error if "w-t-e" does not have a proper value (Junio) --> BTW: I noticed that the attribute is not set to "git_attr__false" even if I define "-working-tree-encoding". I haven't investigated further yet. Might that be a bug? If yes, then this should be addressed in a separate patch series. 
Thanks, Lars RFC: https://public-inbox.org/git/BDB9B884-6D17-4BE3-A83C-F67E2AFA2B46@xxxxxxxxx/ v1: https://public-inbox.org/git/20171211155023.1405-1-lars.schneider@xxxxxxxxxxxx/ v2: https://public-inbox.org/git/20171229152222.39680-1-lars.schneider@xxxxxxxxxxxx/ v3: https://public-inbox.org/git/20180106004808.77513-1-lars.schneider@xxxxxxxxxxxx/ v4: https://public-inbox.org/git/20180120152418.52859-1-lars.schneider@xxxxxxxxxxxx/ v5: https://public-inbox.org/git/20180129201855.9182-1-tboegi@xxxxxx/ v6: https://public-inbox.org/git/20180209132830.55385-1-lars.schneider@xxxxxxxxxxxx/ v7: https://public-inbox.org/git/20180215152711.158-1-lars.schneider@xxxxxxxxxxxx/ v8: https://public-inbox.org/git/20180224162801.98860-1-lars.schneider@xxxxxxxxxxxx/ v9: https://public-inbox.org/git/20180304201418.60958-1-lars.schneider@xxxxxxxxxxxx/ v10: https://public-inbox.org/git/20180307173026.30058-1-lars.schneider@xxxxxxxxxxxx/ Base Ref: Web-Diff: https://github.com/larsxschneider/git/commit/afc02ce2e0 Checkout: git fetch https://github.com/larsxschneider/git encoding-v11 && git checkout afc02ce2e0 ### Interdiff (v10..v11): diff --git a/Documentation/config.txt b/Documentation/config.txt index d7a56054a5..7dcac9b540 100644 --- a/Documentation/config.txt +++ b/Documentation/config.txt @@ -531,10 +531,10 @@ core.autocrlf:: in which case no output conversion is performed. core.checkRoundtripEncoding:: - A comma separated list of encodings that Git performs UTF-8 round - trip checks on if they are used in an `working-tree-encoding` - attribute (see linkgit:gitattributes[5]). The default value is - `SHIFT-JIS`. + A comma and/or whitespace separated list of encodings that Git + performs UTF-8 round trip checks on if they are used in an + `working-tree-encoding` attribute (see linkgit:gitattributes[5]). + The default value is `SHIFT-JIS`. 
core.symlinks:: If false, symbolic links are checked out as small plain files that diff --git a/convert.c b/convert.c index e861f1abbc..c2d24882c1 100644 --- a/convert.c +++ b/convert.c @@ -270,7 +270,7 @@ static int validate_encoding(const char *path, const char *enc, const char *data, size_t len, int die_on_error) { /* We only check for UTF here as UTF?? can be an alias for UTF-?? */ - if (startscase_with(enc, "UTF")) { + if (istarts_with(enc, "UTF")) { /* * Check for detectable errors in UTF encodings */ @@ -284,12 +284,15 @@ static int validate_encoding(const char *path, const char *enc, */ const char *advise_msg = _( "The file '%s' contains a byte order " - "mark (BOM). Please use %s as " + "mark (BOM). Please use UTF-%s as " "working-tree-encoding."); - char *upper_enc = xstrdup_toupper(enc); - upper_enc[strlen(upper_enc)-2] = '\0'; - advise(advise_msg, path, upper_enc); - free(upper_enc); + const char *stripped = ""; + char *upper = xstrdup_toupper(enc); + upper[strlen(upper)-2] = '\0'; + if (!skip_prefix(upper, "UTF-", &stripped)) + skip_prefix(stripped, "UTF", &stripped); + advise(advise_msg, path, stripped); + free(upper); if (die_on_error) die(error_msg, path, enc); else { @@ -301,12 +304,15 @@ static int validate_encoding(const char *path, const char *enc, "BOM is required in '%s' if encoded as %s"); const char *advise_msg = _( "The file '%s' is missing a byte order " - "mark (BOM). Please use %sBE or %sLE " + "mark (BOM). 
Please use UTF-%sBE or UTF-%sLE " "(depending on the byte order) as " "working-tree-encoding."); - char *upper_enc = xstrdup_toupper(enc); - advise(advise_msg, path, upper_enc, upper_enc); - free(upper_enc); + const char *stripped = ""; + char *upper = xstrdup_toupper(enc); + if (!skip_prefix(upper, "UTF-", &stripped)) + skip_prefix(stripped, "UTF", &stripped); + advise(advise_msg, path, stripped, stripped); + free(upper); if (die_on_error) die(error_msg, path, enc); else { @@ -344,8 +350,8 @@ static void trace_encoding(const char *context, const char *path, static int check_roundtrip(const char *enc_name) { /* - * check_roundtrip_encoding contains a string of space and/or - * comma separated encodings (eg. "UTF-16, ASCII, CP1125"). + * check_roundtrip_encoding contains a string of comma and/or + * space separated encodings (eg. "UTF-16, ASCII, CP1125"). * Search for the given encoding in that string. */ const char *found = strcasestr(check_roundtrip_encoding, enc_name); @@ -362,8 +368,7 @@ static int check_roundtrip(const char* enc_name) * that it is prefixed with a space or comma */ found == check_roundtrip_encoding || ( - found > check_roundtrip_encoding && - (*(found-1) == ' ' || *(found-1) == ',') + (isspace(found[-1]) || found[-1] == ',') ) ) && ( /* @@ -373,7 +378,7 @@ static int check_roundtrip(const char* enc_name) */ next == check_roundtrip_encoding + len || ( next < check_roundtrip_encoding + len && - (*next == ' ' || *next == ',') + (isspace(next[0]) || next[0] == ',') ) )); } @@ -1213,12 +1218,16 @@ static const char *git_path_check_encoding(struct attr_check_item *check) { const char *value = check->value; - if (ATTR_TRUE(value) || ATTR_FALSE(value) || ATTR_UNSET(value) || - !strlen(value)) + if (ATTR_UNSET(value) || !strlen(value)) return NULL; + if (ATTR_TRUE(value) || ATTR_FALSE(value)) { + error(_("working-tree-encoding attribute requires a value")); + return NULL; + } + /* Don't encode to the default encoding */ - if (!strcasecmp(value, 
default_encoding)) + if (is_encoding_utf8(value) && is_encoding_utf8(default_encoding)) return NULL; return value; diff --git a/git-compat-util.h b/git-compat-util.h index f648da0c11..95c9b34832 100644 --- a/git-compat-util.h +++ b/git-compat-util.h @@ -455,7 +455,7 @@ extern void (*get_warn_routine(void))(const char *warn, va_list params); extern void set_die_is_recursing_routine(int (*routine)(void)); extern int starts_with(const char *str, const char *prefix); -extern int startscase_with(const char *str, const char *prefix); +extern int istarts_with(const char *str, const char *prefix); /* * If the string "str" begins with the string found in "prefix", return 1. diff --git a/strbuf.c b/strbuf.c index 5779a2d591..99812b8488 100644 --- a/strbuf.c +++ b/strbuf.c @@ -11,7 +11,7 @@ int starts_with(const char *str, const char *prefix) return 0; } -int startscase_with(const char *str, const char *prefix) +int istarts_with(const char *str, const char *prefix) { for (; ; str++, prefix++) if (!*prefix) diff --git a/t/t0028-working-tree-encoding.sh b/t/t0028-working-tree-encoding.sh index 7cff41a350..07089bba2e 100755 --- a/t/t0028-working-tree-encoding.sh +++ b/t/t0028-working-tree-encoding.sh @@ -68,7 +68,7 @@ do test_when_finished "git reset --hard HEAD" && echo "*.utf${i}be text working-tree-encoding=utf-${i}be" >>.gitattributes && - echo "*.utf${i}le text working-tree-encoding=utf-${i}le" >>.gitattributes && + echo "*.utf${i}le text working-tree-encoding=utf-${i}LE" >>.gitattributes && # Here we add a UTF-16 (resp. UTF-32) files with BOM (big/little-endian) # but we tell Git to treat it as UTF-16BE/UTF-16LE (resp. UTF-32). 
@@ -76,18 +76,22 @@ do cp bebom.utf${i}be.raw bebom.utf${i}be && test_must_fail git add bebom.utf${i}be 2>err.out && test_i18ngrep "fatal: BOM is prohibited .* utf-${i}be" err.out && + test_i18ngrep "use UTF-${i} as working-tree-encoding" err.out && cp lebom.utf${i}le.raw lebom.utf${i}be && test_must_fail git add lebom.utf${i}be 2>err.out && test_i18ngrep "fatal: BOM is prohibited .* utf-${i}be" err.out && + test_i18ngrep "use UTF-${i} as working-tree-encoding" err.out && cp bebom.utf${i}be.raw bebom.utf${i}le && test_must_fail git add bebom.utf${i}le 2>err.out && - test_i18ngrep "fatal: BOM is prohibited .* utf-${i}le" err.out && + test_i18ngrep "fatal: BOM is prohibited .* utf-${i}LE" err.out && + test_i18ngrep "use UTF-${i} as working-tree-encoding" err.out && cp lebom.utf${i}le.raw lebom.utf${i}le && test_must_fail git add lebom.utf${i}le 2>err.out && - test_i18ngrep "fatal: BOM is prohibited .* utf-${i}le" err.out + test_i18ngrep "fatal: BOM is prohibited .* utf-${i}LE" err.out && + test_i18ngrep "use UTF-${i} as working-tree-encoding" err.out ' test_expect_success "check required UTF-${i} BOM" ' @@ -98,10 +102,12 @@ do cp nobom.utf${i}be.raw nobom.utf${i} && test_must_fail git add nobom.utf${i} 2>err.out && test_i18ngrep "fatal: BOM is required .* utf-${i}" err.out && + test_i18ngrep "use UTF-${i}BE or UTF-${i}LE" err.out && cp nobom.utf${i}le.raw nobom.utf${i} && test_must_fail git add nobom.utf${i} 2>err.out && - test_i18ngrep "fatal: BOM is required .* utf-${i}" err.out + test_i18ngrep "fatal: BOM is required .* utf-${i}" err.out && + test_i18ngrep "use UTF-${i}BE or UTF-${i}LE" err.out ' test_expect_success "eol conversion for UTF-${i} encoded files on checkout" ' @@ -143,9 +149,20 @@ done test_expect_success 'check unsupported encodings' ' test_when_finished "git reset --hard HEAD" && - echo "*.nothing text working-tree-encoding=" >>.gitattributes && - printf "nothing" >t.nothing && - git add t.nothing && + echo "*.set text working-tree-encoding" 
>>.gitattributes && + printf "set" >t.set && + git add t.set 2>err.out && + test_i18ngrep "error: working-tree-encoding attribute requires a value" err.out && + + echo "*.unset text -working-tree-encoding" >>.gitattributes && + printf "unset" >t.unset && + git add t.unset 2>err.out && + test_i18ngrep "error: working-tree-encoding attribute requires a value" err.out && + + echo "*.empty text working-tree-encoding=" >>.gitattributes && + printf "empty" >t.empty && + git add t.empty 2>err.out && + test_i18ngrep "error: working-tree-encoding attribute requires a value" err.out && echo "*.garbage text working-tree-encoding=garbage" >>.gitattributes && printf "garbage" >t.garbage && ### Patches Lars Schneider (10): strbuf: remove unnecessary NUL assignment in xstrdup_tolower() strbuf: add xstrdup_toupper() strbuf: add a case insensitive starts_with() utf8: add function to detect prohibited UTF-16/32 BOM utf8: add function to detect a missing UTF-16/32 BOM convert: add 'working-tree-encoding' attribute convert: check for detectable errors in UTF encodings convert: advise canonical UTF encoding names convert: add tracing for 'working-tree-encoding' attribute convert: add round trip check based on 'core.checkRoundtripEncoding' Documentation/config.txt | 6 + Documentation/gitattributes.txt | 88 +++++++++++++ config.c | 5 + convert.c | 277 ++++++++++++++++++++++++++++++++++++++- convert.h | 2 + environment.c | 1 + git-compat-util.h | 1 + sha1_file.c | 2 +- strbuf.c | 22 +++- strbuf.h | 1 + t/t0028-working-tree-encoding.sh | 247 ++++++++++++++++++++++++++++++++++ utf8.c | 39 ++++++ utf8.h | 28 ++++ 13 files changed, 716 insertions(+), 3 deletions(-) create mode 100755 t/t0028-working-tree-encoding.sh base-commit: 8a2f0888555ce46ac87452b194dec5cb66fb1417 -- 2.16.2