From: Lars Schneider <larsxschneider@xxxxxxxxx> Hi, Patches 1-5,8 are preparation and helper functions. Patch 3 is new. Patch 6,7,9 are the actual change. This series depends on Torsten's 8462ff43e4 (convert_to_git(): safe_crlf/checksafe becomes int conv_flags, 2018-01-13) which is already in master. Changes since v9: * make has_bom_prefix() / is_missing_required_utf_bom() more lenient in what they accept (ignore casing, accept UTF?? and UTF-?? , Junio) * replace memcmp() which does not check the length of the strings with a case insensitive variant of starts_with() (Junio) * do not convert encoding names to uppercase (this fixes a leak introduced in the last iteration, Eric) * do not cleanup test files that the test did not create (Eric) * do not cleanup err.out files in tests (Eric) I did not address Eric's feedback to make validate_encoding() cleaner [1] as I want to stabilize the series and Eric wrote that we can clean this up later: http://public-inbox.org/git/CAPig+cSoka-yBTYBz42JGQTyCH7LDWnToeOvdZfG0_64o9QnBQ@xxxxxxxxxxxxxx Thanks, Lars RFC: https://public-inbox.org/git/BDB9B884-6D17-4BE3-A83C-F67E2AFA2B46@xxxxxxxxx/ v1: https://public-inbox.org/git/20171211155023.1405-1-lars.schneider@xxxxxxxxxxxx/ v2: https://public-inbox.org/git/20171229152222.39680-1-lars.schneider@xxxxxxxxxxxx/ v3: https://public-inbox.org/git/20180106004808.77513-1-lars.schneider@xxxxxxxxxxxx/ v4: https://public-inbox.org/git/20180120152418.52859-1-lars.schneider@xxxxxxxxxxxx/ v5: https://public-inbox.org/git/20180129201855.9182-1-tboegi@xxxxxx/ v6: https://public-inbox.org/git/20180209132830.55385-1-lars.schneider@xxxxxxxxxxxx/ v7: https://public-inbox.org/git/20180215152711.158-1-lars.schneider@xxxxxxxxxxxx/ v8: https://public-inbox.org/git/20180224162801.98860-1-lars.schneider@xxxxxxxxxxxx/ v9: https://public-inbox.org/git/20180304201418.60958-1-lars.schneider@xxxxxxxxxxxx/ Base Ref: Web-Diff: https://github.com/larsxschneider/git/commit/a602b8dcef Checkout: git fetch https://github.com/larsxschneider/git encoding-v10 && git checkout a602b8dcef ### Interdiff (v9..v10): diff --git a/convert.c b/convert.c index 6cbb2b2618..e861f1abbc 100644 --- a/convert.c +++ b/convert.c @@ -269,7 +269,8 @@ static int will_convert_lf_to_crlf(size_t len, struct text_stat *stats, static int validate_encoding(const char *path, const char *enc, const char *data, size_t len, int die_on_error) { - if (!memcmp("UTF-", enc, 4)) { + /* We only check for UTF here as UTF?? can be an alias for UTF-?? */ + if (startscase_with(enc, "UTF")) { /* * Check for detectable errors in UTF encodings */ @@ -277,16 +278,18 @@ static int validate_encoding(const char *path, const char *enc, const char *error_msg = _( "BOM is prohibited in '%s' if encoded as %s"); /* - * This advice is shown for UTF-??BE and UTF-??LE - * encodings. We truncate the encoding name to 6 - * chars with %.6s to cut off the last two "byte - * order" characters. + * This advice is shown for UTF-??BE and UTF-??LE encodings. + * We cut off the last two characters of the encoding name + # to generate the encoding name suitable for BOMs. */ const char *advise_msg = _( "The file '%s' contains a byte order " - "mark (BOM). Please use %.6s as " + "mark (BOM). Please use %s as " "working-tree-encoding."); - advise(advise_msg, path, enc); + char *upper_enc = xstrdup_toupper(enc); + upper_enc[strlen(upper_enc)-2] = '\0'; + advise(advise_msg, path, upper_enc); + free(upper_enc); if (die_on_error) die(error_msg, path, enc); else { @@ -301,7 +304,9 @@ static int validate_encoding(const char *path, const char *enc, "mark (BOM). Please use %sBE or %sLE " "(depending on the byte order) as " "working-tree-encoding."); - advise(advise_msg, path, enc, enc); + char *upper_enc = xstrdup_toupper(enc); + advise(advise_msg, path, upper_enc, upper_enc); + free(upper_enc); if (die_on_error) die(error_msg, path, enc); else { @@ -1216,11 +1221,7 @@ static const char *git_path_check_encoding(struct attr_check_item *check) if (!strcasecmp(value, default_encoding)) return NULL; - /* - * Ensure encoding names are always upper case (e.g. UTF-8) to - * simplify subsequent string comparisons. - */ - return xstrdup_toupper(value); + return value; } static enum crlf_action git_path_check_crlf(struct attr_check_item *check) diff --git a/git-compat-util.h b/git-compat-util.h index 68b2ad531e..f648da0c11 100644 --- a/git-compat-util.h +++ b/git-compat-util.h @@ -455,6 +455,7 @@ extern void (*get_warn_routine(void))(const char *warn, va_list params); extern void set_die_is_recursing_routine(int (*routine)(void)); extern int starts_with(const char *str, const char *prefix); +extern int startscase_with(const char *str, const char *prefix); /* * If the string "str" begins with the string found in "prefix", return 1. diff --git a/strbuf.c b/strbuf.c index b635f0bdc4..5779a2d591 100644 --- a/strbuf.c +++ b/strbuf.c @@ -11,6 +11,15 @@ int starts_with(const char *str, const char *prefix) return 0; } +int startscase_with(const char *str, const char *prefix) +{ + for (; ; str++, prefix++) + if (!*prefix) + return 1; + else if (tolower(*str) != tolower(*prefix)) + return 0; +} + int skip_to_optional_arg_default(const char *str, const char *prefix, const char **arg, const char *def) { diff --git a/t/t0028-working-tree-encoding.sh b/t/t0028-working-tree-encoding.sh index 23e89ae623..7cff41a350 100755 --- a/t/t0028-working-tree-encoding.sh +++ b/t/t0028-working-tree-encoding.sh @@ -52,7 +52,7 @@ test_expect_success 're-encode to UTF-16 on checkout' ' ' test_expect_success 'check $GIT_DIR/info/attributes support' ' - test_when_finished "rm -f test.utf8.raw test.utf32.raw test.utf32.git" && + test_when_finished "rm -f test.utf32.git" && test_when_finished "git reset --hard HEAD" && echo "*.utf32 text working-tree-encoding=utf-32" >.git/info/attributes && @@ -75,19 +75,19 @@ do # In these cases the BOM is prohibited. cp bebom.utf${i}be.raw bebom.utf${i}be && test_must_fail git add bebom.utf${i}be 2>err.out && - test_i18ngrep "fatal: BOM is prohibited .* UTF-${i}BE" err.out && + test_i18ngrep "fatal: BOM is prohibited .* utf-${i}be" err.out && cp lebom.utf${i}le.raw lebom.utf${i}be && test_must_fail git add lebom.utf${i}be 2>err.out && - test_i18ngrep "fatal: BOM is prohibited .* UTF-${i}BE" err.out && + test_i18ngrep "fatal: BOM is prohibited .* utf-${i}be" err.out && cp bebom.utf${i}be.raw bebom.utf${i}le && test_must_fail git add bebom.utf${i}le 2>err.out && - test_i18ngrep "fatal: BOM is prohibited .* UTF-${i}LE" err.out && + test_i18ngrep "fatal: BOM is prohibited .* utf-${i}le" err.out && cp lebom.utf${i}le.raw lebom.utf${i}le && test_must_fail git add lebom.utf${i}le 2>err.out && - test_i18ngrep "fatal: BOM is prohibited .* UTF-${i}LE" err.out + test_i18ngrep "fatal: BOM is prohibited .* utf-${i}le" err.out ' test_expect_success "check required UTF-${i} BOM" ' @@ -97,11 +97,11 @@ do cp nobom.utf${i}be.raw nobom.utf${i} && test_must_fail git add nobom.utf${i} 2>err.out && - test_i18ngrep "fatal: BOM is required .* UTF-${i}" err.out && + test_i18ngrep "fatal: BOM is required .* utf-${i}" err.out && cp nobom.utf${i}le.raw nobom.utf${i} && test_must_fail git add nobom.utf${i} 2>err.out && - test_i18ngrep "fatal: BOM is required .* UTF-${i}" err.out + test_i18ngrep "fatal: BOM is required .* utf-${i}" err.out ' test_expect_success "eol conversion for UTF-${i} encoded files on checkout" ' @@ -141,7 +141,6 @@ do done test_expect_success 'check unsupported encodings' ' - test_when_finished "rm -f err.out" && test_when_finished "git reset --hard HEAD" && echo "*.nothing text working-tree-encoding=" >>.gitattributes && @@ -156,7 +155,6 @@ test_expect_success 'check unsupported encodings' ' test_expect_success 'error if encoding round trip is not the same during refresh' ' BEFORE_STATE=$(git rev-parse HEAD) && - test_when_finished "rm -f err.out" && test_when_finished "git reset --hard $BEFORE_STATE" && # Add and commit a UTF-16 file but skip the "working-tree-encoding" @@ -176,7 +174,6 @@ test_expect_success 'error if encoding round trip is not the same during refresh test_expect_success 'error if encoding garbage is already in Git' ' BEFORE_STATE=$(git rev-parse HEAD) && - test_when_finished "rm -f err.out" && test_when_finished "git reset --hard $BEFORE_STATE" && # Skip the UTF-16 filter for the added file @@ -219,14 +216,14 @@ test_expect_success 'check roundtrip encoding' ' # ... unless we tell Git to check it! GIT_TRACE=1 git -c core.checkRoundtripEncoding="UTF-16, UTF-32" \ add roundtrip.utf16 2>&1 | - grep "Checking roundtrip encoding for UTF-16" && + grep "Checking roundtrip encoding for utf-16" && git reset && # ... unless we tell Git to check it! # (here we also check that the casing of the encoding is irrelevant) GIT_TRACE=1 git -c core.checkRoundtripEncoding="UTF-32, utf-16" \ add roundtrip.utf16 2>&1 | - grep "Checking roundtrip encoding for UTF-16" && + grep "Checking roundtrip encoding for utf-16" && git reset ' diff --git a/utf8.c b/utf8.c index 5113d26e56..81c6678df1 100644 --- a/utf8.c +++ b/utf8.c @@ -552,11 +552,13 @@ static const char utf32_le_bom[] = {0xFF, 0xFE, 0x00, 0x00}; int has_prohibited_utf_bom(const char *enc, const char *data, size_t len) { return ( - (!strcmp(enc, "UTF-16BE") || !strcmp(enc, "UTF-16LE")) && + (!strcasecmp(enc, "UTF-16BE") || !strcasecmp(enc, "UTF-16LE") || + !strcasecmp(enc, "UTF16BE") || !strcasecmp(enc, "UTF16LE")) && (has_bom_prefix(data, len, utf16_be_bom, sizeof(utf16_be_bom)) || has_bom_prefix(data, len, utf16_le_bom, sizeof(utf16_le_bom))) ) || ( - (!strcmp(enc, "UTF-32BE") || !strcmp(enc, "UTF-32LE")) && + (!strcasecmp(enc, "UTF-32BE") || !strcasecmp(enc, "UTF-32LE") || + !strcasecmp(enc, "UTF32BE") || !strcasecmp(enc, "UTF32LE")) && (has_bom_prefix(data, len, utf32_be_bom, sizeof(utf32_be_bom)) || has_bom_prefix(data, len, utf32_le_bom, sizeof(utf32_le_bom))) ); @@ -565,11 +567,11 @@ int has_prohibited_utf_bom(const char *enc, const char *data, size_t len) int is_missing_required_utf_bom(const char *enc, const char *data, size_t len) { return ( - !strcmp(enc, "UTF-16") && + (!strcasecmp(enc, "UTF-16") || !strcasecmp(enc, "UTF16")) && !(has_bom_prefix(data, len, utf16_be_bom, sizeof(utf16_be_bom)) || has_bom_prefix(data, len, utf16_le_bom, sizeof(utf16_le_bom))) ) || ( - !strcmp(enc, "UTF-32") && + (!strcasecmp(enc, "UTF-32") || !strcasecmp(enc, "UTF32")) && !(has_bom_prefix(data, len, utf32_be_bom, sizeof(utf32_be_bom)) || has_bom_prefix(data, len, utf32_le_bom, sizeof(utf32_le_bom))) ); ### Patches Lars Schneider (9): strbuf: remove unnecessary NUL assignment in xstrdup_tolower() strbuf: add xstrdup_toupper() strbuf: add a case insensitive starts_with() utf8: add function to detect prohibited UTF-16/32 BOM utf8: add function to detect a missing UTF-16/32 BOM convert: add 'working-tree-encoding' attribute convert: check for detectable errors in UTF encodings convert: add tracing for 'working-tree-encoding' attribute convert: add round trip check based on 'core.checkRoundtripEncoding' Documentation/config.txt | 6 + Documentation/gitattributes.txt | 88 +++++++++++++ config.c | 5 + convert.c | 268 ++++++++++++++++++++++++++++++++++++++- convert.h | 2 + environment.c | 1 + git-compat-util.h | 1 + sha1_file.c | 2 +- strbuf.c | 22 +++- strbuf.h | 1 + t/t0028-working-tree-encoding.sh | 230 +++++++++++++++++++++++++++++++++ utf8.c | 39 ++++++ utf8.h | 28 ++++ 13 files changed, 690 insertions(+), 3 deletions(-) create mode 100755 t/t0028-working-tree-encoding.sh base-commit: 8a2f0888555ce46ac87452b194dec5cb66fb1417 -- 2.16.2