From: Lars Schneider <larsxschneider@xxxxxxxxx> Check that new content is valid with respect to the user defined 'working-tree-encoding' attribute. Signed-off-by: Lars Schneider <larsxschneider@xxxxxxxxx> --- convert.c | 61 +++++++++++++++++++++++++++++++++++++++ t/t0028-working-tree-encoding.sh | 62 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 123 insertions(+) diff --git a/convert.c b/convert.c index 85e49741af..3cab4fa907 100644 --- a/convert.c +++ b/convert.c @@ -266,6 +266,64 @@ static int will_convert_lf_to_crlf(size_t len, struct text_stat *stats, } +static int validate_encoding(const char *path, const char *enc, + const char *data, size_t len, int die_on_error) +{ + /* We only check for UTF here as UTF?? can be an alias for UTF-?? */ + if (istarts_with(enc, "UTF")) { + /* + * Check for detectable errors in UTF encodings + */ + if (has_prohibited_utf_bom(enc, data, len)) { + const char *error_msg = _( + "BOM is prohibited in '%s' if encoded as %s"); + /* + * This advice is shown for UTF-??BE and UTF-??LE encodings. + * We cut off the last two characters of the encoding name + * to generate the encoding name suitable for BOMs. + */ + const char *advise_msg = _( + "The file '%s' contains a byte order " + "mark (BOM). Please use UTF-%s as " + "working-tree-encoding."); + const char *stripped = NULL; + char *upper = xstrdup_toupper(enc); + upper[strlen(upper)-2] = '\0'; + if (!skip_prefix(upper, "UTF-", &stripped)) + skip_prefix(stripped, "UTF", &stripped); + advise(advise_msg, path, stripped); + free(upper); + if (die_on_error) + die(error_msg, path, enc); + else { + return error(error_msg, path, enc); + } + + } else if (is_missing_required_utf_bom(enc, data, len)) { + const char *error_msg = _( + "BOM is required in '%s' if encoded as %s"); + const char *advise_msg = _( + "The file '%s' is missing a byte order " + "mark (BOM). Please use UTF-%sBE or UTF-%sLE " + "(depending on the byte order) as " + "working-tree-encoding."); + const char *stripped = NULL; + char *upper = xstrdup_toupper(enc); + if (!skip_prefix(upper, "UTF-", &stripped)) + skip_prefix(stripped, "UTF", &stripped); + advise(advise_msg, path, stripped, stripped); + free(upper); + if (die_on_error) + die(error_msg, path, enc); + else { + return error(error_msg, path, enc); + } + } + + } + return 0; +} + static const char *default_encoding = "UTF-8"; static int encode_to_git(const char *path, const char *src, size_t src_len, @@ -291,6 +349,9 @@ static int encode_to_git(const char *path, const char *src, size_t src_len, if (!buf && !src) return 1; + if (validate_encoding(path, enc, src, src_len, die_on_error)) + return 0; + dst = reencode_string_len(src, src_len, default_encoding, enc, &dst_len); if (!dst) { diff --git a/t/t0028-working-tree-encoding.sh b/t/t0028-working-tree-encoding.sh index d67dbde1d4..1bb528b339 100755 --- a/t/t0028-working-tree-encoding.sh +++ b/t/t0028-working-tree-encoding.sh @@ -62,6 +62,52 @@ test_expect_success 'check $GIT_DIR/info/attributes support' ' for i in 16 32 do + test_expect_success "check prohibited UTF-${i} BOM" ' + test_when_finished "git reset --hard HEAD" && + + echo "*.utf${i}be text working-tree-encoding=utf-${i}be" >>.gitattributes && + echo "*.utf${i}le text working-tree-encoding=utf-${i}LE" >>.gitattributes && + + # Here we add a UTF-16 (resp. UTF-32) files with BOM (big/little-endian) + # but we tell Git to treat it as UTF-16BE/UTF-16LE (resp. UTF-32). + # In these cases the BOM is prohibited. + cp bebom.utf${i}be.raw bebom.utf${i}be && + test_must_fail git add bebom.utf${i}be 2>err.out && + test_i18ngrep "fatal: BOM is prohibited .* utf-${i}be" err.out && + test_i18ngrep "use UTF-${i} as working-tree-encoding" err.out && + + cp lebom.utf${i}le.raw lebom.utf${i}be && + test_must_fail git add lebom.utf${i}be 2>err.out && + test_i18ngrep "fatal: BOM is prohibited .* utf-${i}be" err.out && + test_i18ngrep "use UTF-${i} as working-tree-encoding" err.out && + + cp bebom.utf${i}be.raw bebom.utf${i}le && + test_must_fail git add bebom.utf${i}le 2>err.out && + test_i18ngrep "fatal: BOM is prohibited .* utf-${i}LE" err.out && + test_i18ngrep "use UTF-${i} as working-tree-encoding" err.out && + + cp lebom.utf${i}le.raw lebom.utf${i}le && + test_must_fail git add lebom.utf${i}le 2>err.out && + test_i18ngrep "fatal: BOM is prohibited .* utf-${i}LE" err.out && + test_i18ngrep "use UTF-${i} as working-tree-encoding" err.out + ' + + test_expect_success "check required UTF-${i} BOM" ' + test_when_finished "git reset --hard HEAD" && + + echo "*.utf${i} text working-tree-encoding=utf-${i}" >>.gitattributes && + + cp nobom.utf${i}be.raw nobom.utf${i} && + test_must_fail git add nobom.utf${i} 2>err.out && + test_i18ngrep "fatal: BOM is required .* utf-${i}" err.out && + test_i18ngrep "use UTF-${i}BE or UTF-${i}LE" err.out && + + cp nobom.utf${i}le.raw nobom.utf${i} && + test_must_fail git add nobom.utf${i} 2>err.out && + test_i18ngrep "fatal: BOM is required .* utf-${i}" err.out && + test_i18ngrep "use UTF-${i}BE or UTF-${i}LE" err.out + ' + test_expect_success "eol conversion for UTF-${i} encoded files on checkout" ' test_when_finished "rm -f crlf.utf${i}.raw lf.utf${i}.raw" && test_when_finished "git reset --hard HEAD^" && @@ -139,4 +185,20 @@ test_expect_success 'error if encoding round trip is not the same during refresh test_i18ngrep "error: .* overwritten by checkout:" err.out ' +test_expect_success 'error if encoding garbage is already in Git' ' + BEFORE_STATE=$(git rev-parse HEAD) && + test_when_finished "git reset --hard $BEFORE_STATE" && + + # Skip the UTF-16 filter for the added file + # This simulates a Git version that has no checkoutEncoding support + cp nobom.utf16be.raw nonsense.utf16 && + TEST_HASH=$(git hash-object --no-filters -w nonsense.utf16) && + git update-index --add --cacheinfo 100644 $TEST_HASH nonsense.utf16 && + COMMIT=$(git commit-tree -p $(git rev-parse HEAD) -m "plain commit" $(git write-tree)) && + git update-ref refs/heads/master $COMMIT && + + git diff 2>err.out && + test_i18ngrep "error: BOM is required" err.out +' + test_done -- 2.16.2