From: Lars Schneider <larsxschneider@xxxxxxxxx> Check that new content is valid with respect to the user defined 'working-tree-encoding' attribute. Signed-off-by: Lars Schneider <larsxschneider@xxxxxxxxx> --- convert.c | 50 +++++++++++++++++++++++++++++++++++ t/t0028-working-tree-encoding.sh | 57 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+) diff --git a/convert.c b/convert.c index 2f864df258..9647b06679 100644 --- a/convert.c +++ b/convert.c @@ -266,6 +266,53 @@ static int will_convert_lf_to_crlf(size_t len, struct text_stat *stats, } +static int validate_encoding(const char *path, const char *enc, + const char *data, size_t len, int die_on_error) +{ + if (!memcmp("UTF-", enc, 4)) { + /* + * Check for detectable errors in UTF encodings + */ + if (has_prohibited_utf_bom(enc, data, len)) { + const char *error_msg = _( + "BOM is prohibited in '%s' if encoded as %s"); + /* + * This advice is shown for UTF-??BE and UTF-??LE + * encodings. We truncate the encoding name to 6 + * chars with %.6s to cut off the last two "byte + * order" characters. + */ + const char *advise_msg = _( + "The file '%s' contains a byte order " + "mark (BOM). Please use %.6s as " + "working-tree-encoding."); + advise(advise_msg, path, enc); + if (die_on_error) + die(error_msg, path, enc); + else { + return error(error_msg, path, enc); + } + + } else if (is_missing_required_utf_bom(enc, data, len)) { + const char *error_msg = _( + "BOM is required in '%s' if encoded as %s"); + const char *advise_msg = _( + "The file '%s' is missing a byte order " + "mark (BOM). Please use %sBE or %sLE " + "(depending on the byte order) as " + "working-tree-encoding."); + advise(advise_msg, path, enc, enc); + if (die_on_error) + die(error_msg, path, enc); + else { + return error(error_msg, path, enc); + } + } + + } + return 0; +} + static const char *default_encoding = "UTF-8"; static int encode_to_git(const char *path, const char *src, size_t src_len, @@ -291,6 +338,9 @@ static int encode_to_git(const char *path, const char *src, size_t src_len, if (!buf && !src) return 1; + if (validate_encoding(path, enc, src, src_len, die_on_error)) + return 0; + dst = reencode_string_len(src, src_len, default_encoding, enc, &dst_len); if (!dst) { diff --git a/t/t0028-working-tree-encoding.sh b/t/t0028-working-tree-encoding.sh index 71e8e3700b..5c7e36a164 100755 --- a/t/t0028-working-tree-encoding.sh +++ b/t/t0028-working-tree-encoding.sh @@ -62,6 +62,46 @@ test_expect_success 'check $GIT_DIR/info/attributes support' ' for i in 16 32 do + test_expect_success "check prohibited UTF-${i} BOM" ' + test_when_finished "git reset --hard HEAD" && + + echo "*.utf${i}be text working-tree-encoding=utf-${i}be" >>.gitattributes && + echo "*.utf${i}le text working-tree-encoding=utf-${i}le" >>.gitattributes && + + # Here we add a UTF-16 (resp. UTF-32) files with BOM (big/little-endian) + # but we tell Git to treat it as UTF-16BE/UTF-16LE (resp. UTF-32). + # In these cases the BOM is prohibited. + cp bebom.utf${i}be.raw bebom.utf${i}be && + test_must_fail git add bebom.utf${i}be 2>err.out && + test_i18ngrep "fatal: BOM is prohibited .* UTF-${i}BE" err.out && + + cp lebom.utf${i}le.raw lebom.utf${i}be && + test_must_fail git add lebom.utf${i}be 2>err.out && + test_i18ngrep "fatal: BOM is prohibited .* UTF-${i}BE" err.out && + + cp bebom.utf${i}be.raw bebom.utf${i}le && + test_must_fail git add bebom.utf${i}le 2>err.out && + test_i18ngrep "fatal: BOM is prohibited .* UTF-${i}LE" err.out && + + cp lebom.utf${i}le.raw lebom.utf${i}le && + test_must_fail git add lebom.utf${i}le 2>err.out && + test_i18ngrep "fatal: BOM is prohibited .* UTF-${i}LE" err.out + ' + + test_expect_success "check required UTF-${i} BOM" ' + test_when_finished "git reset --hard HEAD" && + + echo "*.utf${i} text working-tree-encoding=utf-${i}" >>.gitattributes && + + cp nobom.utf${i}be.raw nobom.utf${i} && + test_must_fail git add nobom.utf${i} 2>err.out && + test_i18ngrep "fatal: BOM is required .* UTF-${i}" err.out && + + cp nobom.utf${i}le.raw nobom.utf${i} && + test_must_fail git add nobom.utf${i} 2>err.out && + test_i18ngrep "fatal: BOM is required .* UTF-${i}" err.out + ' + test_expect_success "eol conversion for UTF-${i} encoded files on checkout" ' test_when_finished "rm -f crlf.utf${i}.raw lf.utf${i}.raw" && test_when_finished "git reset --hard HEAD^" && @@ -132,4 +172,21 @@ test_expect_success 'error if encoding round trip is not the same during refresh test_i18ngrep "error: .* overwritten by checkout:" err.out ' +test_expect_success 'error if encoding garbage is already in Git' ' + BEFORE_STATE=$(git rev-parse HEAD) && + test_when_finished "rm -f err.out" && + test_when_finished "git reset --hard $BEFORE_STATE" && + + # Skip the UTF-16 filter for the added file + # This simulates a Git version that has no checkoutEncoding support + cp nobom.utf16be.raw nonsense.utf16 && + TEST_HASH=$(git hash-object --no-filters -w nonsense.utf16) && + git update-index --add --cacheinfo 100644 $TEST_HASH nonsense.utf16 && + COMMIT=$(git commit-tree -p $(git rev-parse HEAD) -m "plain commit" $(git write-tree)) && + git update-ref refs/heads/master $COMMIT && + + git diff 2>err.out && + test_i18ngrep "error: BOM is required" err.out +' + test_done -- 2.16.2