While stress testing `git filter-repo`, I noticed an issue with encoding; further digging led to the fixes and features in this series. See the individual commit messages for details. Changes since v3 (full range-diff below): * renamed REENCODE_PLEASE/REENCODE_NEVER to REENCODE_YES/REENCODE_NO, as suggested by Torsten * --reencode now accepts more boolean synonyms (true/on and false/off, in addition to yes/no), as suggested by Junio * check for the exact expected special bytes, in addition to the size (Dscho pointed out that it was Git for Windows that munged bytes, not Windows itself, so while I need to be careful in what I pass to git, printf and grep can work directly with the special bytes) * also checked on gitgitgadget that it passes on the major platforms (see [1]) [1] https://github.com/gitgitgadget/git/pull/191 Elijah Newren (5): t9350: fix encoding test to actually test reencoding fast-import: support 'encoding' commit header fast-export: avoid stripping encoding header if we cannot reencode fast-export: differentiate between explicitly utf-8 and implicitly utf-8 fast-export: do automatic reencoding of commit messages only if requested Documentation/git-fast-import.txt | 7 ++ builtin/fast-export.c | 44 +++++++++-- fast-import.c | 11 ++- t/t9300-fast-import.sh | 20 +++++ t/t9350-fast-export.sh | 78 +++++++++++++++++--- t/t9350/broken-iso-8859-7-commit-message.txt | 1 + t/t9350/simple-iso-8859-7-commit-message.txt | 1 + 7 files changed, 145 insertions(+), 17 deletions(-) create mode 100644 t/t9350/broken-iso-8859-7-commit-message.txt create mode 100644 t/t9350/simple-iso-8859-7-commit-message.txt Range-diff: 1: 2d7bb64acf ! 1: 37a68a0ffd t9350: fix encoding test to actually test reencoding @@ -39,18 +39,16 @@ git fast-import && + # The commit object, if not re-encoded, would be 240 bytes. + # Removing the "encoding iso-8859-7\n" header drops 20 bytes. -+ # Re-encoding the Pi character from \xF0 in iso-8859-7 to -+ # \xCF\x80 in utf-8 adds a byte. 
Grepping for specific bytes -+ # would be nice, but Windows apparently munges user data -+ # in the form of bytes on the command line to force them to -+ # be characters instead, so we are limited for portability -+ # reasons in subsequent similar tests in this file to check -+ # for size rather than what bytes are present. ++ # Re-encoding the Pi character from \xF0 (\360) in iso-8859-7 ++ # to \xCF\x80 (\317\200) in utf-8 adds a byte. Check for ++ # the expected size. + test 221 -eq "$(git cat-file -s i18n)" && -+ # Also make sure the commit does not have the "encoding" header ++ # ...and for the expected translation of bytes. git cat-file commit i18n >actual && - grep "Áéí óú" actual) - ++ grep $(printf "\317\200") actual && ++ # Also make sure the commit does not have the "encoding" header + ! grep ^encoding actual) ' + 2: 9fa5695017 = 2: 3d84f4613d fast-import: support 'encoding' commit header 3: dfc76573e9 ! 3: baa8394a3a fast-export: avoid stripping encoding header if we cannot reencode @@ -49,10 +49,14 @@ + (cd new && + git fast-import && + git cat-file commit i18n-invalid >actual && ++ # Make sure the commit still has the encoding header + grep ^encoding actual && -+ # Also verify that the commit has the expected size; i.e. ++ # Verify that the commit has the expected size; i.e. + # that no bytes were re-encoded to a different encoding. -+ test 252 -eq "$(git cat-file -s i18n-invalid)") ++ test 252 -eq "$(git cat-file -s i18n-invalid)" && ++ # ...and check for the original special bytes ++ grep $(printf "\360") actual && ++ grep $(printf "\377") actual) +' + test_expect_success 'import/export-marks' ' 4: 83b3656b76 = 4: 49960164c6 fast-export: differentiate between explicitly utf-8 and implicitly utf-8 5: 2063122293 ! 
5: 571613a09e fast-export: do automatic reencoding of commit messages only if requested @@ -20,7 +20,7 @@ static int progress; static enum { SIGNED_TAG_ABORT, VERBATIM, WARN, WARN_STRIP, STRIP } signed_tag_mode = SIGNED_TAG_ABORT; static enum { TAG_FILTERING_ABORT, DROP, REWRITE } tag_of_filtered_mode = TAG_FILTERING_ABORT; -+static enum { REENCODE_ABORT, REENCODE_PLEASE, REENCODE_NEVER } reencode_mode = REENCODE_ABORT; ++static enum { REENCODE_ABORT, REENCODE_YES, REENCODE_NO } reencode_mode = REENCODE_ABORT; static int fake_missing_tagger; static int use_done_feature; static int no_data; @@ -33,10 +33,10 @@ +{ + if (unset || !strcmp(arg, "abort")) + reencode_mode = REENCODE_ABORT; -+ else if (!strcmp(arg, "yes")) -+ reencode_mode = REENCODE_PLEASE; -+ else if (!strcmp(arg, "no")) -+ reencode_mode = REENCODE_NEVER; ++ else if (!strcmp(arg, "yes") || !strcmp(arg, "true") || !strcmp(arg, "on")) ++ reencode_mode = REENCODE_YES; ++ else if (!strcmp(arg, "no") || !strcmp(arg, "false") || !strcmp(arg, "off")) ++ reencode_mode = REENCODE_NO; + else + return error("Unknown reencoding mode: %s", arg); + return 0; @@ -56,14 +56,14 @@ - reencoded = reencode_string(message, "UTF-8", encoding); + } else if (encoding) { + switch(reencode_mode) { -+ case REENCODE_PLEASE: ++ case REENCODE_YES: + reencoded = reencode_string(message, "UTF-8", encoding); + break; -+ case REENCODE_NEVER: ++ case REENCODE_NO: + break; + case REENCODE_ABORT: + die("Encountered commit-specific encoding %s in commit " -+ "%s; use --reencode=<mode> to handle it", ++ "%s; use --reencode=[yes|no] to handle it", + encoding, oid_to_hex(&commit->object.oid)); + } + } @@ -126,13 +126,14 @@ + git fast-import && + # The commit object, if not re-encoded, is 240 bytes. + # Removing the "encoding iso-8859-7\n" header would drops 20 -+ # bytes. Re-encoding the Pi character from \xF0 in -+ # iso-8859-7 to \xCF\x80 in utf-8 would add a byte. 
I would -+ # grep for the # specific bytes, but Windows lamely does not -+ # allow that, so just search for the expected size. ++ # bytes. Re-encoding the Pi character from \xF0 (\360) in ++ # iso-8859-7 to \xCF\x80 (\317\200) in utf-8 adds a byte. ++ # Check for the expected size... + test 240 -eq "$(git cat-file -s i18n-no-recoding)" && -+ # Also make sure the commit has the "encoding" header ++ # ...as well as the expected byte. + git cat-file commit i18n-no-recoding >actual && ++ grep $(printf "\360") actual && ++ # Also make sure the commit has the "encoding" header + grep ^encoding actual) +' + -- 2.21.0.782.g571613a09e