While stress testing `git filter-repo`, I noticed an issue with encoding;
further digging led to the fixes and features in this series. See the
individual commit messages for details.

Changes since v4 (full range-diff below):
  * Used git_parse_maybe_bool()
  * Updated Documentation/git-fast-export.txt to document the new option

Elijah Newren (5):
  t9350: fix encoding test to actually test reencoding
  fast-import: support 'encoding' commit header
  fast-export: avoid stripping encoding header if we cannot reencode
  fast-export: differentiate between explicitly utf-8 and implicitly utf-8
  fast-export: do automatic reencoding of commit messages only if requested

 Documentation/git-fast-export.txt            |  7 ++
 Documentation/git-fast-import.txt            |  7 ++
 builtin/fast-export.c                        | 55 ++++++++++++--
 fast-import.c                                | 11 ++-
 t/t9300-fast-import.sh                       | 20 +++++
 t/t9350-fast-export.sh                       | 78 +++++++++++++++++---
 t/t9350/broken-iso-8859-7-commit-message.txt |  1 +
 t/t9350/simple-iso-8859-7-commit-message.txt |  1 +
 8 files changed, 163 insertions(+), 17 deletions(-)
 create mode 100644 t/t9350/broken-iso-8859-7-commit-message.txt
 create mode 100644 t/t9350/simple-iso-8859-7-commit-message.txt

Range-diff:
1:  37a68a0ffd = 1:  37a68a0ffd t9350: fix encoding test to actually test reencoding
2:  3d84f4613d = 2:  3d84f4613d fast-import: support 'encoding' commit header
3:  baa8394a3a = 3:  baa8394a3a fast-export: avoid stripping encoding header if we cannot reencode
4:  49960164c6 = 4:  49960164c6 fast-export: differentiate between explicitly utf-8 and implicitly utf-8
5:  571613a09e ! 5:  d8be4ee826 fast-export: do automatic reencoding of commit messages only if requested
    @@ -13,6 +13,24 @@

         Signed-off-by: Elijah Newren <newren@xxxxxxxxx>

    + diff --git a/Documentation/git-fast-export.txt b/Documentation/git-fast-export.txt
    + --- a/Documentation/git-fast-export.txt
    + +++ b/Documentation/git-fast-export.txt
    +@@
    +   for intermediary filters (e.g.
for rewriting commit messages
    +   which refer to older commits, or for stripping blobs by id).
    +
    ++--reencode=(yes|no|abort)::
    ++  Specify how to handle `encoding` header in commit objects. When
    ++  asking to 'abort' (which is the default), this program will die
    ++  when encountering such a commit object. With 'yes', the commit
    ++  message will be reencoded into UTF-8. With 'no', the original
    ++  encoding will be preserved.
    ++
    + --refspec::
    +   Apply the specified refspec to each ref exported. Multiple of them can
    +   be specified.
    +
      diff --git a/builtin/fast-export.c b/builtin/fast-export.c
      --- a/builtin/fast-export.c
      +++ b/builtin/fast-export.c
    @@ -31,14 +49,25 @@
      +static int parse_opt_reencode_mode(const struct option *opt,
      +                                   const char *arg, int unset)
      +{
    -+        if (unset || !strcmp(arg, "abort"))
    ++        if (unset) {
      +                reencode_mode = REENCODE_ABORT;
    -+        else if (!strcmp(arg, "yes") || !strcmp(arg, "true") || !strcmp(arg, "on"))
    -+                reencode_mode = REENCODE_YES;
    -+        else if (!strcmp(arg, "no") || !strcmp(arg, "false") || !strcmp(arg, "off"))
    ++                return 0;
    ++        }
    ++
    ++        switch (git_parse_maybe_bool(arg)) {
    ++        case 0:
      +                reencode_mode = REENCODE_NO;
    -+        else
    -+                return error("Unknown reencoding mode: %s", arg);
    ++                break;
    ++        case 1:
    ++                reencode_mode = REENCODE_YES;
    ++                break;
    ++        default:
    ++                if (arg && !strcasecmp(arg, "abort"))
    ++                        reencode_mode = REENCODE_ABORT;
    ++                else
    ++                        return error("Unknown reencoding mode: %s", arg);
    ++        }
    ++
      +        return 0;
      +}
      +
--
2.21.0.782.gd8be4ee826