When git-p4 reads the output from a p4 command, it assumes it will be 100% UTF-8. If even one character in the output of one p4 command is not UTF-8, git-p4 crashes with: File "C:/Program Files/Git/bin/git-p4.py", line 774, in p4CmdList value = value.decode() UnicodeDecodeError: 'utf-8' codec can't decode byte 0x93 in position 42: invalid start byte This is especially a problem for the "git p4 clone ... @all" command, where git-p4 needs to read thousands of changeset descriptions, one of which may have a stray smart quote, causing the whole clone operation to fail. Add a new config setting, allowing git-p4 to try a fallback encoding (for example, "cp1252") and/or use the Unicode replacement character, to prevent the whole program from crashing on such a minor problem. Signed-off-by: Tzadik Vanderhoof <tzadik.vanderhoof@xxxxxxxxx> --- Documentation/git-p4.txt | 9 +++ git-p4.py | 11 +++- t/t9835-git-p4-config-fallback-encoding.sh | 76 ++++++++++++++++++++++ 3 files changed, 95 insertions(+), 1 deletion(-) create mode 100755 t/t9835-git-p4-config-fallback-encoding.sh diff --git a/Documentation/git-p4.txt b/Documentation/git-p4.txt index f89e68b424..86d3ffa644 100644 --- a/Documentation/git-p4.txt +++ b/Documentation/git-p4.txt @@ -638,6 +638,15 @@ git-p4.pathEncoding:: to transcode the paths to UTF-8. As an example, Perforce on Windows often uses "cp1252" to encode path names. +git-p4.fallbackEncoding:: + Perforce changeset descriptions can be stored in any encoding. + Git-p4 first tries to interpret each description as UTF-8. If that + fails, this config allows another encoding to be tried. You can specify, + for example, "cp1252". If git-p4.fallbackEncoding is "replace", UTF-8 will + be used, with invalid UTF-8 characters replaced by the Unicode replacement + character. The default is "none": there is no fallback, and any non UTF-8 + character will cause git-p4 to immediately fail. + git-p4.largeFileSystem:: Specify the system that is used for large (binary) files. 
Please note that large file systems do not support the 'git p4 submit' command. diff --git a/git-p4.py b/git-p4.py index 09c9e93ac4..202fb01bdf 100755 --- a/git-p4.py +++ b/git-p4.py @@ -771,7 +771,16 @@ def p4CmdList(cmd, stdin=None, stdin_mode='w+b', cb=None, skip_info=False, for key, value in entry.items(): key = key.decode() if isinstance(value, bytes) and not (key in ('data', 'path', 'clientFile') or key.startswith('depotFile')): - value = value.decode() + try: + value = value.decode() + except UnicodeDecodeError: + fallbackEncoding = gitConfig("git-p4.fallbackEncoding").lower() or 'none' + if fallbackEncoding == 'none': + raise Exception("UTF-8 decoding failed. Consider using git config git-p4.fallbackEncoding") + elif fallbackEncoding == 'replace': + value = value.decode(errors='replace') + else: + value = value.decode(encoding=fallbackEncoding) decoded_entry[key] = value # Parse out data if it's an error response if decoded_entry.get('code') == 'error' and 'data' in decoded_entry: diff --git a/t/t9835-git-p4-config-fallback-encoding.sh b/t/t9835-git-p4-config-fallback-encoding.sh new file mode 100755 index 0000000000..ce352c826b --- /dev/null +++ b/t/t9835-git-p4-config-fallback-encoding.sh @@ -0,0 +1,76 @@ +#!/bin/sh + +test_description='test git-p4.fallbackEncoding config' + +GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main +export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME + +. ./lib-git-p4.sh + +# The Windows build of p4 encodes its command-line arguments according to the +# active code page (which defaults to "cp1252"). As a result, "p4 submit -d" causes +# Unicode changeset descriptions to be stored in the Perforce database as cp1252, +# and a subsequent "git p4 clone" attempting to decode these descriptions as UTF-8 +# will raise a UnicodeDecodeError, necessitating the use of the git-p4.fallbackEncoding config. 
+# +# The Linux build of p4 encodes its command-line arguments as UTF-8, so changeset descriptions +# are stored as UTF-8, and UnicodeDecodeError is never raised by "git p4 clone". + +if test_have_prereq !MINGW,!CYGWIN; then + skip_all='This system is not subject to encoding failures in "git p4 clone"' + test_done +fi + +test_expect_success 'start p4d' ' + start_p4d +' + +test_expect_success 'add cp1252 description' ' + cd "$cli" && + echo file1 >file1 && + p4 add file1 && + p4 submit -d documentación +' + +test_expect_success 'clone fails with git-p4.fallbackEncoding unset' ' + test_might_fail git config --global --unset git-p4.fallbackEncoding && + test_when_finished cleanup_git && + ( + test_must_fail git p4 clone --dest="$git" //depot@all 2>error && + grep "UTF-8 decoding failed. Consider using git config git-p4.fallbackEncoding" error + ) +' +test_expect_success 'clone fails with git-p4.fallbackEncoding set to "none"' ' + git config --global git-p4.fallbackEncoding none && + test_when_finished cleanup_git && + ( + test_must_fail git p4 clone --dest="$git" //depot@all 2>error && + grep "UTF-8 decoding failed. Consider using git config git-p4.fallbackEncoding" error + ) +' + +test_expect_success 'clone succeeds with git-p4.fallbackEncoding set to "cp1252"' ' + git config --global git-p4.fallbackEncoding cp1252 && + test_when_finished cleanup_git && + ( + git p4 clone --dest="$git" //depot@all && + cd "$git" && + git log --oneline >log && + desc=$(head -1 log | cut -d" " -f2) && + test "$desc" = "documentación" + ) +' + +test_expect_success 'clone succeeds with git-p4.fallbackEncoding set to "replace"' ' + git config --global git-p4.fallbackEncoding replace && + test_when_finished cleanup_git && + ( + git p4 clone --dest="$git" //depot@all && + cd "$git" && + git log --oneline >log && + desc=$(head -1 log | cut -d" " -f2) && + test "$desc" = "documentaci�n" + ) +' + +test_done -- 2.31.1