Am 31.03.23 um 22:44 schrieb René Scharfe: > Am 30.03.23 um 09:55 schrieb Diomidis Spinellis: >> On 30-Mar-23 1:55, Eric Sunshine wrote: >>> I'm encountering a failure on macOS High Sierra 10.13.6 when using >>> --color-words: >> >> The built-in word separation regular expression pattern for the Perl language fails to work with the macOS regex engine. The same also happens with the FreeBSD one (tested on 14.0). >> >> The issue can be replicated through the following sequence of commands. >> >> git init color-words >> cd color-words >> echo '*.pl diff=perl' >.gitattributes >> echo 'print 42;' >t.pl >> git add t.pl >> git commit -am Add >> git show --color-words > > Or in Git's own repo: > > $ git log -p --color-words --no-merges '*.c' > Schwerwiegend: invalid regular expression: [a-zA-Z_][a-zA-Z0-9_]*|[0-9][0-9.]*([Ee][-+]?[0-9]+)?[fFlLuU]*|0[xXbB][0-9a-fA-F]+[lLuU]*|\.[0-9][0-9]*([Ee][-+]?[0-9]+)?[fFlL]?|[-+*/<>%&^|=!]=|--|\+\+|<<=?|>>=?|&&|\|\||::|->\*?|\.\*|<=>|[^[:space:]]|[<C0>-<FF>][<80>-<BF>]+ > commit 14b9a044798ebb3858a1f1a1377309a3d6054ac8 > [...] > > The error disappears when localization is turned off: > > $ LANG=C git log -p --color-words --no-merges '*.c' >/dev/null > # just finishes without an error > > The issue also vanishes when the "|[\xc0-\xff][\x80-\xbf]+" part is > removed that the macros PATTERNS and IPATTERN in userdiff.c append. > > So it seems regcomp(1) on macOS doesn't like invalid Unicode characters > unless it's in ASCII mode (LANG=C). 664d44ee7f (userdiff: simplify > word-diff safeguard, 2011-01-11) explains that this part exists to match > a multi-byte UTF-8 character. With a regcomp(1) that supports > multi-byte characters natively they need to be specified differently, I > guess, perhaps like this "[^\x00-\x7f]"? Actually we can drop the "|[\xc0-\xff][\x80-\xbf]+" part in that case because the "[^[:space:]]" suffices. And we probably need to do that at runtime because it depends on the locale. The rather elaborate patch below does that. It leaks the truncated word_regex, which isn't that bad because it's done only once per run, but certainly untidy. I suspect/hope this can be done simpler and cleaner after refactoring the userdiff code to allow for runtime assembly of regular expressions. And it's regcomp(3), or rather regexec(3), not regcomp(1). --- userdiff.c | 38 ++++++++++++++++++++++++++++++++++++-- userdiff.h | 2 ++ 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/userdiff.c b/userdiff.c index 09203fbc35..aa2cd150ba 100644 --- a/userdiff.c +++ b/userdiff.c @@ -9,6 +9,8 @@ static struct userdiff_driver *drivers; static int ndrivers; static int drivers_alloc; +#define OR_MULTI_BYTE_CHAR "|[\xc0-\xff][\x80-\xbf]+" + #define PATTERNS(lang, rx, wrx) { \ .name = lang, \ .binary = -1, \ @@ -16,7 +18,9 @@ static int drivers_alloc; .pattern = rx, \ .cflags = REG_EXTENDED, \ }, \ - .word_regex = wrx "|[^[:space:]]|[\xc0-\xff][\x80-\xbf]+", \ + .word_regex = wrx "|[^[:space:]]" OR_MULTI_BYTE_CHAR, \ + .is_builtin = 1, \ + .has_multi_byte_char_fallback = 1, \ } #define IPATTERN(lang, rx, wrx) { \ .name = lang, \ @@ -25,7 +29,9 @@ static int drivers_alloc; .pattern = rx, \ .cflags = REG_EXTENDED | REG_ICASE, \ }, \ - .word_regex = wrx "|[^[:space:]]|[\xc0-\xff][\x80-\xbf]+", \ + .word_regex = wrx "|[^[:space:]]" OR_MULTI_BYTE_CHAR, \ + .is_builtin = 1, \ + .has_multi_byte_char_fallback = 1, \ } /* @@ -330,6 +336,25 @@ static int userdiff_find_by_namelen_cb(struct userdiff_driver *driver, return 0; } +static int regexec_support_multi_byte_chars(void) +{ + static const char not_space[] = "[^[:space:]]"; + static const char utf8_multi_byte_char[] = "\xc2\xa3"; + regex_t re; + regmatch_t match; + static int result = -1; + + if (result != -1) + return result; + if (regcomp(&re, not_space, REG_EXTENDED)) + BUG("invalid regular expression: %s", not_space); + result = !regexec(&re, utf8_multi_byte_char, 1, &match, 0) && + match.rm_so == 0 && + match.rm_eo == strlen(utf8_multi_byte_char); + regfree(&re); + return result; +} + static struct userdiff_driver *userdiff_find_by_namelen(const char *name, size_t len) { struct find_by_namelen_data udcbdata = { @@ -337,6 +362,15 @@ static struct userdiff_driver *userdiff_find_by_namelen(const char *name, size_t .len = len, }; for_each_userdiff_driver(userdiff_find_by_namelen_cb, &udcbdata); + if (udcbdata.driver && + udcbdata.driver->is_builtin && + udcbdata.driver->has_multi_byte_char_fallback && + regexec_support_multi_byte_chars()) { + const char *word_regex = udcbdata.driver->word_regex; + udcbdata.driver->word_regex = xmemdupz(word_regex, + strlen(word_regex) - strlen(OR_MULTI_BYTE_CHAR)); + udcbdata.driver->has_multi_byte_char_fallback = 0; + } return udcbdata.driver; } diff --git a/userdiff.h b/userdiff.h index 24419db697..83f5863d58 100644 --- a/userdiff.h +++ b/userdiff.h @@ -21,6 +21,8 @@ struct userdiff_driver { const char *textconv; struct notes_cache *textconv_cache; int textconv_want_cache; + int is_builtin; + int has_multi_byte_char_fallback; }; enum userdiff_driver_type { USERDIFF_DRIVER_TYPE_BUILTIN = 1<<0, -- 2.40.0