Re: regex compilation error with --color-words

Junio C Hamano <gitster@xxxxxxxxx> · Mon, 03 Apr 2023 09:29:20 -0700

René Scharfe <l.s.r@xxxxxx> writes:

> Actually we can drop the "|[\xc0-\xff][\x80-\xbf]+" part in that case
> because the "[^[:space:]]" suffices.  And we probably need to do that at
> runtime because it depends on the locale.  The rather elaborate patch
> below does that.  It leaks the truncated word_regex, which isn't that
> bad because it's done only once per run, but certainly untidy.

Small ugliness like what we see below is fine in a technology
demonstration.

> I suspect/hope this can be done simpler and cleaner after refactoring
> the userdiff code to allow for runtime assembly of regular expressions.

Do we expect "does the regcomp(3) and regexec(3) correctly match a
non-space multi-byte UTF-8 sequence as expected?" to be the only
choices, do we expect we will choose from only two, and do we expect
that the differences between the MB version and fallback version to
be the same "OR_MULTI_BYTE_CHAR may be omitted"?  For now I think
it would be reasonable to answer yes to all three.

How are .is_builtin and .has_multi_byte_char_fallback bits expected
to be used?  For what kind of files do we expect them to be set
differently?

In the simplest case, I would imagine that we could do this

 	...
 	const char *word_regex;
+	const char *word_regex_wo_mb;
 	const char *textconv;
 	...

in the definition of "struct userdiff_driver", use

 #define PATTERNS(lang, rx, wrx) { \
 	...
 	.word_regex = wrx "|[^[:space:]]|[\xc0-\xff][\x80-\xbf]+", \
+	.word_regex_wo_mb = wrx "|[^[:space:]]", \
 }

and similar for IPATTERN, and make a non-NULL .word_regex_wo_mb
serve as the .has_multi_byte_char_fallback bit to trigger "does our
regex engine do a good job for multi-byte?"

Thanks.

> diff --git a/userdiff.c b/userdiff.c
> index 09203fbc35..aa2cd150ba 100644
> --- a/userdiff.c
> +++ b/userdiff.c
> @@ -9,6 +9,8 @@ static struct userdiff_driver *drivers;
>  static int ndrivers;
>  static int drivers_alloc;
>
> +#define OR_MULTI_BYTE_CHAR "|[\xc0-\xff][\x80-\xbf]+"
> +
>  #define PATTERNS(lang, rx, wrx) { \
>  	.name = lang, \
>  	.binary = -1, \
> @@ -16,7 +18,9 @@ static int drivers_alloc;
>  		.pattern = rx, \
>  		.cflags = REG_EXTENDED, \
>  	}, \
> -	.word_regex = wrx "|[^[:space:]]|[\xc0-\xff][\x80-\xbf]+", \
> +	.word_regex = wrx "|[^[:space:]]" OR_MULTI_BYTE_CHAR, \
> +	.is_builtin = 1, \
> +	.has_multi_byte_char_fallback = 1, \
>  }
>  #define IPATTERN(lang, rx, wrx) { \
>  	.name = lang, \
> @@ -25,7 +29,9 @@ static int drivers_alloc;
>  		.pattern = rx, \
>  		.cflags = REG_EXTENDED | REG_ICASE, \
>  	}, \
> -	.word_regex = wrx "|[^[:space:]]|[\xc0-\xff][\x80-\xbf]+", \
> +	.word_regex = wrx "|[^[:space:]]" OR_MULTI_BYTE_CHAR, \
> +	.is_builtin = 1, \
> +	.has_multi_byte_char_fallback = 1, \
>  }
>
>  /*
> @@ -330,6 +336,25 @@ static int userdiff_find_by_namelen_cb(struct userdiff_driver *driver,
>  	return 0;
>  }
>
> +static int regexec_support_multi_byte_chars(void)
> +{
> +	static const char not_space[] = "[^[:space:]]";
> +	static const char utf8_multi_byte_char[] = "\xc2\xa3";
> +	regex_t re;
> +	regmatch_t match;
> +	static int result = -1;
> +
> +	if (result != -1)
> +		return result;
> +	if (regcomp(&re, not_space, REG_EXTENDED))
> +		BUG("invalid regular expression: %s", not_space);
> +	result = !regexec(&re, utf8_multi_byte_char, 1, &match, 0) &&
> +		match.rm_so == 0 &&
> +		match.rm_eo == strlen(utf8_multi_byte_char);
> +	regfree(&re);
> +	return result;
> +}
> +
>  static struct userdiff_driver *userdiff_find_by_namelen(const char *name, size_t len)
>  {
>  	struct find_by_namelen_data udcbdata = {
> @@ -337,6 +362,15 @@ static struct userdiff_driver *userdiff_find_by_namelen(const char *name, size_t
>  		.len = len,
>  	};
>  	for_each_userdiff_driver(userdiff_find_by_namelen_cb, &udcbdata);
> +	if (udcbdata.driver &&
> +	    udcbdata.driver->is_builtin &&
> +	    udcbdata.driver->has_multi_byte_char_fallback &&
> +	    regexec_support_multi_byte_chars()) {
> +		const char *word_regex = udcbdata.driver->word_regex;
> +		udcbdata.driver->word_regex = xmemdupz(word_regex,
> +			strlen(word_regex) - strlen(OR_MULTI_BYTE_CHAR));
> +		udcbdata.driver->has_multi_byte_char_fallback = 0;
> +	}
>  	return udcbdata.driver;
>  }
>
> diff --git a/userdiff.h b/userdiff.h
> index 24419db697..83f5863d58 100644
> --- a/userdiff.h
> +++ b/userdiff.h
> @@ -21,6 +21,8 @@ struct userdiff_driver {
>  	const char *textconv;
>  	struct notes_cache *textconv_cache;
>  	int textconv_want_cache;
> +	int is_builtin;
> +	int has_multi_byte_char_fallback;
>  };
>  enum userdiff_driver_type {
>  	USERDIFF_DRIVER_TYPE_BUILTIN = 1<<0,
> --
> 2.40.0