Re: regex compilation error with --color-words

René Scharfe <l.s.r@xxxxxx> · Mon, 3 Apr 2023 21:32:51 +0200

Am 03.04.23 um 18:29 schrieb Junio C Hamano:
> René Scharfe <l.s.r@xxxxxx> writes:
>
>> Actually we can drop the "|[\xc0-\xff][\x80-\xbf]+" part in that case
>> because the "[^[:space:]]" suffices.  And we probably need to do that at
>> runtime because it depends on the locale.  The rather elaborate patch
>> below does that.  It leaks the truncated word_regex, which isn't that
>> bad because it's done only once per run, but certainly untidy.
>
> Small ugliness like what we see below is fine in a technology
> demonostration.
>
>> I suspect/hope this can be done simpler and cleaner after refactoring
>> the userdiff code to allow for runtime assembly of regular expressions.
>
> Do we expect "does the regcomp(3) and regexec(3) correctly match a
> non-space multi-byte UTF-8 sequence as expected?" to be the only
> choices, do we expect we will choose from only two, and do we expect
> that the differences between the MB version and fallback version to
> be the same "OR_MULTI_BYTE_CHAR may be omitted"?  For now I think
> it would be reasonable to answer yes to all three.
>
> How are .is_builtin and .has_multi_byte_char_fallback bits expected
> to be used?  For what kind of files do we expect them to be set
> differently?

.is_builtin is unnecessary here.  It is a remnant of me noticing that we
don't add "|[^[:space:]]|[\xc0-\xff][\x80-\xbf]+" to user-defined
patterns, but .has_multi_byte_char_fallback alone suffices.

>
> In the simplest case, I would imagine that we could do this
>
>  	...
>  	const char *word_regex;
> +	const char *word_regex_wo_mb;
>  	const char *textconv;
>  	...
>
> in the definition of "struct userdifif_driver", use
>
>  #define PATTERNS(lang, rx, wrx) { \
>  	...
>  	.word_regex = wrx "|[^[:space:]]|[\xc0-\xff][\x80-\xbf]+", \
> +	.word_regex_wo_mb = wrx "|[^[:space:]]", \

Ah, nice, no allocation or string manipulation at runtime at all, at
the small cost of having near-duplicate static strings.

>  }
>
> and similar for IPATTERN, and make a non-NULL .word.regex_wo_mb
> serve as the .has_multi_byte_char_fallback bit to trigger "does our
> regex engine do a good job for multi-byte?"
>
> Thanks.
>
>> diff --git a/userdiff.c b/userdiff.c
>> index 09203fbc35..aa2cd150ba 100644
>> --- a/userdiff.c
>> +++ b/userdiff.c
>> @@ -9,6 +9,8 @@ static struct userdiff_driver *drivers;
>>  static int ndrivers;
>>  static int drivers_alloc;
>>
>> +#define OR_MULTI_BYTE_CHAR "|[\xc0-\xff][\x80-\xbf]+"
>> +
>>  #define PATTERNS(lang, rx, wrx) { \
>>  	.name = lang, \
>>  	.binary = -1, \
>> @@ -16,7 +18,9 @@ static int drivers_alloc;
>>  		.pattern = rx, \
>>  		.cflags = REG_EXTENDED, \
>>  	}, \
>> -	.word_regex = wrx "|[^[:space:]]|[\xc0-\xff][\x80-\xbf]+", \
>> +	.word_regex = wrx "|[^[:space:]]" OR_MULTI_BYTE_CHAR, \
>> +	.is_builtin = 1, \
>> +	.has_multi_byte_char_fallback = 1, \
>>  }
>>  #define IPATTERN(lang, rx, wrx) { \
>>  	.name = lang, \
>> @@ -25,7 +29,9 @@ static int drivers_alloc;
>>  		.pattern = rx, \
>>  		.cflags = REG_EXTENDED | REG_ICASE, \
>>  	}, \
>> -	.word_regex = wrx "|[^[:space:]]|[\xc0-\xff][\x80-\xbf]+", \
>> +	.word_regex = wrx "|[^[:space:]]" OR_MULTI_BYTE_CHAR, \
>> +	.is_builtin = 1, \
>> +	.has_multi_byte_char_fallback = 1, \
>>  }
>>
>>  /*
>> @@ -330,6 +336,25 @@ static int userdiff_find_by_namelen_cb(struct userdiff_driver *driver,
>>  	return 0;
>>  }
>>
>> +static int regexec_support_multi_byte_chars(void)
>> +{
>> +	static const char not_space[] = "[^[:space:]]";
>> +	static const char utf8_multi_byte_char[] = "\xc2\xa3";
>> +	regex_t re;
>> +	regmatch_t match;
>> +	static int result = -1;
>> +
>> +	if (result != -1)
>> +		return result;
>> +	if (regcomp(&re, not_space, REG_EXTENDED))
>> +		BUG("invalid regular expression: %s", not_space);
>> +	result = !regexec(&re, utf8_multi_byte_char, 1, &match, 0) &&
>> +		match.rm_so == 0 &&
>> +		match.rm_eo == strlen(utf8_multi_byte_char);
>> +	regfree(&re);
>> +	return result;
>> +}
>> +
>>  static struct userdiff_driver *userdiff_find_by_namelen(const char *name, size_t len)
>>  {
>>  	struct find_by_namelen_data udcbdata = {
>> @@ -337,6 +362,15 @@ static struct userdiff_driver *userdiff_find_by_namelen(const char *name, size_t
>>  		.len = len,
>>  	};
>>  	for_each_userdiff_driver(userdiff_find_by_namelen_cb, &udcbdata);
>> +	if (udcbdata.driver &&
>> +	    udcbdata.driver->is_builtin &&
>> +	    udcbdata.driver->has_multi_byte_char_fallback &&
>> +	    regexec_support_multi_byte_chars()) {
>> +		const char *word_regex = udcbdata.driver->word_regex;
>> +		udcbdata.driver->word_regex = xmemdupz(word_regex,
>> +			strlen(word_regex) - strlen(OR_MULTI_BYTE_CHAR));
>> +		udcbdata.driver->has_multi_byte_char_fallback = 0;
>> +	}
>>  	return udcbdata.driver;
>>  }
>>
>> diff --git a/userdiff.h b/userdiff.h
>> index 24419db697..83f5863d58 100644
>> --- a/userdiff.h
>> +++ b/userdiff.h
>> @@ -21,6 +21,8 @@ struct userdiff_driver {
>>  	const char *textconv;
>>  	struct notes_cache *textconv_cache;
>>  	int textconv_want_cache;
>> +	int is_builtin;
>> +	int has_multi_byte_char_fallback;
>>  };
>>  enum userdiff_driver_type {
>>  	USERDIFF_DRIVER_TYPE_BUILTIN = 1<<0,
>> --
>> 2.40.0