Since acabd2048e (grep: correctly identify utf-8 characters with \{b,w} in -P, 2023-01-08), PCRE2's UCP mode has been enabled when UTF content was expected and therefore muktibyte characters are included as part of all character classes (when defined). note that if the locale used is not UTF enabled (specially: C, POSIX) or uses an extended charmap binary that is not unicode compatible, binary match will be used instead. It was argued that doing so, at least for \d, was not a good idea, as that might not be what the user expected based on its historical meaning and was also slower, and indeed a similar change that was done to GNU grep was reverted and required further tweaks. At that time, PCRE2 didn't have a way to disable UCP's character expansion selectively, but flags to do so, and that will be available in the next release (planned soon) were added, and one of them has been in use by GNU grep since their last release in May (only if built and linked against the prereleased PCRE2 library though). Add flags to make sure that both \d and [:digit:] only include ASCII digits so that `git grep` will be closer to GNU grep and improve performance as a side effect, but add a configuration flag to allow keeping the current behaviour (which is closer to perl and ripgrep). Signed-off-by: Carlo Marcelo Arenas Belón <carenas@xxxxxxxxx> --- Documentation/config/grep.txt | 4 ++++ grep.c | 37 ++++++++++++++++++++++------- grep.h | 5 ++++ t/perf/p7822-grep-perl-character.sh | 11 +++++++-- t/t7818-grep-digit.sh | 32 +++++++++++++++++++++++++ 5 files changed, 78 insertions(+), 11 deletions(-) create mode 100755 t/t7818-grep-digit.sh diff --git a/Documentation/config/grep.txt b/Documentation/config/grep.txt index e521f20390..4e405c8ad1 100644 --- a/Documentation/config/grep.txt +++ b/Documentation/config/grep.txt @@ -26,3 +26,7 @@ grep.fullName:: grep.fallbackToNoIndex:: If set to true, fall back to git grep --no-index if git grep is executed outside of a git repository. Defaults to false. + +grep.perl.digit:: + If set to true, use the perl definitions for \d and [:digit:]. + Defaults to false. diff --git a/grep.c b/grep.c index fc2d0c837a..fec36ccb30 100644 --- a/grep.c +++ b/grep.c @@ -88,6 +88,11 @@ int grep_config(const char *var, const char *value, return 0; } + if (!strcmp(var, "grep.perl.digit")) { + opt->perl_digit = git_config_bool(var, value); + return 0; + } + if (!strcmp(var, "color.grep")) opt->color = git_config_colorbool(var, value); if (!strcmp(var, "color.grep.match")) { @@ -301,6 +306,7 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt int patinforet; size_t jitsizearg; int literal = !opt->ignore_case && (p->fixed || p->is_fixed); + uint32_t xoptions = 0; /* * Call pcre2_general_context_create() before calling any @@ -321,17 +327,22 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt } options |= PCRE2_CASELESS; } - if (!opt->ignore_locale && is_utf8_locale() && !literal) - options |= (PCRE2_UTF | PCRE2_UCP | PCRE2_MATCH_INVALID_UTF); + if (!opt->ignore_locale && is_utf8_locale() && !literal) { + options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF); -#ifndef GIT_PCRE2_VERSION_10_35_OR_HIGHER - /* - * Work around a JIT bug related to invalid Unicode character handling - * fixed in 10.35: - * https://github.com/PCRE2Project/pcre2/commit/c21bd977547d - */ - options &= ~PCRE2_UCP; +#ifdef GIT_PCRE2_VERSION_10_35_OR_HIGHER + /* + * Work around a JIT bug related to invalid Unicode character handling + * fixed in 10.35: + * https://github.com/PCRE2Project/pcre2/commit/c21bd977547d + */ + options |= PCRE2_UCP; +#ifdef GIT_PCRE2_VERSION_10_43_OR_HIGHER + if (!opt->perl_digit) + xoptions |= (PCRE2_EXTRA_ASCII_BSD | PCRE2_EXTRA_ASCII_DIGIT); #endif +#endif + } #ifndef GIT_PCRE2_VERSION_10_36_OR_HIGHER /* Work around https://bugs.exim.org/show_bug.cgi?id=2642 fixed in 10.36 */ @@ -339,6 +350,14 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt options |= PCRE2_NO_START_OPTIMIZE; #endif + if (xoptions) { + if (!p->pcre2_compile_context) + p->pcre2_compile_context = pcre2_compile_context_create(p->pcre2_general_context); + + pcre2_set_compile_extra_options(p->pcre2_compile_context, + xoptions); + } + p->pcre2_pattern = pcre2_compile((PCRE2_SPTR)p->pattern, p->patternlen, options, &error, &erroffset, p->pcre2_compile_context); diff --git a/grep.h b/grep.h index 926c0875c4..cd5c416a0a 100644 --- a/grep.h +++ b/grep.h @@ -4,6 +4,9 @@ #ifdef USE_LIBPCRE2 #define PCRE2_CODE_UNIT_WIDTH 8 #include <pcre2.h> +#if (PCRE2_MAJOR >= 10 && PCRE2_MINOR >= 43) || PCRE2_MAJOR >= 11 +#define GIT_PCRE2_VERSION_10_43_OR_HIGHER +#endif #if (PCRE2_MAJOR >= 10 && PCRE2_MINOR >= 36) || PCRE2_MAJOR >= 11 #define GIT_PCRE2_VERSION_10_36_OR_HIGHER #endif @@ -178,6 +181,8 @@ struct grep_opt { void (*output)(struct grep_opt *opt, const void *data, size_t size); void *output_priv; + + unsigned perl_digit:1; }; #define GREP_OPT_INIT { \ diff --git a/t/perf/p7822-grep-perl-character.sh b/t/perf/p7822-grep-perl-character.sh index 87009c60df..cc88d5a695 100755 --- a/t/perf/p7822-grep-perl-character.sh +++ b/t/perf/p7822-grep-perl-character.sh @@ -8,6 +8,13 @@ etc.) we will test the patterns under those numbers of threads. . ./perf-lib.sh +# setting a LOCALE is needed, but not yet supported by : +#. "$TEST_DIRECTORY"/lib-gettext.sh + +# Invoke like: +# +# LC_ALL=is_IS.utf8 ./p7822-grep-perl-character.sh + test_perf_large_repo test_checkout_worktree @@ -27,13 +34,13 @@ do if ! test_have_prereq PERF_GREP_ENGINES_THREADS then test_perf "grep -P '$pattern'" --prereq PCRE " - git -P grep -f pat || : + git grep -P -f pat || : " else for threads in $GIT_PERF_GREP_THREADS do test_perf "grep -P '$pattern' with $threads threads" --prereq PTHREADS,PCRE " - git -c grep.threads=$threads -P grep -f pat || : + git -c grep.threads=$threads grep -P -f pat || : " done fi diff --git a/t/t7818-grep-digit.sh b/t/t7818-grep-digit.sh new file mode 100755 index 0000000000..44007e6be6 --- /dev/null +++ b/t/t7818-grep-digit.sh @@ -0,0 +1,32 @@ +#!/bin/sh + +test_description='git grep -P with digits' + +TEST_PASSES_SANITIZE_LEAK=true +. ./lib-gettext.sh + +test_expect_success 'setup' ' + echo 2023 >ascii && + printf "\357\274\222\357\274\220\357\274\222\357\274\223\n" >fullwidth && + printf "\331\241\331\244\331\244\331\245\n" >multibyte && + git add . && + git commit -m. && + LC_ALL="$is_IS_locale" && + export LC_ALL +' + +test_expect_success PCRE 'grep -P "\d"' ' + echo "ascii:2023" >expected && + git grep -P "\d{2}[[:digit:]]{2}" >actual && + test_cmp expected actual +' + +test_expect_success PCRE 'git -c grep.perl.digit' ' + test_config grep.perl.digit true && + git grep -P "\d{2}[[:digit:]]{2}" >actual && + grep fullwidth actual && + grep multibyte actual && + test_line_count = 3 actual +' + +test_done -- 2.39.3 (Apple Git-145)