regexec(3) can fail. E.g. on macOS it fails if it is used with an UTF-8 locale to match a valid regex against a buffer containing invalid UTF-8 characters. git grep has two ways to search for matches in a file: Either it splits its contents into lines and matches them separately, or it matches the whole content and figures out line boundaries later. The latter is done by look_ahead() and it's quicker in the common case where most files don't contain a match. Fall back to line-by-line matching if look_ahead() encounters an regexec(3) error by propagating errors out of patmatch() and bailing out of look_ahead() if there is one. This way we at least can find matches in lines that contain only valid characters. That matches the behavior of grep(1) on macOS. pcre2match() dies if pcre2_jit_match() or pcre2_match() fail, but since we use the flag PCRE2_MATCH_INVALID_UTF it handles invalid UTF-8 characters gracefully. So implement the fall-back only for regexec(3) and leave the PCRE2 matching unchanged. Reported-by: David Gstir <david@xxxxxxxxxxxxx> Signed-off-by: René Scharfe <l.s.r@xxxxxx> --- grep.c | 30 ++++++++++++++++++++---------- t/t7810-grep.sh | 9 +++++++++ 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/grep.c b/grep.c index 701e58de04..506f0423c8 100644 --- a/grep.c +++ b/grep.c @@ -906,15 +906,17 @@ static int patmatch(struct grep_pat *p, const char *line, const char *eol, regmatch_t *match, int eflags) { - int hit; - if (p->pcre2_pattern) - hit = !pcre2match(p, line, eol, match, eflags); - else - hit = !regexec_buf(&p->regexp, line, eol - line, 1, match, - eflags); + return !pcre2match(p, line, eol, match, eflags); - return hit; + switch (regexec_buf(&p->regexp, line, eol - line, 1, match, eflags)) { + case 0: + return 1; + case REG_NOMATCH: + return 0; + default: + return -1; + } } static void strip_timestamp(const char *bol, const char **eol_p) @@ -952,6 +954,8 @@ static int headerless_match_one_pattern(struct grep_pat *p, again: hit = patmatch(p, bol, eol, pmatch, eflags); + if (hit < 0) + hit = 0; if (hit && p->word_regexp) { if ((pmatch[0].rm_so < 0) || @@ -1461,6 +1465,8 @@ static int look_ahead(struct grep_opt *opt, regmatch_t m; hit = patmatch(p, bol, bol + *left_p, &m, 0); + if (hit < 0) + return -1; if (!hit || m.rm_so < 0 || m.rm_eo < 0) continue; if (earliest < 0 || m.rm_so < earliest) @@ -1655,9 +1661,13 @@ static int grep_source_1(struct grep_opt *opt, struct grep_source *gs, int colle if (try_lookahead && !(last_hit && (show_function || - lno <= last_hit + opt->post_context)) - && look_ahead(opt, &left, &lno, &bol)) - break; + lno <= last_hit + opt->post_context))) { + hit = look_ahead(opt, &left, &lno, &bol); + if (hit < 0) + try_lookahead = 0; + else if (hit) + break; + } eol = end_of_line(bol, &left); if ((ctx == GREP_CONTEXT_HEAD) && (eol == bol)) diff --git a/t/t7810-grep.sh b/t/t7810-grep.sh index af2cf2f78a..64ac4f04ee 100755 --- a/t/t7810-grep.sh +++ b/t/t7810-grep.sh @@ -87,6 +87,7 @@ test_expect_success setup ' # Still a no-op. function dummy() {} EOF + printf "\200\nASCII\n" >invalid-utf8 && if test_have_prereq FUNNYNAMES then echo unusual >"\"unusual\" pathname" && @@ -534,6 +535,14 @@ do test_cmp expected actual ' + test_expect_success "grep $L searches past invalid lines on UTF-8 locale" ' + LC_ALL=en_US.UTF-8 git grep A. invalid-utf8 >actual && + cat >expected <<-EOF && + invalid-utf8:ASCII + EOF + test_cmp expected actual + ' + test_expect_success FUNNYNAMES "grep $L should quote unusual pathnames" ' cat >expected <<-EOF && ${HC}"\"unusual\" pathname":unusual -- 2.47.0