Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> writes: > It doesn't matter. Since we do the line-by-line thing, the input is always > so short that DFA vs NFA vs BM vs other-clever-search doesn't matter. > There is no scaling - the grep buffer tends to be too small for the > algorithm to matter. > > And the reason we do things line-by-line is that we need to then output > things line-per-line. Here is an experimental patch; first, some numbers (hot cache best of 5 runs). (baseline -- builtin grep) $ time git grep --no-ext-grep qwerty drivers/char/keyboard.c: "qwertyuiop[]\r\000as"... real 0m1.521s user 0m1.260s sys 0m0.256s (baseline -- external grep) $ time git grep --ext-grep qwerty drivers/char/keyboard.c: "qwertyuiop[]\r\000as"... real 0m0.773s user 0m0.416s sys 0m0.376s (with experimental patch -- builtin grep) $ time ../git.git/git grep --no-ext-grep qwerty drivers/char/keyboard.c: "qwertyuiop[]\r\000as"... real 0m0.692s user 0m0.440s sys 0m0.252s The idea is to see the earliest location in the entire remainder of the buffer the pattern(s) we are looking for first appears, and skip all the lines before that point. For this optimization to be valid, we should not be looking for anything complex (e.g. "find lines that have both X and Y but not Z" is out). We cannot use the optimization when "-v" is given, either, because then we need to find the first line that does _not_ match given set of patterns. --- grep.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 files changed, 64 insertions(+), 0 deletions(-) diff --git a/grep.c b/grep.c index bdadf2c..940e200 100644 --- a/grep.c +++ b/grep.c @@ -615,6 +615,65 @@ static void show_pre_context(struct grep_opt *opt, const char *name, char *buf, } } +static int should_lookahead(struct grep_opt *opt) +{ + struct grep_pat *p; + + if (opt->extended) + return 0; /* punt for too complex stuff */ + if (opt->invert || opt->unmatch_name_only) + return 0; + for (p = opt->pattern_list; p; p = p->next) { + if (p->token != GREP_PATTERN) + return 0; /* punt for "header only" and stuff */ + } + return 1; +} + +static int look_ahead(struct grep_opt *opt, + unsigned long *left_p, + unsigned *lno_p, + char **bol_p) +{ + unsigned lno = *lno_p; + char *bol = *bol_p; + struct grep_pat *p; + char *sp, *last_bol; + regoff_t earliest = -1; + + for (p = opt->pattern_list; p; p = p->next) { + int hit; + regmatch_t m; + + if (p->fixed) + hit = !fixmatch(p->pattern, bol, p->ignore_case, &m); + else + hit = !regexec(&p->regexp, bol, 1, &m, 0); + if (!hit || m.rm_so < 0 || m.rm_eo < 0) + continue; + if (earliest < 0 || m.rm_so < earliest) + earliest = m.rm_so; + } + + if (earliest < 0) { + *bol_p = bol + *left_p; + *left_p = 0; + return 1; + } + for (sp = bol + earliest; bol < sp && sp[-1] != '\n'; sp--) + ; /* find the beginning of the line */ + last_bol = sp; + + for (sp = bol; sp < last_bol; sp++) { + if (*sp == '\n') + lno++; + } + *left_p -= last_bol - bol; + *bol_p = last_bol; + *lno_p = lno; + return 0; +} + static int grep_buffer_1(struct grep_opt *opt, const char *name, char *buf, unsigned long size, int collect_hits) { @@ -624,6 +683,7 @@ static int grep_buffer_1(struct grep_opt *opt, const char *name, unsigned last_hit = 0; int binary_match_only = 0; unsigned count = 0; + int try_lookahead = 0; enum grep_context ctx = GREP_CONTEXT_HEAD; xdemitconf_t xecfg; @@ -652,11 +712,15 @@ static int grep_buffer_1(struct grep_opt *opt, const char *name, opt->priv = &xecfg; } } + try_lookahead = should_lookahead(opt); while (left) { char *eol, ch; int hit; + if (try_lookahead + && look_ahead(opt, &left, &lno, &bol)) + break; eol = end_of_line(bol, &left); ch = *eol; *eol = 0; -- To unsubscribe from this list: send the line "unsubscribe git" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html