Re: [PATCH] grep: do not do external grep on skip-worktree entries

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> writes:

> It doesn't matter. Since we do the line-by-line thing, the input is always 
> so short that DFA vs NFA vs BM vs other-clever-search doesn't matter. 
> There is no scaling - the grep buffer tends to be too small for the 
> algorithm to matter.
>
> And the reason we do things line-by-line is that we need to then output 
> things line-per-line.

Here is an experimental patch; first, some numbers (hot cache best of 5 runs).

(baseline -- builtin grep)

    $ time git grep --no-ext-grep qwerty
    drivers/char/keyboard.c:        "qwertyuiop[]\r\000as"...

    real    0m1.521s
    user    0m1.260s
    sys     0m0.256s

    (baseline -- external grep)

    $ time git grep --ext-grep qwerty
    drivers/char/keyboard.c:        "qwertyuiop[]\r\000as"...

    real    0m0.773s
    user    0m0.416s
    sys     0m0.376s

    (with experimental patch -- builtin grep)

    $ time ../git.git/git grep --no-ext-grep qwerty
    drivers/char/keyboard.c:        "qwertyuiop[]\r\000as"...

    real    0m0.692s
    user    0m0.440s
    sys     0m0.252s

The idea is to see the earliest location in the entire remainder of the
buffer the pattern(s) we are looking for first appears, and skip all the
lines before that point.  For this optimization to be valid, we should not
be looking for anything complex (e.g. "find lines that have both X and Y
but not Z" is out).  We cannot use the optimization when "-v" is given,
either, because then we need to find the first line that does _not_ match
given set of patterns.

---

 grep.c |   64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 64 insertions(+), 0 deletions(-)

diff --git a/grep.c b/grep.c
index bdadf2c..940e200 100644
--- a/grep.c
+++ b/grep.c
@@ -615,6 +615,65 @@ static void show_pre_context(struct grep_opt *opt, const char *name, char *buf,
 	}
 }
 
+static int should_lookahead(struct grep_opt *opt)
+{
+	struct grep_pat *p;
+
+	if (opt->extended)
+		return 0; /* punt for too complex stuff */
+	if (opt->invert || opt->unmatch_name_only)
+		return 0;
+	for (p = opt->pattern_list; p; p = p->next) {
+		if (p->token != GREP_PATTERN)
+			return 0; /* punt for "header only" and stuff */
+	}
+	return 1;
+}
+
+static int look_ahead(struct grep_opt *opt,
+		      unsigned long *left_p,
+		      unsigned *lno_p,
+		      char **bol_p)
+{
+	unsigned lno = *lno_p;
+	char *bol = *bol_p;
+	struct grep_pat *p;
+	char *sp, *last_bol;
+	regoff_t earliest = -1;
+
+	for (p = opt->pattern_list; p; p = p->next) {
+		int hit;
+		regmatch_t m;
+
+		if (p->fixed)
+			hit = !fixmatch(p->pattern, bol, p->ignore_case, &m);
+		else
+			hit = !regexec(&p->regexp, bol, 1, &m, 0);
+		if (!hit || m.rm_so < 0 || m.rm_eo < 0)
+			continue;
+		if (earliest < 0 || m.rm_so < earliest)
+			earliest = m.rm_so;
+	}
+
+	if (earliest < 0) {
+		*bol_p = bol + *left_p;
+		*left_p = 0;
+		return 1;
+	}
+	for (sp = bol + earliest; bol < sp && sp[-1] != '\n'; sp--)
+		; /* find the beginning of the line */
+	last_bol = sp;
+
+	for (sp = bol; sp < last_bol; sp++) {
+		if (*sp == '\n')
+			lno++;
+	}
+	*left_p -= last_bol - bol;
+	*bol_p = last_bol;
+	*lno_p = lno;
+	return 0;
+}
+
 static int grep_buffer_1(struct grep_opt *opt, const char *name,
 			 char *buf, unsigned long size, int collect_hits)
 {
@@ -624,6 +683,7 @@ static int grep_buffer_1(struct grep_opt *opt, const char *name,
 	unsigned last_hit = 0;
 	int binary_match_only = 0;
 	unsigned count = 0;
+	int try_lookahead = 0;
 	enum grep_context ctx = GREP_CONTEXT_HEAD;
 	xdemitconf_t xecfg;
 
@@ -652,11 +712,15 @@ static int grep_buffer_1(struct grep_opt *opt, const char *name,
 			opt->priv = &xecfg;
 		}
 	}
+	try_lookahead = should_lookahead(opt);
 
 	while (left) {
 		char *eol, ch;
 		int hit;
 
+		if (try_lookahead
+		    && look_ahead(opt, &left, &lno, &bol))
+			break;
 		eol = end_of_line(bol, &left);
 		ch = *eol;
 		*eol = 0;
--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]