[PATCH v3 09/13] exclude: filter out patterns not applicable to the current directory

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



.gitignore files are spread over directories (*) so that when we check
for ignored files at foo, we are not bothered by foo/bar/.gitignore,
which contains ignore rules for foo/bar only.

This is not enough. foo/.gitignore can contain the pattern
"foo/bar/*.c". When we stay at foo, we know that the pattern cannot
match anything. Similarly, the pattern "/autom4te.cache" at root
directory cannot match anything in foo. This patch attempts to filter
out such patterns to drive down matching cost.

The algorithm implemented here is a naive one. Patterns can be either
active or passive:

 - When we enter a new directory (e.g. from root to foo), currently
   active patterns may no longer be applicable and can be turned to
   passive.

 - Conversely, when we leave a directory (foo back to root),
   passive patterns may come alive again.

We could do smarter things. But this implementation cuts a big portion
of cost already (and solves the "root .gitignore is evil" problem).
There's probably no need to be smart.

(*) this design forces us to try to find .gitignore in every
directory. On webkit.git that amounts to 6k open syscalls. It feels
like ".svn in every directory" again. I suggest we add
~/.gitignore.master, containing the list of .gitignore files in the
worktree. If this file exists, we don't poke at every directory for
.gitignore.

treat_leading_path:   0.000  0.000
read_directory:       3.455  2.879
+treat_one_path:      2.203  1.620
++is_excluded:        2.000  1.416
+++prep_exclude:      0.171  0.198
+++matching:          1.509  0.904
++dir_exist:          0.036  0.035
++index_name_exists:  0.292  0.289
lazy_init_name_hash:  0.257  0.257
+simplify_away:       0.084  0.085
+dir_add_name:        0.446  0.446

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@xxxxxxxxx>
---
 dir.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 dir.h |  1 +
 2 files changed, 92 insertions(+), 2 deletions(-)

diff --git a/dir.c b/dir.c
index 932fd2f..c57bf06 100644
--- a/dir.c
+++ b/dir.c
@@ -458,7 +458,7 @@ void add_exclude(const char *string, const char *base,
 	x->base = base;
 	x->baselen = baselen;
 	x->pattern_baselen = pattern_baselen;
-	x->flags = flags;
+	x->flags = flags | EXC_FLAG_ACTIVE;
 	x->srcpos = srcpos;
 	ALLOC_GROW(el->excludes, el->nr + 1, el->alloc);
 	el->excludes[el->nr++] = x;
@@ -591,6 +591,87 @@ void add_excludes_from_file(struct dir_struct *dir, const char *fname)
 		die("cannot use %s as an exclude file", fname);
 }
 
+static int pattern_match_base(struct dir_struct *dir,
+			      const char *base, int baselen,
+			      const struct exclude *exc)
+{
+	const char *pattern;
+
+	/*
+	 * TODO: if patterns come from a .gitignore, exc->base would
+	 * be the same for all of them. We could compare once and
+	 * reuse the result, instead of performing the comparison per
+	 * pattern like this.
+	 */
+	if (exc->baselen) {
+		if (baselen < exc->baselen + 1)
+			return 0;
+
+		if (base[exc->baselen] != '/' ||
+		    memcmp(base, exc->base, exc->baselen))
+			return 0;
+
+		base += exc->baselen + 1;
+		baselen -= exc->baselen + 1;
+	}
+
+	if (baselen != exc->pattern_baselen)
+		return 0;
+
+	if (exc->pattern_baselen) {
+		pattern = exc->pattern;
+		if (*pattern == '/')
+			pattern++;
+		if (memcmp(base, pattern, exc->pattern_baselen))
+			return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * If pushed is non-zero, we have entered a new directory. Some
+ * pathname patterns may no longer be applicable. Go over all active
+ * patterns and disable them if so.
+ *
+ * If popped is non-zero, we have left a directory. Inactive patterns
+ * may be applicable again. Go over them and re-enable if so.
+ */
+static void scan_patterns(struct dir_struct *dir,
+			  const char *base, int baselen,
+			  int pushed, int popped)
+{
+	int i, j, k;
+
+	for (i = EXC_CMDL; i <= EXC_FILE; i++) {
+		struct exclude_list_group *group = &dir->exclude_list_group[i];
+		for (j = group->nr - 1; j >= 0; j--) {
+			struct exclude_list *list = &group->el[j];
+			for (k = 0; k < list->nr; k++) {
+				struct exclude *exc = list->excludes[k];
+
+				/*
+				 * No base (i.e. EXC_FLAG_NODIR) or
+				 * applicable to many bases ("**"
+				 * patterns)
+				 */
+				if (exc->pattern_baselen == -1)
+					continue;
+
+				if (exc->flags & EXC_FLAG_ACTIVE) {
+					if (pushed &&
+					    !pattern_match_base(dir, base, baselen, exc))
+						exc->flags &= ~EXC_FLAG_ACTIVE;
+				} else {
+					if (popped &&
+					    pattern_match_base(dir, base, baselen, exc))
+						exc->flags |= EXC_FLAG_ACTIVE;
+				}
+			}
+		}
+	}
+}
+
 /*
  * Loads the per-directory exclude list for the substring of base
  * which has a char length of baselen.
@@ -600,7 +681,7 @@ static void prep_exclude(struct dir_struct *dir, const char *base, int baselen)
 	struct exclude_list_group *group;
 	struct exclude_list *el;
 	struct exclude_stack *stk = NULL;
-	int current;
+	int current, popped = 0, pushed = 0;
 
 	if ((!dir->exclude_per_dir) ||
 	    (baselen + strlen(dir->exclude_per_dir) >= PATH_MAX))
@@ -621,6 +702,7 @@ static void prep_exclude(struct dir_struct *dir, const char *base, int baselen)
 		clear_exclude_list(el);
 		free(stk);
 		group->nr--;
+		popped++;
 	}
 
 	/* Read from the parent directories and push them down. */
@@ -659,8 +741,12 @@ static void prep_exclude(struct dir_struct *dir, const char *base, int baselen)
 					       el, 1);
 		dir->exclude_stack = stk;
 		current = stk->baselen;
+		pushed++;
 	}
 	dir->basebuf[baselen] = '\0';
+
+	if (pushed | popped)
+		scan_patterns(dir, base, baselen, pushed, popped);
 }
 
 int match_basename(const char *basename, int basenamelen,
@@ -755,6 +841,9 @@ static struct exclude *last_exclude_matching_from_list(const char *pathname,
 		const char *exclude = x->pattern;
 		int prefix = x->nowildcardlen;
 
+		if (!(x->flags & EXC_FLAG_ACTIVE))
+			continue;
+
 		if (x->flags & EXC_FLAG_MUSTBEDIR) {
 			if (*dtype == DT_UNKNOWN)
 				*dtype = get_dtype(NULL, pathname, pathlen);
diff --git a/dir.h b/dir.h
index cb50a85..247bfda 100644
--- a/dir.h
+++ b/dir.h
@@ -14,6 +14,7 @@ struct dir_entry {
 #define EXC_FLAG_ENDSWITH 4
 #define EXC_FLAG_MUSTBEDIR 8
 #define EXC_FLAG_NEGATIVE 16
+#define EXC_FLAG_ACTIVE 32
 
 /*
  * Each excludes file will be parsed into a fresh exclude_list which
-- 
1.8.1.2.536.gf441e6d

--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]