[PATCH v3 10/13] read_directory: avoid invoking exclude machinery on tracked files

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



read_directory() (and its friendly wrapper fill_directory) collects
untracked/ignored files by traversing through the whole worktree,
feeding every entry to treat_one_path(), where each entry is checked
against .gitignore patterns.

Tracked files can never be excluded, so there is no need to run them
through the exclude machinery. On repos where there are many
.gitignore patterns and/or a lot of tracked files, this unnecessary
processing can become expensive.

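This patch avoids it in the common case: a non-directory path that is
found in the index is skipped before is_excluded() is called.
Directories are still processed as before, and the shortcut is not
taken when DIR_SHOW_IGNORED or DIR_COLLECT_IGNORED is set. Those flags
are not normally used unless some options are given (e.g. "checkout
--overwrite-ignore", "add -f"...).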

Taking this shortcut changes treat_one_path's behavior. With the
current code, when a tracked non-directory path is not excluded,
treat_one_path calls treat_file, which returns the initial value of
exclude_file and causes treat_one_path to return path_handled. With
this patch, under the same conditions, treat_one_path returns
path_ignored.

read_directory_recursive() cares about this difference. Check out the
snippet:

	while (...) {
		switch (treat_path(...)) {
		case path_ignored:
			continue;
		case path_handled:
			break;
		}
		contents++;
		if (check_only)
			break;
		dir_add_name(dir, path.buf, path.len);
	}

If path_handled is returned, contents goes up. And if check_only is
true, the loop could be broken early. Neither of these will happen
when treat_one_path (and its wrapper treat_path) returns
path_ignored. Skipping the dir_add_name call, on the other hand, makes
no difference, because dir_add_name internally does a
cache_name_exists() check itself.

To avoid this behavior change, treat_one_path is instructed to skip
the optimization when check_only or contents is used.

Finally, some numbers (best of 20 runs) that show why it's worth all
the hassle:

git status   | webkit linux-2.6 libreoffice-core gentoo-x86
-------------+----------------------------------------------
before       | 1.097s    0.208s           0.399s     0.539s
after        | 0.736s    0.159s           0.248s     0.501s
nr. patterns |    89       376               19          0
nr. tracked  |   182k       40k              63k       101k

treat_leading_path:   0.000  0.000
read_directory:       2.879  1.299
+treat_one_path:      1.620  0.599
++is_excluded:        1.416  0.103
+++prep_exclude:      0.198  0.040
+++matching:          0.904  0.036
++dir_exist:          0.035  0.036
++index_name_exists:  0.289  0.291
lazy_init_name_hash:  0.257  0.257
+simplify_away:       0.085  0.082
+dir_add_name:        0.446  0.000

Tracked-down-by: Karsten Blees <karsten.blees@xxxxxxxxx>
Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@xxxxxxxxx>
Signed-off-by: Junio C Hamano <gitster@xxxxxxxxx>

---
 dir.c | 80 ++++++++++++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 53 insertions(+), 27 deletions(-)

diff --git a/dir.c b/dir.c
index c57bf06..6809dd2 100644
--- a/dir.c
+++ b/dir.c
@@ -43,8 +43,11 @@ struct path_simplify {
 	const char *path;
 };
 
-static int read_directory_recursive(struct dir_struct *dir, const char *path, int len,
-	int check_only, const struct path_simplify *simplify);
+static void read_directory_recursive(struct dir_struct *dir,
+				     const char *path, int len,
+				     int check_only,
+				     const struct path_simplify *simplify,
+				     int *contents);
 static int get_dtype(struct dirent *de, const char *path, int len);
 
 static inline int memequal_icase(const char *a, const char *b, int n)
@@ -1184,7 +1187,7 @@ static enum directory_treatment treat_directory(struct dir_struct *dir,
 	const char *dirname, int len, int exclude,
 	const struct path_simplify *simplify)
 {
-	int ret;
+	int contents = 0, ret;
 	START_CLOCK();
 	/* The "len-1" is to strip the final '/' */
 	ret = directory_exists_in_index(dirname, len-1);
@@ -1219,19 +1222,19 @@ static enum directory_treatment treat_directory(struct dir_struct *dir,
 	 * check if it contains only ignored files
 	 */
 	if ((dir->flags & DIR_SHOW_IGNORED) && !exclude) {
-		int ignored;
 		dir->flags &= ~DIR_SHOW_IGNORED;
 		dir->flags |= DIR_HIDE_EMPTY_DIRECTORIES;
-		ignored = read_directory_recursive(dir, dirname, len, 1, simplify);
+		read_directory_recursive(dir, dirname, len, 1, simplify, &contents);
 		dir->flags &= ~DIR_HIDE_EMPTY_DIRECTORIES;
 		dir->flags |= DIR_SHOW_IGNORED;
 
-		return ignored ? ignore_directory : show_directory;
+		return contents ? ignore_directory : show_directory;
 	}
 	if (!(dir->flags & DIR_SHOW_IGNORED) &&
 	    !(dir->flags & DIR_HIDE_EMPTY_DIRECTORIES))
 		return show_directory;
-	if (!read_directory_recursive(dir, dirname, len, 1, simplify))
+	read_directory_recursive(dir, dirname, len, 1, simplify, &contents);
+	if (!contents)
 		return ignore_directory;
 	return show_directory;
 }
@@ -1398,10 +1401,26 @@ enum path_treatment {
 static enum path_treatment treat_one_path(struct dir_struct *dir,
 					  struct strbuf *path,
 					  const struct path_simplify *simplify,
-					  int dtype, struct dirent *de)
+					  int dtype, struct dirent *de,
+					  int exclude_shortcut_ok)
 {
 	int exclude;
 
+	if (dtype == DT_UNKNOWN)
+		dtype = get_dtype(de, path->buf, path->len);
+
+	if (exclude_shortcut_ok &&
+	    !(dir->flags & DIR_SHOW_IGNORED) &&
+	    !(dir->flags & DIR_COLLECT_IGNORED) &&
+	    dtype != DT_DIR) {
+		struct cache_entry *ce;
+		START_CLOCK();
+		ce = cache_name_exists(path->buf, path->len, ignore_case);
+		STOP_CLOCK(tv_index_name_exists);
+		if (ce)
+			return path_ignored;
+	}
+
 	START_CLOCK();
 	exclude = is_excluded(dir, path->buf, path->len, &dtype);
 	STOP_CLOCK(tv_is_excluded);
@@ -1417,9 +1436,6 @@ static enum path_treatment treat_one_path(struct dir_struct *dir,
 	if (exclude && !(dir->flags & DIR_SHOW_IGNORED))
 		return path_ignored;
 
-	if (dtype == DT_UNKNOWN)
-		dtype = get_dtype(de, path->buf, path->len);
-
 	switch (dtype) {
 	default:
 		return path_ignored;
@@ -1451,7 +1467,8 @@ static enum path_treatment treat_path(struct dir_struct *dir,
 				      struct dirent *de,
 				      struct strbuf *path,
 				      int baselen,
-				      const struct path_simplify *simplify)
+				      const struct path_simplify *simplify,
+				      int exclude_shortcut_ok)
 {
 	int dtype, ret;
 
@@ -1467,7 +1484,7 @@ static enum path_treatment treat_path(struct dir_struct *dir,
 
 	dtype = DTYPE(de);
 	START_CLOCK();
-	ret = treat_one_path(dir, path, simplify, dtype, de);
+	ret = treat_one_path(dir, path, simplify, dtype, de, exclude_shortcut_ok);
 	STOP_CLOCK(tv_treat_one_path);
 	return ret;
 }
@@ -1481,13 +1498,13 @@ static enum path_treatment treat_path(struct dir_struct *dir,
  * Also, we ignore the name ".git" (even if it is not a directory).
  * That likely will not change.
  */
-static int read_directory_recursive(struct dir_struct *dir,
-				    const char *base, int baselen,
-				    int check_only,
-				    const struct path_simplify *simplify)
+static void read_directory_recursive(struct dir_struct *dir,
+				     const char *base, int baselen,
+				     int check_only,
+				     const struct path_simplify *simplify,
+				     int *contents)
 {
 	DIR *fdir;
-	int contents = 0;
 	struct dirent *de;
 	struct strbuf path = STRBUF_INIT;
 
@@ -1499,18 +1516,29 @@ static int read_directory_recursive(struct dir_struct *dir,
 
 	dir->exclude_prepared = 0;
 	while ((de = readdir(fdir)) != NULL) {
-		switch (treat_path(dir, de, &path, baselen, simplify)) {
+		switch (treat_path(dir, de, &path, baselen,
+				   simplify,
+				   !check_only && !contents)) {
 		case path_recurse:
-			contents += read_directory_recursive(dir, path.buf,
-							     path.len, 0,
-							     simplify);
+			read_directory_recursive(dir, path.buf,
+						 path.len, 0,
+						 simplify,
+						 contents);
 			continue;
 		case path_ignored:
 			continue;
 		case path_handled:
 			break;
 		}
-		contents++;
+		/*
+		 * Update the last argument to treat_path if anything
+		 * else is done after this point. This is because if
+		 * treat_path's exclude_shortcut_ok is true, it may
+		 * incorrectly return path_ignored (and never reaches
+		 * this part) instead of path_handled.
+		 */
+		if (contents)
+			(*contents)++;
 		if (check_only)
 			break;
 		START_CLOCK();
@@ -1521,8 +1549,6 @@ static int read_directory_recursive(struct dir_struct *dir,
  out:
 	dir->exclude_prepared = 0;
 	strbuf_release(&path);
-
-	return contents;
 }
 
 static int cmp_name(const void *p1, const void *p2)
@@ -1593,7 +1619,7 @@ static int treat_leading_path(struct dir_struct *dir,
 			break;
 		dir->exclude_prepared = 0;
 		if (treat_one_path(dir, &sb, simplify,
-				   DT_DIR, NULL) == path_ignored)
+				   DT_DIR, NULL, 0) == path_ignored)
 			break; /* do not recurse into it */
 		if (len <= baselen) {
 			rc = 1;
@@ -1621,7 +1647,7 @@ int read_directory(struct dir_struct *dir, const char *path, int len, const char
 		STOP_CLOCK(tv_lazy_init_name_hash);
 #endif
 		START_CLOCK();
-		read_directory_recursive(dir, path, len, 0, simplify);
+		read_directory_recursive(dir, path, len, 0, simplify, NULL);
 		STOP_CLOCK(tv_read_directory);
 	}
 #ifdef MEASURE_EXCLUDE
-- 
1.8.1.2.536.gf441e6d

--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Announce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]