[PATCH v2] read_directory: avoid invoking exclude machinery on tracked files

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



read_directory() (and its friendly wrapper fill_directory) collects
untracked/ignored files by traversing through the whole worktree,
feeding every entry to treat_one_path(), where each entry is checked
against .gitignore patterns.

One may see that tracked files can't be excluded and we do not need to
run them through exclude machinery. On repos where there are many
.gitignore patterns and/or a lot of tracked files, this unnecessary
processing can become expensive.

This patch avoids it mostly for normal cases. Directories are still
processed as before. DIR_SHOW_IGNORED and DIR_COLLECT_IGNORED are not
normally used unless some options are given (e.g. "checkout
--overwrite-ignore", "add -f"...)

treat_one_path's behavior changes when taking this shortcut. With
current code, when a non-directory path is not excluded,
treat_one_path calls treat_file, which returns the initial value of
exclude_file and causes treat_one_path to return path_handled. With
this patch, on the same conditions, treat_one_path returns
path_ignored.

read_directory_recursive() cares about this difference. Check out the
snippet:

	while (...) {
		switch (treat_path(...)) {
		case path_ignored:
			continue;
		case path_handled:
			break;
		}
		contents++;
		if (check_only)
			break;
		dir_add_name(dir, path.buf, path.len);
	}

If path_handled is returned, contents goes up. And if check_only is
true, the loop could be broken early. These will not happen when
treat_one_path (and its wrapper treat_path) returns
path_ignored. dir_add_name internally does a cache_name_exists() check
so it makes no difference.

To avoid this behavior change, treat_one_path is instructed to skip
the optimization when check_only or contents is used.

Finally some numbers (best of 20 runs) that shows why it's worth all
the hassle:

git status   | webkit linux-2.6 libreoffice-core gentoo-x86
-------------+----------------------------------------------
before       | 1.097s    0.208s           0.399s     0.539s
after        | 0.736s    0.159s           0.248s     0.501s
nr. patterns |    89       376               19          0
nr. tracked  |   182k       40k              63k       101k

Tracked-down-by: Karsten Blees <karsten.blees@xxxxxxxxx>
Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@xxxxxxxxx>
---
 Instead of relying on the surrounding code happening to not trigger
 the behavior change in treat_one_path, this round ensures such
 triggers will disable the optimization and fall back to normal code
 path.

 There are no big differences in measured numbers, which indicate
 incorrect triggers do not happen, at least in my tests.

 dir.c | 79 ++++++++++++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 52 insertions(+), 27 deletions(-)

diff --git a/dir.c b/dir.c
index 57394e4..a5fe0a0 100644
--- a/dir.c
+++ b/dir.c
@@ -17,8 +17,11 @@ struct path_simplify {
 	const char *path;
 };
 
-static int read_directory_recursive(struct dir_struct *dir, const char *path, int len,
-	int check_only, const struct path_simplify *simplify);
+static void read_directory_recursive(struct dir_struct *dir,
+				     const char *path, int len,
+				     int check_only,
+				     const struct path_simplify *simplify,
+				     int *contents);
 static int get_dtype(struct dirent *de, const char *path, int len);
 
 /* helper string functions with support for the ignore_case flag */
@@ -1034,6 +1037,7 @@ static enum directory_treatment treat_directory(struct dir_struct *dir,
 	const char *dirname, int len, int exclude,
 	const struct path_simplify *simplify)
 {
+	int contents = 0;
 	/* The "len-1" is to strip the final '/' */
 	switch (directory_exists_in_index(dirname, len-1)) {
 	case index_directory:
@@ -1065,19 +1069,19 @@ static enum directory_treatment treat_directory(struct dir_struct *dir,
 	 * check if it contains only ignored files
 	 */
 	if ((dir->flags & DIR_SHOW_IGNORED) && !exclude) {
-		int ignored;
 		dir->flags &= ~DIR_SHOW_IGNORED;
 		dir->flags |= DIR_HIDE_EMPTY_DIRECTORIES;
-		ignored = read_directory_recursive(dir, dirname, len, 1, simplify);
+		read_directory_recursive(dir, dirname, len, 1, simplify, &contents);
 		dir->flags &= ~DIR_HIDE_EMPTY_DIRECTORIES;
 		dir->flags |= DIR_SHOW_IGNORED;
 
-		return ignored ? ignore_directory : show_directory;
+		return contents ? ignore_directory : show_directory;
 	}
 	if (!(dir->flags & DIR_SHOW_IGNORED) &&
 	    !(dir->flags & DIR_HIDE_EMPTY_DIRECTORIES))
 		return show_directory;
-	if (!read_directory_recursive(dir, dirname, len, 1, simplify))
+	read_directory_recursive(dir, dirname, len, 1, simplify, &contents);
+	if (!contents)
 		return ignore_directory;
 	return show_directory;
 }
@@ -1242,9 +1246,23 @@ enum path_treatment {
 static enum path_treatment treat_one_path(struct dir_struct *dir,
 					  struct strbuf *path,
 					  const struct path_simplify *simplify,
-					  int dtype, struct dirent *de)
+					  int dtype, struct dirent *de,
+					  int exclude_shortcut_ok)
 {
-	int exclude = is_excluded(dir, path->buf, &dtype);
+	int exclude;
+
+	if (dtype == DT_UNKNOWN)
+		dtype = get_dtype(de, path->buf, path->len);
+
+	if (exclude_shortcut_ok &&
+	    !(dir->flags & DIR_SHOW_IGNORED) &&
+	    !(dir->flags & DIR_COLLECT_IGNORED) &&
+	    dtype != DT_DIR &&
+	    cache_name_exists(path->buf, path->len, ignore_case))
+		return path_ignored;
+
+	exclude = is_excluded(dir, path->buf, &dtype);
+
 	if (exclude && (dir->flags & DIR_COLLECT_IGNORED)
 	    && exclude_matches_pathspec(path->buf, path->len, simplify))
 		dir_add_ignored(dir, path->buf, path->len);
@@ -1256,9 +1274,6 @@ static enum path_treatment treat_one_path(struct dir_struct *dir,
 	if (exclude && !(dir->flags & DIR_SHOW_IGNORED))
 		return path_ignored;
 
-	if (dtype == DT_UNKNOWN)
-		dtype = get_dtype(de, path->buf, path->len);
-
 	switch (dtype) {
 	default:
 		return path_ignored;
@@ -1290,7 +1305,8 @@ static enum path_treatment treat_path(struct dir_struct *dir,
 				      struct dirent *de,
 				      struct strbuf *path,
 				      int baselen,
-				      const struct path_simplify *simplify)
+				      const struct path_simplify *simplify,
+				      int exclude_shortcut_ok)
 {
 	int dtype;
 
@@ -1302,7 +1318,7 @@ static enum path_treatment treat_path(struct dir_struct *dir,
 		return path_ignored;
 
 	dtype = DTYPE(de);
-	return treat_one_path(dir, path, simplify, dtype, de);
+	return treat_one_path(dir, path, simplify, dtype, de, exclude_shortcut_ok);
 }
 
 /*
@@ -1314,13 +1330,13 @@ static enum path_treatment treat_path(struct dir_struct *dir,
  * Also, we ignore the name ".git" (even if it is not a directory).
  * That likely will not change.
  */
-static int read_directory_recursive(struct dir_struct *dir,
-				    const char *base, int baselen,
-				    int check_only,
-				    const struct path_simplify *simplify)
+static void read_directory_recursive(struct dir_struct *dir,
+				     const char *base, int baselen,
+				     int check_only,
+				     const struct path_simplify *simplify,
+				     int *contents)
 {
 	DIR *fdir;
-	int contents = 0;
 	struct dirent *de;
 	struct strbuf path = STRBUF_INIT;
 
@@ -1331,18 +1347,29 @@ static int read_directory_recursive(struct dir_struct *dir,
 		goto out;
 
 	while ((de = readdir(fdir)) != NULL) {
-		switch (treat_path(dir, de, &path, baselen, simplify)) {
+		switch (treat_path(dir, de, &path, baselen,
+				   simplify,
+				   !check_only && !contents)) {
 		case path_recurse:
-			contents += read_directory_recursive(dir, path.buf,
-							     path.len, 0,
-							     simplify);
+			read_directory_recursive(dir, path.buf,
+						 path.len, 0,
+						 simplify,
+						 contents);
 			continue;
 		case path_ignored:
 			continue;
 		case path_handled:
 			break;
 		}
-		contents++;
+		/*
+		 * Update the last argument to treat_path if anything
+		 * else is done after this point. This is because if
+		 * treat_path's exclude_shortcut_ok is true, it may
+		 * incorrectly return path_ignored (and never reaches
+		 * this part) instead of path_handled.
+		 */
+		if (contents)
+			(*contents)++;
 		if (check_only)
 			break;
 		dir_add_name(dir, path.buf, path.len);
@@ -1350,8 +1377,6 @@ static int read_directory_recursive(struct dir_struct *dir,
 	closedir(fdir);
  out:
 	strbuf_release(&path);
-
-	return contents;
 }
 
 static int cmp_name(const void *p1, const void *p2)
@@ -1420,7 +1445,7 @@ static int treat_leading_path(struct dir_struct *dir,
 		if (simplify_away(sb.buf, sb.len, simplify))
 			break;
 		if (treat_one_path(dir, &sb, simplify,
-				   DT_DIR, NULL) == path_ignored)
+				   DT_DIR, NULL, 0) == path_ignored)
 			break; /* do not recurse into it */
 		if (len <= baselen) {
 			rc = 1;
@@ -1440,7 +1465,7 @@ int read_directory(struct dir_struct *dir, const char *path, int len, const char
 
 	simplify = create_simplify(pathspec);
 	if (!len || treat_leading_path(dir, path, len, simplify))
-		read_directory_recursive(dir, path, len, 0, simplify);
+		read_directory_recursive(dir, path, len, 0, simplify, NULL);
 	free_simplify(simplify);
 	qsort(dir->entries, dir->nr, sizeof(struct dir_entry *), cmp_name);
 	qsort(dir->ignored, dir->ignored_nr, sizeof(struct dir_entry *), cmp_name);
-- 
1.8.1.2.536.gf441e6d

--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]