[RFC PATCH 3/3] grep: don't add submodules to the alternates list

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



When grepping with --recurse-submodules, the object directory of the
submodule is added to the in-memory alternates list. This makes git need
to watch out for more packfiles which might bring bad consequences for
memory and performance.

Now that raw_object_store is a member of the struct repository, it's
possible to use the submodules' instances directly, without the need to
add its odbs to the alternates list. So let's instead pass the subrepo
down to the threads and replace function calls which use
the_repository by default for their more general counterparts. Also,
submodule cleanup must be refactored as calling repo_clear() at the end
of grep_submodules(), might free the subrepo's resources before threads
have finished working on it. To do that, let's keep track of the
workers progress and only call repo_clear() when it's safe to do so.

This change will also help future patches to re-enable threads in
non-worktree grep as less mutual exclusions will be needed. E.g.
grep_submodule()'s call to parse_object_or_die() won't conflict with
other calls to the same function by worker threads (as they should be
referencing different 'struct repository's).

Signed-off-by: Matheus Tavares <matheus.bernardino@xxxxxx>
---
 builtin/grep.c | 80 ++++++++++++++++++++++++++++++++++++--------------
 grep.c         | 26 ++++++++--------
 2 files changed, 70 insertions(+), 36 deletions(-)

diff --git a/builtin/grep.c b/builtin/grep.c
index 73ef00c426..0bb58c456b 100644
--- a/builtin/grep.c
+++ b/builtin/grep.c
@@ -43,6 +43,15 @@ static pthread_t *threads;
  */
 struct work_item {
 	struct grep_source source;
+
+	/*
+	 * Each worker thread is initialized with a 'struct grep_opt *opt' where
+	 * opt->repo points to the_repository. But a work item may refeer to a
+	 * subrepo. So the repository relative to each task is also passed to
+	 * the assigned thread and its opt->repo is updated when retrieving the
+	 * task.
+	 */
+	struct repository *repo;
 	char done;
 	struct strbuf out;
 };
@@ -65,6 +74,13 @@ static int todo_done;
 /* Has all work items been added? */
 static int all_work_added;
 
+/* Tracks the work progress in subrepos to correctly free them in the end */
+struct progress_in_subrepo {
+	struct repository subrepo;
+	int all_work_added;
+	int work_counter;
+};
+
 /* This lock protects all the variables above. */
 static pthread_mutex_t grep_mutex;
 
@@ -93,6 +109,8 @@ static int skip_first_line;
 
 static void add_work(struct grep_opt *opt, const struct grep_source *gs)
 {
+	struct progress_in_subrepo *p;
+
 	grep_lock();
 
 	while ((todo_end+1) % ARRAY_SIZE(todo) == todo_done) {
@@ -104,9 +122,15 @@ static void add_work(struct grep_opt *opt, const struct grep_source *gs)
 		grep_source_load_driver(&todo[todo_end].source,
 					opt->repo->index);
 	todo[todo_end].done = 0;
+	todo[todo_end].repo = opt->repo;
 	strbuf_reset(&todo[todo_end].out);
 	todo_end = (todo_end + 1) % ARRAY_SIZE(todo);
 
+	if (opt->repo != the_repository) {
+		p = container_of(opt->repo, struct progress_in_subrepo, subrepo);
+		p->work_counter++;
+	}
+
 	pthread_cond_signal(&cond_add);
 	grep_unlock();
 }
@@ -132,6 +156,7 @@ static struct work_item *get_work(void)
 
 static void work_done(struct work_item *w)
 {
+	struct progress_in_subrepo *p;
 	int old_done;
 
 	grep_lock();
@@ -156,6 +181,17 @@ static void work_done(struct work_item *w)
 
 			write_or_die(1, p, len);
 		}
+
+		if (w->repo != the_repository) {
+			p = container_of(w->repo, struct progress_in_subrepo,
+					 subrepo);
+			p->work_counter--;
+			if (p->work_counter == 0 && p->all_work_added) {
+				repo_clear(&p->subrepo);
+				free(p);
+			}
+		}
+
 		grep_source_clear(&w->source);
 	}
 
@@ -179,6 +215,7 @@ static void *run(void *arg)
 			break;
 
 		opt->output_priv = w;
+		opt->repo = w->repo;
 		hit |= grep_source(opt, &w->source);
 		grep_source_clear_data(&w->source);
 		work_done(w);
@@ -295,12 +332,14 @@ static int grep_cmd_config(const char *var, const char *value, void *cb)
 	return st;
 }
 
-static void *lock_and_read_oid_file(const struct object_id *oid, enum object_type *type, unsigned long *size)
+static void *lock_and_read_oid_file(struct repository *r,
+				    const struct object_id *oid,
+				    enum object_type *type, unsigned long *size)
 {
 	void *data;
 
 	grep_read_lock();
-	data = read_object_file(oid, type, size);
+	data = repo_read_object_file(r, oid, type, size);
 	grep_read_unlock();
 	return data;
 }
@@ -405,7 +444,8 @@ static int grep_submodule(struct grep_opt *opt,
 			  const struct object_id *oid,
 			  const char *filename, const char *path, int cached)
 {
-	struct repository subrepo;
+	struct progress_in_subrepo *p = xcalloc(1, sizeof(*p));
+	struct repository *subrepo = &p->subrepo;
 	struct repository *superproject = opt->repo;
 	const struct submodule *sub = submodule_from_path(superproject,
 							  &null_oid, path);
@@ -422,31 +462,21 @@ static int grep_submodule(struct grep_opt *opt,
 
 	if (!is_submodule_active(superproject, path)) {
 		grep_read_unlock();
+		free(p);
 		return 0;
 	}
 
-	if (repo_submodule_init(&subrepo, superproject, sub)) {
+	if (repo_submodule_init(subrepo, superproject, sub)) {
 		grep_read_unlock();
+		free(p);
 		return 0;
 	}
 
-	repo_read_gitmodules(&subrepo);
-
-	/*
-	 * NEEDSWORK: This adds the submodule's object directory to the list of
-	 * alternates for the single in-memory object store.  This has some bad
-	 * consequences for memory (processed objects will never be freed) and
-	 * performance (this increases the number of pack files git has to pay
-	 * attention to, to the sum of the number of pack files in all the
-	 * repositories processed so far).  This can be removed once the object
-	 * store is no longer global and instead is a member of the repository
-	 * object.
-	 */
-	add_to_alternates_memory(subrepo.objects->odb->path);
+	repo_read_gitmodules(subrepo);
 	grep_read_unlock();
 
 	memcpy(&subopt, opt, sizeof(subopt));
-	subopt.repo = &subrepo;
+	subopt.repo = subrepo;
 
 	if (oid) {
 		struct object *object;
@@ -455,10 +485,10 @@ static int grep_submodule(struct grep_opt *opt,
 		unsigned long size;
 		struct strbuf base = STRBUF_INIT;
 
-		object = parse_object_or_die(the_repository, oid, NULL);
+		object = parse_object_or_die(subrepo, oid, NULL);
 
 		grep_read_lock();
-		data = read_object_with_reference(&subrepo,
+		data = read_object_with_reference(subrepo,
 						  &object->oid, tree_type,
 						  &size, NULL);
 		grep_read_unlock();
@@ -478,7 +508,13 @@ static int grep_submodule(struct grep_opt *opt,
 		hit = grep_cache(&subopt, pathspec, cached);
 	}
 
-	repo_clear(&subrepo);
+	grep_lock();
+	p->all_work_added = 1;
+	if (p->work_counter == 0) {
+		repo_clear(&p->subrepo);
+		free(p);
+	}
+	grep_unlock();
 	return hit;
 }
 
@@ -587,7 +623,7 @@ static int grep_tree(struct grep_opt *opt, const struct pathspec *pathspec,
 			void *data;
 			unsigned long size;
 
-			data = lock_and_read_oid_file(&entry.oid, &type, &size);
+			data = lock_and_read_oid_file(repo, &entry.oid, &type, &size);
 			if (!data)
 				die(_("unable to read tree (%s)"),
 				    oid_to_hex(&entry.oid));
diff --git a/grep.c b/grep.c
index cd952ef5d3..0b3f38aaae 100644
--- a/grep.c
+++ b/grep.c
@@ -10,9 +10,8 @@
 #include "quote.h"
 #include "help.h"
 
-static int grep_source_load(struct grep_source *gs);
-static int grep_source_is_binary(struct grep_source *gs,
-				 struct index_state *istate);
+static int grep_source_load(struct repository *r, struct grep_source *gs);
+static int grep_source_is_binary(struct repository *r, struct grep_source *gs);
 
 static struct grep_opt grep_defaults;
 
@@ -1714,7 +1713,7 @@ static int fill_textconv_grep(struct repository *r,
 	size_t size;
 
 	if (!driver || !driver->textconv)
-		return grep_source_load(gs);
+		return grep_source_load(r, gs);
 
 	/*
 	 * The textconv interface is intimately tied to diff_filespecs, so we
@@ -1820,11 +1819,11 @@ static int grep_source_1(struct grep_opt *opt, struct grep_source *gs, int colle
 	if (!textconv) {
 		switch (opt->binary) {
 		case GREP_BINARY_DEFAULT:
-			if (grep_source_is_binary(gs, opt->repo->index))
+			if (grep_source_is_binary(opt->repo, gs))
 				binary_match_only = 1;
 			break;
 		case GREP_BINARY_NOMATCH:
-			if (grep_source_is_binary(gs, opt->repo->index))
+			if (grep_source_is_binary(opt->repo, gs))
 				return 0; /* Assume unmatch */
 			break;
 		case GREP_BINARY_TEXT:
@@ -2105,12 +2104,12 @@ void grep_source_clear_data(struct grep_source *gs)
 	}
 }
 
-static int grep_source_load_oid(struct grep_source *gs)
+static int grep_source_load_oid(struct repository *r, struct grep_source *gs)
 {
 	enum object_type type;
 
 	grep_read_lock();
-	gs->buf = read_object_file(gs->identifier, &type, &gs->size);
+	gs->buf = repo_read_object_file(r, gs->identifier, &type, &gs->size);
 	grep_read_unlock();
 
 	if (!gs->buf)
@@ -2154,7 +2153,7 @@ static int grep_source_load_file(struct grep_source *gs)
 	return 0;
 }
 
-static int grep_source_load(struct grep_source *gs)
+static int grep_source_load(struct repository *r, struct grep_source *gs)
 {
 	if (gs->buf)
 		return 0;
@@ -2163,7 +2162,7 @@ static int grep_source_load(struct grep_source *gs)
 	case GREP_SOURCE_FILE:
 		return grep_source_load_file(gs);
 	case GREP_SOURCE_OID:
-		return grep_source_load_oid(gs);
+		return grep_source_load_oid(r, gs);
 	case GREP_SOURCE_BUF:
 		return gs->buf ? 0 : -1;
 	}
@@ -2184,14 +2183,13 @@ void grep_source_load_driver(struct grep_source *gs,
 	grep_attr_unlock();
 }
 
-static int grep_source_is_binary(struct grep_source *gs,
-				 struct index_state *istate)
+static int grep_source_is_binary(struct repository *r, struct grep_source *gs)
 {
-	grep_source_load_driver(gs, istate);
+	grep_source_load_driver(gs, r->index);
 	if (gs->driver->binary != -1)
 		return gs->driver->binary;
 
-	if (!grep_source_load(gs))
+	if (!grep_source_load(r, gs))
 		return buffer_is_binary(gs->buf, gs->size);
 
 	return 0;
-- 
2.23.0




[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]

  Powered by Linux