Re: [PATCH 7/8] fetch: fetch unpopulated, changed submodules

Junio C Hamano <gitster@xxxxxxxxx> · Thu, 10 Feb 2022 14:49:31 -0800

Glen Choo <chooglen@xxxxxxxxxx> writes:

> @@ -1273,10 +1277,6 @@ static void calculate_changed_submodule_paths(struct repository *r,
>  	struct strvec argv = STRVEC_INIT;
>  	struct string_list_item *name;
>  
> -	/* No need to check if there are no submodules configured */
> -	if (!submodule_from_path(r, NULL, NULL))
> -		return;
> -

It looks to me that this hunk reverts 18322bad (fetch: skip
on-demand checking when no submodules are configured, 2011-09-09),
which tried to avoid high cost computation when we know there is no
submodule.  Intended?  Perhaps it should be replaced with an
equivalent check that (1) still says "we do care about submodules"
even if the current checkout has no submodules (i.e. ls-files shows
no gitlinks), but (2) says "no, there is nothing interesting" when
$GIT_COMMON_DIR/modules/ is empty or some other cheap check we can
use?

> +get_fetch_task_from_index(struct submodule_parallel_fetch *spf,
> +			  const char **default_argv, struct strbuf *err)
>  {
> -	for (; spf->count < spf->r->index->cache_nr; spf->count++) {
> -		const struct cache_entry *ce = spf->r->index->cache[spf->count];
> +	for (; spf->index_count < spf->r->index->cache_nr; spf->index_count++) {
> +		const struct cache_entry *ce =
> +			spf->r->index->cache[spf->index_count];
>  		struct fetch_task *task;
>  
>  		if (!S_ISGITLINK(ce->ce_mode))
> @@ -1495,6 +1499,15 @@ get_fetch_task(struct submodule_parallel_fetch *spf,
>  		if (!task)
>  			continue;
>  
> +		/*
> +		 * We might have already considered this submodule
> +		 * because we saw it when iterating the changed
> +		 * submodule names.
> +		 */
> +		if (string_list_lookup(&spf->seen_submodule_names,
> +				       task->sub->name))
> +			continue;
> +
>  		switch (get_fetch_recurse_config(task->sub, spf))
>  		{
>  		default:
> @@ -1542,7 +1555,69 @@ get_fetch_task(struct submodule_parallel_fetch *spf,
>  			strbuf_addf(err, _("Fetching submodule %s%s\n"),
>  				    spf->prefix, ce->name);
>  
> -		spf->count++;
> +		spf->index_count++;
> +		return task;
> +	}
> +	return NULL;
> +}

Sorry, but I am confused.  If we are gathering which submodules to
fetch from the changes to gitlinks in the range of superproject
changes, why do we even need to scan the index (i.e. the current
checkout in the superproject) to begin with?  If it was changed,
we'd know get_fetch_task_from_changed() would take care of it, and
if there was no change to the submodule between the superproject's
commits before and after the fetch, there is nothing gained from
fetching in the submodules, no?

> +static struct fetch_task *
> +get_fetch_task_from_changed(struct submodule_parallel_fetch *spf,
> +			    const char **default_argv, struct strbuf *err)
> +{

> @@ -1553,7 +1628,10 @@ static int get_next_submodule(struct child_process *cp, struct strbuf *err,
>  {
>  	struct submodule_parallel_fetch *spf = data;
>  	const char *default_argv = NULL;
> -	struct fetch_task *task = get_fetch_task(spf, &default_argv, err);
> +	struct fetch_task *task =
> +		get_fetch_task_from_index(spf, &default_argv, err);
> +	if (!task)
> +		task = get_fetch_task_from_changed(spf, &default_argv, err);

Hmph, intersting.  So if "from index" grabbed some submodules
already, then the "from the changes in the superproject, we know
these submodules need refreshing" is not happen at all?  I am afraid
that I am still not following this...