Thanks Jonathan for your thoughtful comments, here is the product of the discussion: * I split up the patch to fetch in the worktree to be 2 patches, each giving motivation on its own. * the last patch is vastly simplified in code, but takes an extra test * in [1], you remark "commits" ought not to be a pointer, but I decided against that, as we keep the pointed-at value around for the same time span (until we're done with that submodule) and we don't need to copy over the pointed-at value into the new struct. [1] https://public-inbox.org/git/20181018003954.139498-1-jonathantanmy@xxxxxxxxxx/ This is still based on ao/submodule-wo-gitmodules-checked-out. previous version https://public-inbox.org/git/20181016181327.107186-1-sbeller@xxxxxxxxxx/ Stefan Beller (10): sha1-array: provide oid_array_filter submodule.c: fix indentation submodule.c: sort changed_submodule_names before searching it submodule.c: tighten scope of changed_submodule_names struct submodule: store OIDs in changed_submodule_names repository: repo_submodule_init to take a submodule struct submodule: migrate get_next_submodule to use repository structs submodule.c: fetch in submodules git directory instead of in worktree fetch: try fetching submodules if needed objects were not fetched builtin/fetch: check for submodule updates in any ref update Documentation/technical/api-oid-array.txt | 5 + builtin/fetch.c | 11 +- builtin/grep.c | 17 +- builtin/ls-files.c | 12 +- builtin/submodule--helper.c | 2 +- repository.c | 27 +- repository.h | 12 +- sha1-array.c | 17 ++ sha1-array.h | 3 + submodule.c | 265 ++++++++++++++++--- t/helper/test-submodule-nested-repo-config.c | 8 +- t/t5526-fetch-submodules.sh | 55 ++++ 12 files changed, 346 insertions(+), 88 deletions(-) git range-diff origin/sb/submodule-recursive-fetch-gets-the-tip... 1: 3fbb06aedd ! 1: 0fdb0e2ad9 submodule.c: move global changed_submodule_names into fetch submodule struct @@ -1,12 +1,11 @@ Author: Stefan Beller <sbeller@xxxxxxxxxx> - submodule.c: move global changed_submodule_names into fetch submodule struct + submodule.c: tighten scope of changed_submodule_names struct The `changed_submodule_names` are only used for fetching, so let's make it part of the struct that is passed around for fetching submodules. Signed-off-by: Stefan Beller <sbeller@xxxxxxxxxx> - Signed-off-by: Junio C Hamano <gitster@xxxxxxxxx> diff --git a/submodule.c b/submodule.c --- a/submodule.c @@ -16,7 +15,6 @@ static int config_update_recurse_submodules = RECURSE_SUBMODULES_OFF; -static struct string_list changed_submodule_names = STRING_LIST_INIT_DUP; -+ static int initialized_fetch_ref_tips; static struct oid_array ref_tips_before_fetch; static struct oid_array ref_tips_after_fetch; @@ -25,22 +23,8 @@ } -static void calculate_changed_submodule_paths(void) -+struct submodule_parallel_fetch { -+ int count; -+ struct argv_array args; -+ struct repository *r; -+ const char *prefix; -+ int command_line_option; -+ int default_option; -+ int quiet; -+ int result; -+ -+ struct string_list changed_submodule_names; -+}; -+#define SPF_INIT {0, ARGV_ARRAY_INIT, NULL, NULL, 0, 0, 0, 0, STRING_LIST_INIT_DUP } -+ +static void calculate_changed_submodule_paths( -+ struct submodule_parallel_fetch *spf) ++ struct string_list *changed_submodule_names) { struct argv_array argv = ARGV_ARRAY_INIT; struct string_list changed_submodules = STRING_LIST_INIT_DUP; @@ -49,30 +33,23 @@ if (!submodule_has_commits(path, commits)) - string_list_append(&changed_submodule_names, name->string); -+ string_list_append(&spf->changed_submodule_names, ++ string_list_append(changed_submodule_names, + name->string); } free_submodules_oids(&changed_submodules); @@ - return ret; - } - --struct submodule_parallel_fetch { -- int count; -- struct argv_array args; -- struct repository *r; -- const char *prefix; -- int command_line_option; -- int default_option; -- int quiet; -- int result; --}; + int default_option; + int quiet; + int result; ++ ++ struct string_list changed_submodule_names; + }; -#define SPF_INIT {0, ARGV_ARRAY_INIT, NULL, NULL, 0, 0, 0, 0} -- ++#define SPF_INIT {0, ARGV_ARRAY_INIT, NULL, NULL, 0, 0, 0, 0, STRING_LIST_INIT_DUP } + static int get_fetch_recurse_config(const struct submodule *submodule, struct submodule_parallel_fetch *spf) - { @@ case RECURSE_SUBMODULES_ON_DEMAND: if (!submodule || @@ -88,7 +65,7 @@ - calculate_changed_submodule_paths(); - string_list_sort(&changed_submodule_names); -+ calculate_changed_submodule_paths(&spf); ++ calculate_changed_submodule_paths(&spf.changed_submodule_names); + string_list_sort(&spf.changed_submodule_names); run_processes_parallel(max_parallel_jobs, get_next_submodule, 2: a64a8322a1 ! 2: a11e6e39a2 submodule.c: do not copy around submodule list @@ -1,6 +1,6 @@ Author: Stefan Beller <sbeller@xxxxxxxxxx> - submodule.c: do not copy around submodule list + submodule: store OIDs in changed_submodule_names 'calculate_changed_submodule_paths' uses a local list to compute the changed submodules, and then produces the result by copying appropriate @@ -14,13 +14,13 @@ useful in a later patch. Signed-off-by: Stefan Beller <sbeller@xxxxxxxxxx> - Signed-off-by: Junio C Hamano <gitster@xxxxxxxxx> + Reviewed-by: Jonathan Tan <jonathantanmy@xxxxxxxxxx> diff --git a/submodule.c b/submodule.c --- a/submodule.c +++ b/submodule.c @@ - struct submodule_parallel_fetch *spf) + struct string_list *changed_submodule_names) { struct argv_array argv = ARGV_ARRAY_INIT; - struct string_list changed_submodules = STRING_LIST_INIT_DUP; @@ -34,10 +34,10 @@ * commits have been recorded upstream in "changed_submodule_names". */ - collect_changed_submodules(&changed_submodules, &argv); -+ collect_changed_submodules(&spf->changed_submodule_names, &argv); ++ collect_changed_submodules(changed_submodule_names, &argv); - for_each_string_list_item(name, &changed_submodules) { -+ for_each_string_list_item(name, &spf->changed_submodule_names) { ++ for_each_string_list_item(name, changed_submodule_names) { struct oid_array *commits = name->util; const struct submodule *submodule; const char *path = NULL; @@ -46,7 +46,7 @@ continue; - if (!submodule_has_commits(path, commits)) -- string_list_append(&spf->changed_submodule_names, +- string_list_append(changed_submodule_names, - name->string); + if (submodule_has_commits(path, commits)) { + oid_array_clear(commits); @@ -55,7 +55,7 @@ } - free_submodules_oids(&changed_submodules); -+ string_list_remove_empty_items(&spf->changed_submodule_names, 1); ++ string_list_remove_empty_items(changed_submodule_names, 1); + argv_array_clear(&argv); oid_array_clear(&ref_tips_before_fetch); 3: 9341b92c83 ! 3: 3f4e0d4b8d repository: repo_submodule_init to take a submodule struct @@ -5,17 +5,19 @@ When constructing a struct repository for a submodule for some revision of the superproject where the submodule is not contained in the index, it may not be present in the working tree currently either. In that - siutation giving a 'path' argument is not useful. Upgrade the + situation giving a 'path' argument is not useful. Upgrade the repo_submodule_init function to take a struct submodule instead. + The submodule struct can be obtained via submodule_from_{path, name} or + an artificial submodule struct can be passed in. - While we are at it, overhaul the repo_submodule_init function by renaming - the submodule repository struct, which is to be initialized, to a name - that is not confused with the struct submodule as easily. + While we are at it, rename the repository struct in the repo_submodule_init + function, which is to be initialized, to a name that is not confused with + the struct submodule as easily. Perform such renames in similar functions + as well. Also move its documentation into the header file. Signed-off-by: Stefan Beller <sbeller@xxxxxxxxxx> - Signed-off-by: Junio C Hamano <gitster@xxxxxxxxx> diff --git a/builtin/grep.c b/builtin/grep.c --- a/builtin/grep.c @@ -104,6 +106,19 @@ static void show_ce(struct repository *repo, struct dir_struct *dir, +diff --git a/builtin/submodule--helper.c b/builtin/submodule--helper.c +--- a/builtin/submodule--helper.c ++++ b/builtin/submodule--helper.c +@@ + if (!sub) + BUG("We could get the submodule handle before?"); + +- if (repo_submodule_init(&subrepo, the_repository, path)) ++ if (repo_submodule_init(&subrepo, the_repository, sub)) + die(_("could not get a repository handle for submodule '%s'"), path); + + if (!repo_config_get_string(&subrepo, "core.worktree", &cw)) { + diff --git a/repository.c b/repository.c --- a/repository.c +++ b/repository.c @@ -178,7 +193,8 @@ +/* + * Initialize the repository 'subrepo' as the submodule given by the + * struct submodule 'sub' in parent repository 'superproject'. -+ * Return 0 upon success and a non-zero value upon failure. ++ * Return 0 upon success and a non-zero value upon failure, which may happen ++ * if the submodule is not found, or 'sub' is NULL. + */ +struct submodule; +int repo_submodule_init(struct repository *subrepo, 4: cea909cbd4 ! 4: b1566069e7 submodule: fetch in submodules git directory instead of in worktree @@ -1,44 +1,19 @@ Author: Stefan Beller <sbeller@xxxxxxxxxx> - submodule: fetch in submodules git directory instead of in worktree + submodule: migrate get_next_submodule to use repository structs - This patch started as a refactoring to make 'get_next_submodule' more - readable, but upon doing so, I realized that "git fetch" of the submodule - actually doesn't need to be run in the submodules worktree. So let's run - it in its git dir instead. + We used to recurse into submodules, even if they were broken having + only an objects directory. The child process executed in the submodule + would fail though if the submodule was broken. - That should pave the way towards fetching submodules that are currently - not checked out. - - This patch leaks the cp->dir in get_next_submodule, as any further - callback in run_processes_parallel doesn't have access to the child - process any more. In an early iteration of this patch, the function - get_submodule_repo_for directly returned the string containing the - git directory, which would be a better design choice for this patch. - - However the next patch both fixes the memory leak of cp->dir and also has - a use case for using the full repository handle of the submodule, so - it makes sense to introduce the get_submodule_repo_for here already. + This patch tightens the check upfront, such that we do not need + to spawn a child process to find out if the submodule is broken. Signed-off-by: Stefan Beller <sbeller@xxxxxxxxxx> - Signed-off-by: Junio C Hamano <gitster@xxxxxxxxx> diff --git a/submodule.c b/submodule.c --- a/submodule.c +++ b/submodule.c -@@ - DEFAULT_GIT_DIR_ENVIRONMENT); - } - -+static void prepare_submodule_repo_env_in_gitdir(struct argv_array *out) -+{ -+ prepare_submodule_repo_env_no_git_dir(out); -+ argv_array_pushf(out, "%s=.", GIT_DIR_ENVIRONMENT); -+} -+ - /* Helper function to display the submodule header line prior to the full - * summary output. If it can locate the submodule objects directory it will - * attempt to lookup both the left and right commits and put them into the @@ return spf->default_option; } @@ -99,9 +74,8 @@ + if (repo) { child_process_init(cp); - cp->dir = strbuf_detach(&submodule_path, NULL); -- prepare_submodule_repo_env(&cp->env_array); -+ prepare_submodule_repo_env_in_gitdir(&cp->env_array); -+ cp->dir = xstrdup(repo->gitdir); + prepare_submodule_repo_env(&cp->env_array); ++ cp->dir = xstrdup(repo->worktree); cp->git_cmd = 1; if (!spf->quiet) strbuf_addf(err, "Fetching submodule %s%s\n", @@ -113,27 +87,17 @@ + repo_clear(repo); + free(repo); ret = 1; ++ } else { ++ /* ++ * An empty directory is normal, ++ * the submodule is not initialized ++ */ ++ if (S_ISGITLINK(ce->ce_mode) && ++ !is_empty_dir(ce->name)) ++ die(_("Could not access submodule '%s'"), ce->name); } - strbuf_release(&submodule_path); - strbuf_release(&submodule_git_dir); strbuf_release(&submodule_prefix); if (ret) { spf->count++; - -diff --git a/t/t5526-fetch-submodules.sh b/t/t5526-fetch-submodules.sh ---- a/t/t5526-fetch-submodules.sh -+++ b/t/t5526-fetch-submodules.sh -@@ - - test_must_fail git -C dst status && - test_must_fail git -C dst diff && -- test_must_fail git -C dst fetch --recurse-submodules -+ -+ # git-fetch cannot find the git directory of the submodule, -+ # so it will do nothing, successfully, as it cannot distinguish between -+ # this broken submodule and a submodule that was just set active but -+ # not cloned yet -+ git -C dst fetch --recurse-submodules - ' - - test_expect_success "fetch new commits when submodule got renamed" ' -: ---------- > 5: 2d98ff1201 submodule.c: fetch in submodules git directory instead of in worktree 5: 9ad0125310 ! 6: 092b9cbcba fetch: retry fetching submodules if needed objects were not fetched @@ -1,25 +1,23 @@ Author: Stefan Beller <sbeller@xxxxxxxxxx> - fetch: retry fetching submodules if needed objects were not fetched + fetch: try fetching submodules if needed objects were not fetched Currently when git-fetch is asked to recurse into submodules, it dispatches a plain "git-fetch -C <submodule-dir>" (with some submodule related options such as prefix and recusing strategy, but) without any information of the remote or the tip that should be fetched. - This works surprisingly well in some workflows (such as using submodules - as a third party library), while not so well in other scenarios, such - as in a Gerrit topic-based workflow, that can tie together changes - (potentially across repositories) on the server side. One of the parts - of such a Gerrit workflow is to download a change when wanting to examine - it, and you'd want to have its submodule changes that are in the same - topic downloaded as well. However these submodule changes reside in their - own repository in their own ref (refs/changes/<int>). + But this default fetch is not sufficient, as a newly fetched commit in + the superproject could point to a commit in the submodule that is not + in the default refspec. This is common in workflows like Gerrit's. + When fetching a Gerrit change under review (from refs/changes/??), the + commits in that change likely point to submodule commits that have not + been merged to a branch yet. - Retry fetching a submodule by object name if the object id that the + Try fetching a submodule by object id if the object id that the superproject points to, cannot be found. - This retrying does not happen when the "git fetch" done at the + The try does not happen when the "git fetch" done at the superproject is not storing the fetched results in remote tracking branches (i.e. instead just recording them to FETCH_HEAD) in this step. A later patch will fix this. @@ -32,7 +30,6 @@ in case the submodule recursion is set to "on". Signed-off-by: Stefan Beller <sbeller@xxxxxxxxxx> - Signed-off-by: Junio C Hamano <gitster@xxxxxxxxx> diff --git a/builtin/fetch.c b/builtin/fetch.c --- a/builtin/fetch.c @@ -75,16 +72,16 @@ int result; struct string_list changed_submodule_names; -+ struct get_next_submodule_task **retry; -+ int retry_nr, retry_alloc; ++ struct get_next_submodule_task **fetch_specific_oids; ++ int fetch_specific_oids_nr, fetch_specific_oids_alloc; }; -#define SPF_INIT {0, ARGV_ARRAY_INIT, NULL, NULL, 0, 0, 0, 0, STRING_LIST_INIT_DUP } +#define SPF_INIT {0, ARGV_ARRAY_INIT, NULL, NULL, 0, 0, 0, 0, \ + STRING_LIST_INIT_DUP, \ + NULL, 0, 0} - static void calculate_changed_submodule_paths( - struct submodule_parallel_fetch *spf) + static int get_fetch_recurse_config(const struct submodule *submodule, + struct submodule_parallel_fetch *spf) @@ return spf->default_option; } @@ -93,6 +90,8 @@ + struct repository *repo; + const struct submodule *sub; + unsigned free_sub : 1; /* Do we need to free the submodule? */ ++ ++ /* fetch specific oids if set, otherwise fetch default refspec */ + struct oid_array *commits; +}; + @@ -224,24 +223,29 @@ - repo_clear(repo); - free(repo); - ret = 1; -- } -- strbuf_release(&submodule_prefix); -- if (ret) { - spf->count++; ++ spf->count++; + *task_cb = task; + + strbuf_release(&submodule_prefix); - return 1; -+ } else { ++ return 1; + } else { ++ + get_next_submodule_task_release(task); + free(task); ++ + /* + * An empty directory is normal, + * the submodule is not initialized +@@ + !is_empty_dir(ce->name)) + die(_("Could not access submodule '%s'"), ce->name); } - } ++ } + -+ if (spf->retry_nr) { -+ struct get_next_submodule_task *task = spf->retry[spf->retry_nr - 1]; ++ if (spf->fetch_specific_oids_nr) { ++ struct get_next_submodule_task *task = spf->fetch_specific_oids[spf->fetch_specific_oids_nr - 1]; + struct strbuf submodule_prefix = STRBUF_INIT; -+ spf->retry_nr--; ++ spf->fetch_specific_oids_nr--; + + strbuf_addf(&submodule_prefix, "%s%s/", spf->prefix, task->sub->path); + @@ -262,9 +266,13 @@ + append_oid_to_argv, &cp->args); + + *task_cb = task; -+ strbuf_release(&submodule_prefix); + strbuf_release(&submodule_prefix); +- if (ret) { +- spf->count++; +- return 1; +- } + return 1; -+ } + } + return 0; } @@ -326,9 +334,11 @@ + return 0; + + task->commits = commits; -+ ALLOC_GROW(spf->retry, spf->retry_nr + 1, spf->retry_alloc); -+ spf->retry[spf->retry_nr] = task; -+ spf->retry_nr++; ++ ALLOC_GROW(spf->fetch_specific_oids, ++ spf->fetch_specific_oids_nr + 1, ++ spf->fetch_specific_oids_alloc); ++ spf->fetch_specific_oids[spf->fetch_specific_oids_nr] = task; ++ spf->fetch_specific_oids_nr++; + return 0; + } + @@ -346,18 +356,33 @@ test_cmp expect actual ' -+test_expect_success "fetch new commits on-demand when they are not reachable" ' ++test_expect_success "fetch new submodule commits on-demand outside standard refspec" ' ++ # add a second submodule and ensure it is around in downstream first ++ git clone submodule sub1 && ++ git submodule add ./sub1 && ++ git commit -m "adding a second submodule" && ++ git -C downstream pull && ++ git -C downstream submodule update --init --recursive && ++ + git checkout --detach && ++ + C=$(git -C submodule commit-tree -m "new change outside refs/heads" HEAD^{tree}) && + git -C submodule update-ref refs/changes/1 $C && + git update-index --cacheinfo 160000 $C submodule && -+ git commit -m "updated submodule outside of refs/heads" && -+ D=$(git rev-parse HEAD) && -+ git update-ref refs/changes/2 $D && ++ test_tick && ++ ++ D=$(git -C sub1 commit-tree -m "new change outside refs/heads" HEAD^{tree}) && ++ git -C sub1 update-ref refs/changes/2 $D && ++ git update-index --cacheinfo 160000 $D sub1 && ++ ++ git commit -m "updated submodules outside of refs/heads" && ++ E=$(git rev-parse HEAD) && ++ git update-ref refs/changes/2 $E && + ( + cd downstream && -+ git fetch --recurse-submodules --recurse-submodules-default on-demand origin refs/changes/2:refs/heads/my_branch && ++ git fetch --recurse-submodules origin refs/changes/2:refs/heads/my_branch && + git -C submodule cat-file -t $C && ++ git -C sub1 cat-file -t $D && + git checkout --recurse-submodules FETCH_HEAD + ) +' 6: b8db3b45bf < -: ---------- builtin/fetch: check for submodule updates for non branch fetches 7: 905a4f0d4f < -: ---------- fixup! repository: repo_submodule_init to take a submodule struct -: ---------- > 7: 11bf819782 builtin/fetch: check for submodule updates in any ref update