Re: [PATCH v2 4/6] grep: optionally recurse into submodules

Stefan Beller <sbeller@xxxxxxxxxx> · Tue, 1 Nov 2016 10:26:40 -0700

On Mon, Oct 31, 2016 at 3:38 PM, Brandon Williams <bmwill@xxxxxxxxxx> wrote:

>
> +--recurse-submodules::
> +       Recursively search in each submodule that has been initialized and
> +       checked out in the repository.
> +

and warn otherwise.

> +
> +       /*
> +        * Limit number of threads for child process to use.
> +        * This is to prevent potential fork-bomb behavior of git-grep as each
> +        * submodule process has its own thread pool.
> +        */
> +       if (num_threads)
> +               argv_array_pushf(&submodule_options, "--threads=%d",
> +                                (num_threads + 1) / 2);

Just like in the run_parallel machinery this seems like an approximate
workaround. I'm ok with that for now.

Ideally the parent/child can send each other signals to hand
over threads. (SIGUSR1/SIGUSR2 would be enough to do that,
though I wonder if that is as portable as I would hope.) Or we could
look at "make" and see how it handles recursive calls.

> +
> +       /*
> +        * Capture output to output buffer and check the return code from the
> +        * child process.  A '0' indicates a hit, a '1' indicates no hit and
> +        * anything else is an error.
> +        */
> +       status = capture_command(&cp, &w->out, 0);
> +       if (status && (status != 1))

Does the user have enough information about what went wrong?
Is the child verbose enough that we do not need to give a
die[_errno]("submodule process failed")?

> +static int grep_submodule(struct grep_opt *opt, const unsigned char *sha1,
> +                         const char *filename, const char *path)
> +{
> +       if (!(is_submodule_initialized(path) &&

If it is not initialized, the user "obviously" doesn't care, so maybe
we only need to warn if it is initialized but not checked out?

> +             is_submodule_checked_out(path))) {
> +               warning("skiping submodule '%s%s' since it is not initialized and checked out",
> +                       super_prefix ? super_prefix : "",
> +                       path);
> +               return 0;
> +       }
> +
> +#ifndef NO_PTHREADS
> +       if (num_threads) {
> +               add_work(opt, GREP_SOURCE_SUBMODULE, filename, path, sha1);
> +               return 0;
> +       } else
> +#endif
> +       {
> +               struct work_item w;
> +               int hit;
> +
> +               grep_source_init(&w.source, GREP_SOURCE_SUBMODULE,
> +                                filename, path, sha1);
> +               strbuf_init(&w.out, 0);
> +               opt->output_priv = &w;
> +               hit = grep_submodule_launch(opt, &w.source);
> +
> +               write_or_die(1, w.out.buf, w.out.len);
> +
> +               grep_source_clear(&w.source);
> +               strbuf_release(&w.out);
> +               return hit;
> +       }
> +}
> +
> +static int grep_cache(struct grep_opt *opt, const struct pathspec *pathspec,
> +                     int cached)
>  {
>         int hit = 0;
>         int nr;
> +       struct strbuf name = STRBUF_INIT;
> +       int name_base_len = 0;
> +       if (super_prefix) {
> +               name_base_len = strlen(super_prefix);
> +               strbuf_addstr(&name, super_prefix);
> +       }
> +
>         read_cache();
>
>         for (nr = 0; nr < active_nr; nr++) {
>                 const struct cache_entry *ce = active_cache[nr];
> -               if (!S_ISREG(ce->ce_mode))
> -                       continue;
> -               if (!ce_path_match(ce, pathspec, NULL))
> -                       continue;
> -               /*
> -                * If CE_VALID is on, we assume worktree file and its cache entry
> -                * are identical, even if worktree file has been modified, so use
> -                * cache version instead
> -                */
> -               if (cached || (ce->ce_flags & CE_VALID) || ce_skip_worktree(ce)) {
> -                       if (ce_stage(ce) || ce_intent_to_add(ce))
> -                               continue;
> -                       hit |= grep_sha1(opt, ce->oid.hash, ce->name, 0,
> -                                        ce->name);
> +               strbuf_setlen(&name, name_base_len);
> +               strbuf_addstr(&name, ce->name);
> +
> +               if (S_ISREG(ce->ce_mode) &&
> +                   match_pathspec(pathspec, name.buf, name.len, 0, NULL,
> +                                  S_ISDIR(ce->ce_mode) ||
> +                                  S_ISGITLINK(ce->ce_mode))) {

Why do we have to pass the ISDIR and ISGITLINK here for the regular file
case? ce_path_match and match_pathspec are doing the same thing?

> +                       /*
> +                        * If CE_VALID is on, we assume worktree file and its
> +                        * cache entry are identical, even if worktree file has
> +                        * been modified, so use cache version instead
> +                        */
> +                       if (cached || (ce->ce_flags & CE_VALID) ||
> +                           ce_skip_worktree(ce)) {
> +                               if (ce_stage(ce) || ce_intent_to_add(ce))
> +                                       continue;
> +                               hit |= grep_sha1(opt, ce->oid.hash, ce->name,
> +                                                0, ce->name);
> +                       } else {
> +                               hit |= grep_file(opt, ce->name);
> +                       }
> +               } else if (recurse_submodules && S_ISGITLINK(ce->ce_mode) &&
> +                          submodule_path_match(pathspec, name.buf, NULL)) {
> +                       hit |= grep_submodule(opt, NULL, ce->name, ce->name);

What is the difference between the last two parameters?

> + * filename: name of the submodule including tree name of parent
> + * path: location of the submodule

That sounds the same to me.

>         }
>
> +       if (recurse_submodules && (!use_index || untracked || list.nr))
> +               die(_("option not supported with --recurse-submodules."));

The user asks: Which option?

> +
> +test_expect_success 'grep and nested submodules' '
> +       git init submodule/sub &&
> +       echo "foobar" >submodule/sub/a &&
> +       git -C submodule/sub add a &&
> +       git -C submodule/sub commit -m "add a" &&
> +       git -C submodule submodule add ./sub &&
> +       git -C submodule add sub &&
> +       git -C submodule commit -m "added sub" &&
> +       git add submodule &&
> +       git commit -m "updated submodule" &&

Both in this test and in the setup, we set up a repository
with submodules that have clean working dirs.

What should happen with dirty working dirs? Dirty in the sense:
* file untracked in the submodule
* file added in the submodule, but not committed
* file committed in the submodule, that commit is
   untracked in the superproject
* file committed in the submodule, that commit is
  added to the index in the superproject
* (last case is just as above:) file committed in submodule,
   that commit was committed into the superproject.