Re: [PATCH] do_compare_entry: use already-computed path

Junio C Hamano <gitster@xxxxxxxxx> · Mon, 21 Dec 2015 09:17:23 -0800

David Turner <dturner@xxxxxxxxxxxxxxxx> writes:

> In traverse_trees, we generate the complete traverse path for a
> traverse_info.  Later, in do_compare_entry, we used to go do a bunch
> of work to compare the traverse_info to a cache_entry's name without
> computing that path.  But since we already have that path, we don't
> need to do all that work.  Instead, we can just stuff the generated
> path into the traverse_info, and do the comparison more directly.
> This makes git checkout much faster -- about 25% on Twitter's
> monorepo.  Deeper directory trees are likely to benefit more than
> shallower ones.

Great work.

IIRC, very early incarnation of the code avoided hard to build the
full path and that was why the path-component-wise comparison was
used heavily in this codepath; at some point we just gave up, I
think.  If we can pass this flattened representation around to
callees that can make good use of it, that makes tons of sense.  I
like the basic idea of this change.

I am bit worried that &base is passed to some function from here,
and they do not take "const struct strbuf *", but a non-const one,
allowing them to potentially modify its contents while this new
field in info struct wants to have the original base.buf, but I
trust you traced the callchain to make sure nothing wrong happens?
Even if that is the case, I feel that this change is setting up a
trap somebody else would easily trigger unknowingly--I wonder how
we can avoid future breakages.

> Signed-off-by: David Turner <dturner@xxxxxxxxxxxxxxxx>
> ---
>  tree-walk.c    |  4 ++++
>  tree-walk.h    |  1 +
>  unpack-trees.c | 41 +++++++++++++++++++++++++++++++++++++++--
>  3 files changed, 44 insertions(+), 2 deletions(-)
>
> diff --git a/tree-walk.c b/tree-walk.c
> index 6dccd2d..4cab3e1 100644
> --- a/tree-walk.c
> +++ b/tree-walk.c
> @@ -329,6 +329,9 @@ int traverse_trees(int n, struct tree_desc *t, struct traverse_info *info)
>  		make_traverse_path(base.buf, info->prev, &info->name);
>  		base.buf[info->pathlen-1] = '/';
>  		strbuf_setlen(&base, info->pathlen);
> +		info->traverse_path = base.buf;
> +	} else {
> +		info->traverse_path = info->name.path;
>  	}
>  	for (;;) {
>  		int trees_used;
> @@ -411,6 +414,7 @@ int traverse_trees(int n, struct tree_desc *t, struct traverse_info *info)
>  	for (i = 0; i < n; i++)
>  		free_extended_entry(tx + i);
>  	free(tx);
> +	info->traverse_path = NULL;
>  	strbuf_release(&base);
>  	return error;
>  }
> diff --git a/tree-walk.h b/tree-walk.h
> index 3b2f7bf..174eb61 100644
> --- a/tree-walk.h
> +++ b/tree-walk.h
> @@ -59,6 +59,7 @@ enum follow_symlinks_result {
>  enum follow_symlinks_result get_tree_entry_follow_symlinks(unsigned char *tree_sha1, const char *name, unsigned char *result, struct strbuf *result_path, unsigned *mode);
>  
>  struct traverse_info {
> +	const char *traverse_path;
>  	struct traverse_info *prev;
>  	struct name_entry name;
>  	int pathlen;
> diff --git a/unpack-trees.c b/unpack-trees.c
> index 8e2032f..127dd4d 100644
> --- a/unpack-trees.c
> +++ b/unpack-trees.c
> @@ -498,13 +498,14 @@ static int traverse_trees_recursive(int n, unsigned long dirmask,
>   * itself - the caller needs to do the final check for the cache
>   * entry having more data at the end!
>   */
> -static int do_compare_entry(const struct cache_entry *ce, const struct traverse_info *info, const struct name_entry *n)
> +static int do_compare_entry_piecewise(const struct cache_entry *ce, const struct traverse_info *info, const struct name_entry *n)
>  {
>  	int len, pathlen, ce_len;
>  	const char *ce_name;
>  
>  	if (info->prev) {
> -		int cmp = do_compare_entry(ce, info->prev, &info->name);
> +		int cmp = do_compare_entry_piecewise(ce, info->prev,
> +						     &info->name);
>  		if (cmp)
>  			return cmp;
>  	}
> @@ -522,6 +523,42 @@ static int do_compare_entry(const struct cache_entry *ce, const struct traverse_
>  	return df_name_compare(ce_name, ce_len, S_IFREG, n->path, len, n->mode);
>  }
>  
> +static int do_compare_entry(const struct cache_entry *ce,
> +			    const struct traverse_info *info,
> +			    const struct name_entry *n)
> +{
> +	int len, pathlen, ce_len;
> +	const char *ce_name;
> +	int cmp;
> +
> +	/*
> +	 * If we have not precomputed the traverse path, it is quicker
> +	 * to avoid doing so.  But if we have precomputed it,
> +	 * it is quicker to use the precomputed version.
> +	 */
> +	if (!info->traverse_path)
> +		return do_compare_entry_piecewise(ce, info, n);
> +
> +	cmp = strncmp(ce->name, info->traverse_path, info->pathlen);
> +	if (cmp)
> +		return cmp;
> +
> +	pathlen = info->pathlen;
> +	ce_len = ce_namelen(ce);
> +
> +	if (ce_len < pathlen) {
> +		if (do_compare_entry_piecewise(ce, info, n) >= 0)
> +			die("pathlen");
> +		return -1;
> +	}
> +
> +	ce_len -= pathlen;
> +	ce_name = ce->name + pathlen;
> +
> +	len = tree_entry_len(n);
> +	return df_name_compare(ce_name, ce_len, S_IFREG, n->path, len, n->mode);
> +}
> +
>  static int compare_entry(const struct cache_entry *ce, const struct traverse_info *info, const struct name_entry *n)
>  {
>  	int cmp = do_compare_entry(ce, info, n);
--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html