Modified path Bloom filters don't store the names of modified paths, they only set a couple of bits based on those paths' hashes. Consequently, they can only be used when looking for the history of a concrete path, so we disabled them when looking for pathspecs with wildcards. However, if the pathspec has "wildcard-less" leading directories, then we can use modified path Bloom filters to skip commits that don't modify those leading directories. As a result, something like: git -c core.modifiedPathBloomFilters=1 rev-list HEAD -- 'compat/win32/pthread.*' will take only ~0.045s instead of ~1.24s, achieving over 27x speedup. For comparison, letting the shell do the wildcard matching, i.e. the equivalent of using the pathspecs 'compat/win32/pthread.c' and 'compat/win32/pthread.h' takes ~0.311s without using modified path Bloom filters: apparently tree-diff with wildcards can be considerably more expensive than without wildcards, even if the wildcard is in the last path component and that directory only contains a dozen files. 
Signed-off-by: SZEDER Gábor <szeder.dev@xxxxxxxxx> --- commit-graph.c | 44 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/commit-graph.c b/commit-graph.c index 8eb0cbedaf..db43877426 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -1023,9 +1023,9 @@ static void compute_modified_path_bloom_hashes_for_path(const char *path, void init_pathspec_bloom_fields(struct repository *r, struct pathspec *pathspec) { - const unsigned bloom_compatible_magic = PATHSPEC_LITERAL; + const unsigned bloom_compatible_magic = PATHSPEC_LITERAL | PATHSPEC_GLOB; struct commit_graph *graph = r->objects->commit_graph; - int i; + int i, can_use_modified_path_bloom_filters; if (!graph) return; @@ -1033,15 +1033,14 @@ void init_pathspec_bloom_fields(struct repository *r, return; if (!pathspec->nr) return; - if (pathspec->has_wildcard) - return; if (pathspec->magic & ~bloom_compatible_magic) return; + can_use_modified_path_bloom_filters = 1; for (i = 0; i < pathspec->nr; i++) { struct pathspec_item *pi = &pathspec->items[i]; const char *path = pi->match, *p; - size_t len = pi->len; + size_t nowildcard_len = pi->nowildcard_len; int path_component_nr = 0, j; uint32_t *hashes; struct bloom_filter embedded_bf; @@ -1051,14 +1050,29 @@ void init_pathspec_bloom_fields(struct repository *r, * slashes, but a trailing slash might still be present, * "remove" it. */ - if (path[len - 1] == '/') - len--; + if (path[nowildcard_len - 1] == '/') + nowildcard_len--; p = path; do { p = strchrnul(p + 1, '/'); - path_component_nr++; - } while (p - path < len); + if (p - path <= nowildcard_len) + path_component_nr++; + } while (p - path < nowildcard_len); + /* + * If a pathspec uses wildcards but has wildcard-less + * leading directories, then we can use modified path Bloom + * filters to skip commits that don't modify those leading + * directories. 
+ * However, if there is even one pathspec that has a wildcard + * in its first path component, then we have no choice but + * to run tree-diff anyway, so don't bother with Bloom + * filters at all in that case. + */ + if (!path_component_nr) { + can_use_modified_path_bloom_filters = 0; + break; + } pi->modified_path_bloom_hashes_nr = path_component_nr * graph->num_modified_path_bloom_hashes; ALLOC_ARRAY(pi->modified_path_bloom_hashes, @@ -1084,7 +1098,17 @@ void init_pathspec_bloom_fields(struct repository *r, pi->modified_path_bloom_hashes_nr); } - pathspec->can_use_modified_path_bloom_filters = 1; + if (can_use_modified_path_bloom_filters) { + pathspec->can_use_modified_path_bloom_filters = 1; + } else { + int j; + for (j = 0; j < i; j++) { + struct pathspec_item *pi = &pathspec->items[j]; + FREE_AND_NULL(pi->modified_path_bloom_hashes); + pi->modified_path_bloom_hashes_nr = 0; + pi->modified_path_bloom_mask = 0; + } + } } struct packed_commit_list { -- 2.27.0.rc1.431.g5c813f95dc