Load the Modified Path Bloom Filter Index (MPBI) chunk and check the embedded modified path Bloom filters within to speed up pathspec-limited revison walks. Extend 'struct pathspec_item' to hold the hashes of each path for modified path Bloom filters. Pathspecs are used by many Git commands, and a lot of them doesn't do any revision walk at all, so don't initialize those fields in parse_pathspec(), but only when we are about to start a pathspec-limited revision walk, i.e. in prepare_revision_walk(). Initialize these fields only in the revs->pruning.pathspec instance, because that's the one used to limit the revision walk. Note that some revision walks re-parse the pathspecs, notably 'git log --follow' and the line-level log, but they don't use this pathspec instance. The table below shows the average runtime and speedup of git rev-list HEAD -- "$path" for 5000+ randomly selected paths from each repository with and without modified path Bloom filters: Average runtime without with MPBI MPBI -------------------------------------------- android-base 0.8780s n/a cmssw 0.3143s 0.2104s -33.1% cpython 0.7453s 0.3410s -54.2% gcc 7.1852s 3.3633s -53.2% gecko-dev 4.6113s n/a git 0.6180s 0.2184s -64.7% glibc 0.5618s 0.3245s -42.2% jdk 0.0482s 0.0395s -18.0% linux 0.7043s 0.4810s -31.6% llvm-project 2.6844s n/a rails 0.2784s 0.1792s -35.6% rust 0.7757s 0.6073s -21.7% webkit 1.9137s 1.4832s -22.5% The results in the homebrew-core repository can be spectacular, though that repository looks like as if it was specifically designed to show off the benefits of this patch: over 98% of its almost 164k commits in its linear history modify one file within the 'Formula' directory, and that directory contains almost 5000 files. Consequently, almost all commits will have a 63 bit "embedded" modified path Bloom filter containing only 2 paths, resulting in a false positive rate less than 0.1%, and they can spare a lot of expensive tree entry scanning to the end of that directory. So looking at the history of the first and last files in that directory: Runtime Max RSS without with without with MPBI MPBI Speedup MPBI MPBI ----------------------------------------------------------------- Formula/a2ps.rb 8.390s 0.218s 38.5x 422884k 129212k Formula/zzz.rb 80.495s 0.176s 450.4x 423744k 68364k ('Formula/a2ps.rb' is modified 32 times and has two false positives, while 'zzz.rb' is modified only twice and happens to have only a single false positive; that's why querying the history of the former is slower, even though it's at the beginning of the directory). Signed-off-by: SZEDER Gábor <szeder.dev@xxxxxxxxx> --- commit-graph.c | 146 +++++++++++++++++++++++++++++++++++++++++++++++++ commit-graph.h | 13 +++++ pathspec.c | 9 +++ pathspec.h | 12 ++++ revision.c | 18 ++++++ 5 files changed, 198 insertions(+) diff --git a/commit-graph.c b/commit-graph.c index 413605a29c..fb24600bb3 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -26,6 +26,7 @@ #define GRAPH_CHUNKID_EXTRAEDGES 0x45444745 /* "EDGE" */ #define GRAPH_CHUNKID_BASE 0x42415345 /* "BASE" */ #define GRAPH_CHUNKID_MODIFIED_PATH_BLOOM_FILTER_INDEX 0x4d504249 /* "MPBI" */ +#define GRAPH_CHUNKID_MODIFIED_PATH_BLOOM_FILTER_EXCLUDES 0x4d504258 /* "MPBX" */ #define GRAPH_DATA_WIDTH (the_hash_algo->rawsz + 16) @@ -308,6 +309,23 @@ struct commit_graph *parse_commit_graph(void *graph_map, int fd, chunk_repeated = 1; else graph->chunk_base_graphs = data + chunk_offset; + break; + + case GRAPH_CHUNKID_MODIFIED_PATH_BLOOM_FILTER_INDEX: + if (graph->chunk_mpbf_index) + chunk_repeated = 1; + else { + graph->chunk_mpbf_index = data + chunk_offset; + graph->chunk_mpbf_index_size = next_chunk_offset - chunk_offset; + } + break; + + case GRAPH_CHUNKID_MODIFIED_PATH_BLOOM_FILTER_EXCLUDES: + if (graph->chunk_mpbf_excludes) + chunk_repeated = 1; + else + graph->chunk_mpbf_excludes = data + chunk_offset; + break; } if (chunk_repeated) { @@ -324,6 +342,29 @@ struct commit_graph *parse_commit_graph(void *graph_map, int fd, return NULL; } + if (graph->chunk_mpbf_index) { + int res = 0; + git_config_get_bool("core.modifiedPathBloomFilters", &res); + if (res) { + uint64_t expected_size = sizeof(uint8_t) + + graph->num_commits * sizeof(uint64_t); + if (graph->chunk_mpbf_index_size != expected_size) + BUG(_("for %u commits the Modified Path Bloom Filter Index chunk should be %ld bytes, but it's %ld"), + graph->num_commits, expected_size, + graph->chunk_mpbf_index_size); + graph->num_modified_path_bloom_hashes = *graph->chunk_mpbf_index; + if (!graph->num_modified_path_bloom_hashes) + BUG(_("Modified Path Bloom Filter Index chunk using 0 hashes")); + if (graph->chunk_mpbf_excludes) + /* + * Modified Path Bloom Filter Excludes are + * not supported yet. + */ + graph->chunk_mpbf_index = NULL; + } else + graph->chunk_mpbf_index = NULL; + } + return graph; } @@ -777,6 +818,68 @@ struct tree *get_commit_tree_in_graph(struct repository *r, const struct commit return get_commit_tree_in_graph_one(r, r->objects->commit_graph, c); } +static int load_modified_path_bloom_filter_from_graph( + struct commit_graph *graph, struct commit *commit, + struct commit *parent, struct bloom_filter *bf) +{ + const uint8_t *bloom_index; + int first_parent = 0; + + if (commit->graph_pos == COMMIT_NOT_FROM_GRAPH) + return 0; + if (!graph) + return 0; + if (!graph->chunk_mpbf_index) + return 0; + + if (!commit->parents || commit->parents->item == parent) + first_parent = 1; + + bloom_index = graph->chunk_mpbf_index + sizeof(uint8_t) + + sizeof(uint64_t) * commit->graph_pos; + + if (bloom_index[0] & (1 << 7)) { + uint64_t v; + if (!first_parent) + return 0; + memcpy(&v, bloom_index, sizeof(v)); + if (v == GRAPH_MODIFIED_PATH_BLOOM_FILTER_NONE) + return 0; + + /* embedded modified path Bloom filter */ + bf->nr_bits = GRAPH_MODIFIED_PATH_BLOOM_FILTER_EMBEDDED_NR_BITS; + bf->bits = (uint8_t*) bloom_index; + return 1; + } + /* support for non-embedded Bloom filters is not implemented yet. */ + return 0; +} + +enum bloom_result check_modified_path_bloom_filter(struct repository *r, + struct pathspec *pathspec, struct commit *parent, + struct commit *commit) +{ + struct bloom_filter bf; + int i; + + if (!pathspec->can_use_modified_path_bloom_filters) + return BLOOM_CANT_TELL; + if (!load_modified_path_bloom_filter_from_graph(r->objects->commit_graph, + commit, parent, &bf)) + return BLOOM_CANT_TELL; + + for (i = 0; i < pathspec->nr; i++) { + struct pathspec_item *pi = &pathspec->items[i]; + + if (bloom_filter_check_bits(&bf, + pi->modified_path_bloom_hashes, + pi->modified_path_bloom_hashes_nr)) + return BLOOM_POSSIBLY_YES; + } + + return BLOOM_DEFINITELY_NOT; +} + static uint32_t modified_path_bloom_seeds[] = { 0xe83c5163U, 0x3b376b0cU, @@ -807,6 +910,49 @@ static void compute_modified_path_bloom_hashes_for_path(const char *path, } } +void init_pathspec_bloom_fields(struct repository *r, + struct pathspec *pathspec) +{ + const unsigned bloom_compatible_magic = PATHSPEC_LITERAL; + struct commit_graph *graph = r->objects->commit_graph; + int i; + + if (!graph) + return; + if (!graph->chunk_mpbf_index) + return; + if (!pathspec->nr) + return; + if (pathspec->has_wildcard) + return; + if (pathspec->magic & ~bloom_compatible_magic) + return; + + for (i = 0; i < pathspec->nr; i++) { + struct pathspec_item *pi = &pathspec->items[i]; + const char *path = pi->match; + size_t len = pi->len; + + /* + * Pathspec parsing has normalized away any consecutive + * slashes, but a trailing slash might still be present, + * "remove" it. + */ + if (path[len - 1] == '/') + len--; + + pi->modified_path_bloom_hashes_nr = graph->num_modified_path_bloom_hashes; + ALLOC_ARRAY(pi->modified_path_bloom_hashes, + pi->modified_path_bloom_hashes_nr); + + compute_modified_path_bloom_hashes_for_path(path, len, + graph->num_modified_path_bloom_hashes, + pi->modified_path_bloom_hashes); + } + + pathspec->can_use_modified_path_bloom_filters = 1; +} + struct packed_commit_list { struct commit **list; int nr; diff --git a/commit-graph.h b/commit-graph.h index 847cd25cfc..09dfc16932 100644 --- a/commit-graph.h +++ b/commit-graph.h @@ -11,6 +11,8 @@ struct commit; struct repository; struct raw_object_store; struct string_list; +struct bloom_filter; +struct pathspec; char *get_commit_graph_filename(const char *obj_dir); int open_commit_graph(const char *graph_file, int *fd, struct stat *st); @@ -38,6 +40,12 @@ void load_commit_graph_info(struct repository *r, struct commit *item); struct tree *get_commit_tree_in_graph(struct repository *r, const struct commit *c); +enum bloom_result check_modified_path_bloom_filter(struct repository *r, + struct pathspec *pathspec, struct commit *parent, + struct commit *commit); +void init_pathspec_bloom_fields(struct repository *r, + struct pathspec *pathspec); + struct commit_graph { int graph_fd; @@ -59,6 +67,11 @@ struct commit_graph { const unsigned char *chunk_commit_data; const unsigned char *chunk_extra_edges; const unsigned char *chunk_base_graphs; + const unsigned char *chunk_mpbf_index; + uint64_t chunk_mpbf_index_size; + const unsigned char *chunk_mpbf_excludes; + + uint8_t num_modified_path_bloom_hashes; }; struct commit_graph *load_commit_graph_one_fd_st(int fd, struct stat *st); diff --git a/pathspec.c b/pathspec.c index 128f27fcb7..01e7ae6944 100644 --- a/pathspec.c +++ b/pathspec.c @@ -406,6 +406,8 @@ static void init_pathspec_item(struct pathspec_item *item, unsigned flags, item->attr_check = NULL; item->attr_match = NULL; item->attr_match_nr = 0; + item->modified_path_bloom_hashes_nr = 0; + item->modified_path_bloom_hashes = NULL; /* PATHSPEC_LITERAL_PATH ignores magic */ if (flags & PATHSPEC_LITERAL_PATH) { @@ -674,6 +676,12 @@ void copy_pathspec(struct pathspec *dst, const struct pathspec *src) } d->attr_check = attr_check_dup(s->attr_check); + + ALLOC_ARRAY(d->modified_path_bloom_hashes, + d->modified_path_bloom_hashes_nr); + COPY_ARRAY(d->modified_path_bloom_hashes, + s->modified_path_bloom_hashes, + d->modified_path_bloom_hashes_nr); } } @@ -684,6 +692,7 @@ void clear_pathspec(struct pathspec *pathspec) for (i = 0; i < pathspec->nr; i++) { free(pathspec->items[i].match); free(pathspec->items[i].original); + free(pathspec->items[i].modified_path_bloom_hashes); for (j = 0; j < pathspec->items[i].attr_match_nr; j++) free(pathspec->items[i].attr_match[j].value); diff --git a/pathspec.h b/pathspec.h index 454ce364fa..0c661e1b03 100644 --- a/pathspec.h +++ b/pathspec.h @@ -32,6 +32,7 @@ struct pathspec { unsigned int has_wildcard:1; unsigned int recursive:1; unsigned int recurse_submodules:1; + unsigned int can_use_modified_path_bloom_filters:1; unsigned magic; int max_depth; struct pathspec_item { @@ -52,6 +53,17 @@ struct pathspec { } match_mode; } *attr_match; struct attr_check *attr_check; + + /* + * These fields are only relevant during pathspec-limited + * revision walks, init_pathspec_item() leaves them + * zero-initialized, but copy_pathspec() copies them, + * and clear_pathspec() releases the allocated memory. + * IOW: they are only valid if 'struct pathspec's + * 'can_use_modified_path_bloom_filters' bit is set. + */ + uint32_t modified_path_bloom_hashes_nr; + uint32_t *modified_path_bloom_hashes; } *items; }; diff --git a/revision.c b/revision.c index 9dac1262ef..6ec4bc0cb1 100644 --- a/revision.c +++ b/revision.c @@ -29,6 +29,7 @@ #include "prio-queue.h" #include "hashmap.h" #include "utf8.h" +#include "bloom-filter.h" volatile show_early_output_fn_t show_early_output; @@ -629,6 +630,7 @@ static int rev_compare_tree(struct rev_info *revs, { struct tree *t1 = get_commit_tree(parent); struct tree *t2 = get_commit_tree(commit); + enum bloom_result bloom_ret; if (!t1) return REV_TREE_NEW; @@ -653,6 +655,12 @@ static int rev_compare_tree(struct rev_info *revs, return REV_TREE_SAME; } + bloom_ret = check_modified_path_bloom_filter(revs->repo, + &revs->pruning.pathspec, + parent, commit); + if (bloom_ret == BLOOM_DEFINITELY_NOT) + return REV_TREE_SAME; + tree_difference = REV_TREE_SAME; revs->pruning.flags.has_changes = 0; diff_tree_oid(&t1->object.oid, &t2->object.oid, "", &revs->pruning); @@ -662,10 +670,17 @@ static int rev_compare_tree(struct rev_info *revs, static int rev_same_tree_as_empty(struct rev_info *revs, struct commit *commit) { struct tree *t1 = get_commit_tree(commit); + enum bloom_result bloom_ret; if (!t1) return 0; + bloom_ret = check_modified_path_bloom_filter(revs->repo, + &revs->pruning.pathspec, + NULL, commit); + if (bloom_ret == BLOOM_DEFINITELY_NOT) + return 1; + tree_difference = REV_TREE_SAME; revs->pruning.flags.has_changes = 0; diff_tree_oid(NULL, &t1->object.oid, "", &revs->pruning); @@ -3376,6 +3391,9 @@ int prepare_revision_walk(struct rev_info *revs) simplify_merges(revs); if (revs->children.name) set_children(revs); + + init_pathspec_bloom_fields(revs->repo, &revs->pruning.pathspec); + return 0; } -- 2.27.0.rc1.431.g5c813f95dc