On Mon, Apr 06, 2020 at 04:59:50PM +0000, Garima Singh via GitGitGadget wrote: > From: Garima Singh <garima.singh@xxxxxxxxxxxxx> > > Add logic to > a) parse Bloom filter information from the commit graph file and, > b) re-use existing Bloom filters. > > See Documentation/technical/commit-graph-format for the format in which > the Bloom filter information is written to the commit graph file. > > To read Bloom filter for a given commit with lexicographic position > 'i' we need to: > 1. Read BIDX[i] which essentially gives us the starting index in BDAT for > filter of commit i+1. It is essentially the index past the end > of the filter of commit i. It is called end_index in the code. > > 2. For i>0, read BIDX[i-1] which will give us the starting index in BDAT > for filter of commit i. It is called the start_index in the code. > For the first commit, where i = 0, Bloom filter data starts at the > beginning, just past the header in the BDAT chunk. Hence, start_index > will be 0. > > 3. The length of the filter will be end_index - start_index, because > BIDX[i] gives the cumulative 8-byte words including the ith > commit's filter. > > We toggle whether Bloom filters should be recomputed based on the > compute_if_not_present flag. > > Helped-by: Derrick Stolee <dstolee@xxxxxxxxxxxxx> > Signed-off-by: Garima Singh <garima.singh@xxxxxxxxxxxxx> > --- > bloom.c | 49 ++++++++++++++++++++++++++++++++++++++++++- > bloom.h | 4 +++- > commit-graph.c | 6 +++--- > t/helper/test-bloom.c | 2 +- > 4 files changed, 55 insertions(+), 6 deletions(-) > > diff --git a/bloom.c b/bloom.c > index a16eee92331..0f714dd76ae 100644 > --- a/bloom.c > +++ b/bloom.c > @@ -4,6 +4,8 @@ > #include "diffcore.h" > #include "revision.h" > #include "hashmap.h" > +#include "commit-graph.h" > +#include "commit.h" > > define_commit_slab(bloom_filter_slab, struct bloom_filter); > > @@ -26,6 +28,36 @@ static inline unsigned char get_bitmask(uint32_t pos) > return ((unsigned char)1) << (pos & (BITS_PER_WORD - 1)); > } > > +static int load_bloom_filter_from_graph(struct commit_graph *g, > + struct bloom_filter *filter, > + struct commit *c) > +{ > + uint32_t lex_pos, start_index, end_index; > + > + while (c->graph_pos < g->num_commits_in_base) > + g = g->base_graph; > + > + /* The commit graph commit 'c' lives in doesn't carry bloom filters. */ > + if (!g->chunk_bloom_indexes) > + return 0; > + > + lex_pos = c->graph_pos - g->num_commits_in_base; > + > + end_index = get_be32(g->chunk_bloom_indexes + 4 * lex_pos); Let's suppose that we encounter a bogus commit-graph file. This would then segfault if 'lex_pos' were to point past the end of file, i.e. past the mmap()-ed memory region. > + > + if (lex_pos > 0) > + start_index = get_be32(g->chunk_bloom_indexes + 4 * (lex_pos - 1)); > + else > + start_index = 0; > + > + filter->len = end_index - start_index; > + filter->data = (unsigned char *)(g->chunk_bloom_data + > + sizeof(unsigned char) * start_index + > + BLOOMDATA_CHUNK_HEADER_SIZE); And this could lead to segfault later when accessing the Bloom filter data if 'start_index' or 'end_index' were to point past EOF or end_index < start_index. IMO all indices and offsets read from the commit-graph file must be checked to ensure that they fit in the corresponding chunk, like I did in my modified path Bloom filters implementation. However, I'm not sure how it's best to handle an out-of-bounds offset... Simply erroring out in case of a bogus commit-graph file is the straightforward possibility, of course, but since the commit-graph is only an optimization, it would be better user experience to warn and ignore it and finish the operation without the commit-graph (albeit slower). But is it even possible to ignore the commit-graph, say, in the middle of a 'git rev-list --topo-order HEAD'? > + return 1; > +} > + > /* > * Calculate the murmur3 32-bit hash value for the given data > * using the given seed. > @@ -127,7 +159,8 @@ void init_bloom_filters(void) > } > > struct bloom_filter *get_bloom_filter(struct repository *r, > - struct commit *c) > + struct commit *c, > + int compute_if_not_present) > { > struct bloom_filter *filter; > struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS; > @@ -140,6 +173,20 @@ struct bloom_filter *get_bloom_filter(struct repository *r, > > filter = bloom_filter_slab_at(&bloom_filters, c); > > + if (!filter->data) { > + load_commit_graph_info(r, c); > + if (c->graph_pos != COMMIT_NOT_FROM_GRAPH && > + r->objects->commit_graph->chunk_bloom_indexes) { > + if (load_bloom_filter_from_graph(r->objects->commit_graph, filter, c)) > + return filter; > + else > + return NULL; > + } > + } > + > + if (filter->data || !compute_if_not_present) > + return filter; > + > repo_diff_setup(r, &diffopt); > diffopt.flags.recursive = 1; > diffopt.max_changes = max_changes; > diff --git a/bloom.h b/bloom.h > index 85ab8e9423d..760d7122374 100644 > --- a/bloom.h > +++ b/bloom.h > @@ -32,6 +32,7 @@ struct bloom_filter_settings { > > #define DEFAULT_BLOOM_FILTER_SETTINGS { 1, 7, 10 } > #define BITS_PER_WORD 8 > +#define BLOOMDATA_CHUNK_HEADER_SIZE 3 * sizeof(uint32_t) > > /* > * A bloom_filter struct represents a data segment to > @@ -79,6 +80,7 @@ void add_key_to_filter(const struct bloom_key *key, > void init_bloom_filters(void); > > struct bloom_filter *get_bloom_filter(struct repository *r, > - struct commit *c); > + struct commit *c, > + int compute_if_not_present); > > #endif > \ No newline at end of file > diff --git a/commit-graph.c b/commit-graph.c > index a8b6b5cca5d..77668629e27 100644 > --- a/commit-graph.c > +++ b/commit-graph.c > @@ -1086,7 +1086,7 @@ static void write_graph_chunk_bloom_indexes(struct hashfile *f, > ctx->commits.nr); > > while (list < last) { > - struct bloom_filter *filter = get_bloom_filter(ctx->r, *list); > + struct bloom_filter *filter = get_bloom_filter(ctx->r, *list, 0); > cur_pos += filter->len; > display_progress(progress, ++i); > hashwrite_be32(f, cur_pos); > @@ -1115,7 +1115,7 @@ static void write_graph_chunk_bloom_data(struct hashfile *f, > hashwrite_be32(f, settings->bits_per_entry); > > while (list < last) { > - struct bloom_filter *filter = get_bloom_filter(ctx->r, *list); > + struct bloom_filter *filter = get_bloom_filter(ctx->r, *list, 0); > display_progress(progress, ++i); > hashwrite(f, filter->data, filter->len * sizeof(unsigned char)); > list++; > @@ -1296,7 +1296,7 @@ static void compute_bloom_filters(struct write_commit_graph_context *ctx) > > for (i = 0; i < ctx->commits.nr; i++) { > struct commit *c = sorted_commits[i]; > - struct bloom_filter *filter = get_bloom_filter(ctx->r, c); > + struct bloom_filter *filter = get_bloom_filter(ctx->r, c, 1); > ctx->total_bloom_filter_data_size += sizeof(unsigned char) * filter->len; > display_progress(progress, i + 1); > } > diff --git a/t/helper/test-bloom.c b/t/helper/test-bloom.c > index f18d1b722e1..ce412664ba9 100644 > --- a/t/helper/test-bloom.c > +++ b/t/helper/test-bloom.c > @@ -39,7 +39,7 @@ static void get_bloom_filter_for_commit(const struct object_id *commit_oid) > struct bloom_filter *filter; > setup_git_directory(); > c = lookup_commit(the_repository, commit_oid); > - filter = get_bloom_filter(the_repository, c); > + filter = get_bloom_filter(the_repository, c, 1); > print_bloom_filter(filter); > } > > -- > gitgitgadget >