"Garima Singh via GitGitGadget" <gitgitgadget@xxxxxxxxx> writes: > From: Garima Singh <garima.singh@xxxxxxxxxxxxx> > > Write bloom filters to the commit-graph using the format described in > Documentation/technical/commit-graph-format.txt > > Helped-by: Derrick Stolee <dstolee@xxxxxxxxxxxxx> > Signed-off-by: Garima Singh <garima.singh@xxxxxxxxxxxxx> Looks good to me. > --- > commit-graph.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++- > commit-graph.h | 5 ++++ > 2 files changed, 85 insertions(+), 1 deletion(-) > > diff --git a/commit-graph.c b/commit-graph.c > index 8c4941eeaa..def2ade166 100644 > --- a/commit-graph.c > +++ b/commit-graph.c > @@ -24,7 +24,9 @@ > #define GRAPH_CHUNKID_DATA 0x43444154 /* "CDAT" */ > #define GRAPH_CHUNKID_EXTRAEDGES 0x45444745 /* "EDGE" */ > #define GRAPH_CHUNKID_BASE 0x42415345 /* "BASE" */ > -#define MAX_NUM_CHUNKS 5 > +#define GRAPH_CHUNKID_BLOOMINDEXES 0x42494458 /* "BIDX" */ > +#define GRAPH_CHUNKID_BLOOMDATA 0x42444154 /* "BDAT" */ > +#define MAX_NUM_CHUNKS 7 Very minor nitpick: shouldn't we follow the order in the commit-graph-format.txt document (i.e. "BASE" as last chunk and last preprocessor constant)? > > #define GRAPH_DATA_WIDTH (the_hash_algo->rawsz + 16) > > @@ -282,6 +284,32 @@ struct commit_graph *parse_commit_graph(void *graph_map, int fd, > chunk_repeated = 1; > else > graph->chunk_base_graphs = data + chunk_offset; > + break; > + > + case GRAPH_CHUNKID_BLOOMINDEXES: > + if (graph->chunk_bloom_indexes) > + chunk_repeated = 1; > + else > + graph->chunk_bloom_indexes = data + chunk_offset; > + break; All right. > + > + case GRAPH_CHUNKID_BLOOMDATA: > + if (graph->chunk_bloom_data) > + chunk_repeated = 1; > + else { > + uint32_t hash_version; > + graph->chunk_bloom_data = data + chunk_offset; > + hash_version = get_be32(data + chunk_offset); All right, now I see why all those header values for BDAT chunk are defined to be 32-bit integers. For code simplicity. > + > + if (hash_version != 1) > + break; What does it mean for Git? Behave as if there were no Bloom filter data? > + > + graph->settings = xmalloc(sizeof(struct bloom_filter_settings)); > + graph->settings->hash_version = hash_version; > + graph->settings->num_hashes = get_be32(data + chunk_offset + 4); > + graph->settings->bits_per_entry = get_be32(data + chunk_offset + 8); All right, looks O.K. > + } > + break; > } > > if (chunk_repeated) { > @@ -996,6 +1024,39 @@ static void write_graph_chunk_extra_edges(struct hashfile *f, > } > } > > +static void write_graph_chunk_bloom_indexes(struct hashfile *f, > + struct write_commit_graph_context *ctx) > +{ > + struct commit **list = ctx->commits.list; > + struct commit **last = ctx->commits.list + ctx->commits.nr; > + uint32_t cur_pos = 0; > + > + while (list < last) { > + struct bloom_filter *filter = get_bloom_filter(ctx->r, *list); > + cur_pos += filter->len; > + hashwrite_be32(f, cur_pos); > + list++; > + } Why not follow the write_graph_chunk_oids() example, instead of write_graph_chunk_data(), that is use simply: + struct commit **list = ctx->commits.list; + uint32_t cur_pos = 0; + + for (count = 0; count < ctx->commits.nr; count++, list++) { + struct bloom_filter *filter = get_bloom_filter(ctx->r, *list); + cur_pos += filter->len; + hashwrite_be32(f, cur_pos); + } I guess using here + cur_pos += get_bloom_filter(ctx->r, *list)->len; would be too cryptic, and hard to debug? Also, wouldn't we need + display_progress(ctx->progress, ++ctx->progress_cnt); before hashwrite_be32()? > +} > + > +static void write_graph_chunk_bloom_data(struct hashfile *f, > + struct write_commit_graph_context *ctx, > + struct bloom_filter_settings *settings) > +{ > + struct commit **first = ctx->commits.list; Even if we decide to use `while` loop, like write_graph_chunk_data(), and not `for` loop, like write_graph_chunk_oids(), why the change from `struct commit **list = ...` to `struct commit **first = ...`? > + struct commit **last = ctx->commits.list + ctx->commits.nr; > + > + hashwrite_be32(f, settings->hash_version); > + hashwrite_be32(f, settings->num_hashes); > + hashwrite_be32(f, settings->bits_per_entry); All right, simple. > + > + while (first < last) { > + struct bloom_filter *filter = get_bloom_filter(ctx->r, *first); Hmmm... wouldn't this compute Bloom filter second time? get_bloom_filter() does work unconditionally. Wouldn't + struct bloom_filter *filter = bloom_filter_slab_at(&bloom_filters, *first); be enough? Or make get_bloom_filter() use *_peek() to check if Bloom filter for given commit was already computed, and only if it returns NULL do the work. > + hashwrite(f, filter->data, filter->len * sizeof(uint64_t)); Might need display_progress() before hashwrote(). > + first++; > + } > +} > + > static int oid_compare(const void *_a, const void *_b) > { > const struct object_id *a = (const struct object_id *)_a; > @@ -1388,6 +1449,7 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx) > struct strbuf progress_title = STRBUF_INIT; > int num_chunks = 3; > struct object_id file_hash; > + struct bloom_filter_settings bloom_settings = DEFAULT_BLOOM_FILTER_SETTINGS; > > if (ctx->split) { > struct strbuf tmp_file = STRBUF_INIT; > @@ -1432,6 +1494,12 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx) > chunk_ids[num_chunks] = GRAPH_CHUNKID_EXTRAEDGES; > num_chunks++; > } > + if (ctx->bloom) { > + chunk_ids[num_chunks] = GRAPH_CHUNKID_BLOOMINDEXES; > + num_chunks++; > + chunk_ids[num_chunks] = GRAPH_CHUNKID_BLOOMDATA; > + num_chunks++; > + } Looks all right. > if (ctx->num_commit_graphs_after > 1) { > chunk_ids[num_chunks] = GRAPH_CHUNKID_BASE; > num_chunks++; > @@ -1450,6 +1518,13 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx) > 4 * ctx->num_extra_edges; > num_chunks++; > } > + if (ctx->bloom) { > + chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] + sizeof(uint32_t) * ctx->commits.nr; > + num_chunks++; > + > + chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] + sizeof(uint32_t) * 3 + ctx->total_bloom_filter_size; > + num_chunks++; > + } Better wrap those long lines, like above: + if (ctx->bloom) { + chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] + + sizeof(uint32_t) * ctx->commits.nr; + num_chunks++; + + chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] + + sizeof(uint32_t) * 3 + ctx->total_bloom_filter_size; + num_chunks++; + } > if (ctx->num_commit_graphs_after > 1) { > chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] + > hashsz * (ctx->num_commit_graphs_after - 1); > @@ -1487,6 +1562,10 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx) > write_graph_chunk_data(f, hashsz, ctx); > if (ctx->num_extra_edges) > write_graph_chunk_extra_edges(f, ctx); > + if (ctx->bloom) { > + write_graph_chunk_bloom_indexes(f, ctx); > + write_graph_chunk_bloom_data(f, ctx, &bloom_settings); > + } All right. > if (ctx->num_commit_graphs_after > 1 && > write_graph_chunk_base(f, ctx)) { > return -1; > diff --git a/commit-graph.h b/commit-graph.h > index 952a4b83be..2202ad91ae 100644 > --- a/commit-graph.h > +++ b/commit-graph.h > @@ -10,6 +10,7 @@ > #define GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD "GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD" > > struct commit; > +struct bloom_filter_settings; > > char *get_commit_graph_filename(const char *obj_dir); > int open_commit_graph(const char *graph_file, int *fd, struct stat *st); > @@ -58,6 +59,10 @@ struct commit_graph { > const unsigned char *chunk_commit_data; > const unsigned char *chunk_extra_edges; > const unsigned char *chunk_base_graphs; > + const unsigned char *chunk_bloom_indexes; > + const unsigned char *chunk_bloom_data; > + > + struct bloom_filter_settings *settings; Should this be part of `struct commit_graph`? Shouldn't we free() this data, or is it a pointer into xmmap-ped file... no it isn't -- we xalloc() it, so we should free() it. I think it should be done in 'cleanup:' section of write_commit_graph(), but I am not entirely sure. > }; > > struct commit_graph *load_commit_graph_one_fd_st(int fd, struct stat *st); Best, -- Jakub Narębski