Re: [PATCH 5/9] commit-graph: write changed path bloom filters to commit-graph file.

Jakub Narebski <jnareb@xxxxxxxxx> · Tue, 07 Jan 2020 17:01:50 +0100

"Garima Singh via GitGitGadget" <gitgitgadget@xxxxxxxxx> writes:

> From: Garima Singh <garima.singh@xxxxxxxxxxxxx>
>
> Write bloom filters to the commit-graph using the format described in
> Documentation/technical/commit-graph-format.txt
>
> Helped-by: Derrick Stolee <dstolee@xxxxxxxxxxxxx>
> Signed-off-by: Garima Singh <garima.singh@xxxxxxxxxxxxx>

Looks good to me.

> ---
>  commit-graph.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++-
>  commit-graph.h |  5 ++++
>  2 files changed, 85 insertions(+), 1 deletion(-)
>
> diff --git a/commit-graph.c b/commit-graph.c
> index 8c4941eeaa..def2ade166 100644
> --- a/commit-graph.c
> +++ b/commit-graph.c
> @@ -24,7 +24,9 @@
>  #define GRAPH_CHUNKID_DATA 0x43444154 /* "CDAT" */
>  #define GRAPH_CHUNKID_EXTRAEDGES 0x45444745 /* "EDGE" */
>  #define GRAPH_CHUNKID_BASE 0x42415345 /* "BASE" */
> -#define MAX_NUM_CHUNKS 5
> +#define GRAPH_CHUNKID_BLOOMINDEXES 0x42494458 /* "BIDX" */
> +#define GRAPH_CHUNKID_BLOOMDATA 0x42444154 /* "BDAT" */
> +#define MAX_NUM_CHUNKS 7

Very minor nitpick: shouldn't we follow the order in the
commit-graph-format.txt document (i.e. "BASE" as last chunk and last
preprocessor constant)?

>  
>  #define GRAPH_DATA_WIDTH (the_hash_algo->rawsz + 16)
>  
> @@ -282,6 +284,32 @@ struct commit_graph *parse_commit_graph(void *graph_map, int fd,
>  				chunk_repeated = 1;
>  			else
>  				graph->chunk_base_graphs = data + chunk_offset;
> +			break;
> +
> +		case GRAPH_CHUNKID_BLOOMINDEXES:
> +			if (graph->chunk_bloom_indexes)
> +				chunk_repeated = 1;
> +			else
> +				graph->chunk_bloom_indexes = data + chunk_offset;
> +			break;

All right.

> +
> +		case GRAPH_CHUNKID_BLOOMDATA:
> +			if (graph->chunk_bloom_data)
> +				chunk_repeated = 1;
> +			else {
> +				uint32_t hash_version;
> +				graph->chunk_bloom_data = data + chunk_offset;
> +				hash_version = get_be32(data + chunk_offset);

All right, now I see why all those header values for BDAT chunk are
defined to be 32-bit integers.  For code simplicity.

> +
> +				if (hash_version != 1)
> +					break;

What does it mean for Git?  Behave as if there were no Bloom filter
data?

> +
> +				graph->settings = xmalloc(sizeof(struct bloom_filter_settings));
> +				graph->settings->hash_version = hash_version;
> +				graph->settings->num_hashes = get_be32(data + chunk_offset + 4);
> +				graph->settings->bits_per_entry = get_be32(data + chunk_offset + 8);

All right, looks O.K.

> +			}
> +			break;
>  		}
>  
>  		if (chunk_repeated) {
> @@ -996,6 +1024,39 @@ static void write_graph_chunk_extra_edges(struct hashfile *f,
>  	}
>  }
>  
> +static void write_graph_chunk_bloom_indexes(struct hashfile *f,
> +					    struct write_commit_graph_context *ctx)
> +{
> +	struct commit **list = ctx->commits.list;
> +	struct commit **last = ctx->commits.list + ctx->commits.nr;
> +	uint32_t cur_pos = 0;
> +
> +	while (list < last) {
> +		struct bloom_filter *filter = get_bloom_filter(ctx->r, *list);
> +		cur_pos += filter->len;
> +		hashwrite_be32(f, cur_pos);
> +		list++;
> +	}

Why not follow the write_graph_chunk_oids() example, instead of
write_graph_chunk_data(), that is use simply:

  +	struct commit **list = ctx->commits.list;
  +	uint32_t cur_pos = 0;
  +
  +	for (count = 0; count < ctx->commits.nr; count++, list++) {
  +		struct bloom_filter *filter = get_bloom_filter(ctx->r, *list);
  +		cur_pos += filter->len;
  +		hashwrite_be32(f, cur_pos);
  +	}

I guess using here

  +		cur_pos += get_bloom_filter(ctx->r, *list)->len;

would be too cryptic, and hard to debug?

Also, wouldn't we need

  +		display_progress(ctx->progress, ++ctx->progress_cnt);

before hashwrite_be32()?

> +}
> +
> +static void write_graph_chunk_bloom_data(struct hashfile *f,
> +					 struct write_commit_graph_context *ctx,
> +					 struct bloom_filter_settings *settings)
> +{
> +	struct commit **first = ctx->commits.list;

Even if we decide to use `while` loop, like write_graph_chunk_data(),
and not `for` loop, like write_graph_chunk_oids(), why the change from
`struct commit **list = ...` to `struct commit **first = ...`?

> +	struct commit **last = ctx->commits.list + ctx->commits.nr;
> +
> +	hashwrite_be32(f, settings->hash_version);
> +	hashwrite_be32(f, settings->num_hashes);
> +	hashwrite_be32(f, settings->bits_per_entry);

All right, simple.

> +
> +	while (first < last) {
> +		struct bloom_filter *filter = get_bloom_filter(ctx->r, *first);

Hmmm... wouldn't this compute Bloom filter second time?
get_bloom_filter() does work unconditionally.

Wouldn't

  +		struct bloom_filter *filter = bloom_filter_slab_at(&bloom_filters, *first);

be enough?

Or make get_bloom_filter() use *_peek() to check if Bloom filter for
given commit was already computed, and only if it returns NULL do the
work.

> +		hashwrite(f, filter->data, filter->len * sizeof(uint64_t));

Might need display_progress() before hashwrote().

> +		first++;
> +	}
> +}
> +
>  static int oid_compare(const void *_a, const void *_b)
>  {
>  	const struct object_id *a = (const struct object_id *)_a;
> @@ -1388,6 +1449,7 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
>  	struct strbuf progress_title = STRBUF_INIT;
>  	int num_chunks = 3;
>  	struct object_id file_hash;
> +	struct bloom_filter_settings bloom_settings = DEFAULT_BLOOM_FILTER_SETTINGS;
>  
>  	if (ctx->split) {
>  		struct strbuf tmp_file = STRBUF_INIT;
> @@ -1432,6 +1494,12 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
>  		chunk_ids[num_chunks] = GRAPH_CHUNKID_EXTRAEDGES;
>  		num_chunks++;
>  	}
> +	if (ctx->bloom) {
> +		chunk_ids[num_chunks] = GRAPH_CHUNKID_BLOOMINDEXES;
> +		num_chunks++;
> +		chunk_ids[num_chunks] = GRAPH_CHUNKID_BLOOMDATA;
> +		num_chunks++;
> +	}

Looks all right.

>  	if (ctx->num_commit_graphs_after > 1) {
>  		chunk_ids[num_chunks] = GRAPH_CHUNKID_BASE;
>  		num_chunks++;
> @@ -1450,6 +1518,13 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
>  						4 * ctx->num_extra_edges;
>  		num_chunks++;
>  	}
> +	if (ctx->bloom) {
> +		chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] + sizeof(uint32_t) * ctx->commits.nr;
> +		num_chunks++;
> +
> +		chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] + sizeof(uint32_t) * 3 + ctx->total_bloom_filter_size;
> +		num_chunks++;
> +	}

Better wrap those long lines, like above:

  +	if (ctx->bloom) {
  +		chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] +
  +						sizeof(uint32_t) * ctx->commits.nr;
  +		num_chunks++;
  +
  +		chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] +
  +						sizeof(uint32_t) * 3 + ctx->total_bloom_filter_size;
  +		num_chunks++;
  +	}

>  	if (ctx->num_commit_graphs_after > 1) {
>  		chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] +
>  						hashsz * (ctx->num_commit_graphs_after - 1);
> @@ -1487,6 +1562,10 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
>  	write_graph_chunk_data(f, hashsz, ctx);
>  	if (ctx->num_extra_edges)
>  		write_graph_chunk_extra_edges(f, ctx);
> +	if (ctx->bloom) {
> +		write_graph_chunk_bloom_indexes(f, ctx);
> +		write_graph_chunk_bloom_data(f, ctx, &bloom_settings);
> +	}

All right.

>  	if (ctx->num_commit_graphs_after > 1 &&
>  	    write_graph_chunk_base(f, ctx)) {
>  		return -1;
> diff --git a/commit-graph.h b/commit-graph.h
> index 952a4b83be..2202ad91ae 100644
> --- a/commit-graph.h
> +++ b/commit-graph.h
> @@ -10,6 +10,7 @@
>  #define GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD "GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD"
>  
>  struct commit;
> +struct bloom_filter_settings;
>  
>  char *get_commit_graph_filename(const char *obj_dir);
>  int open_commit_graph(const char *graph_file, int *fd, struct stat *st);
> @@ -58,6 +59,10 @@ struct commit_graph {
>  	const unsigned char *chunk_commit_data;
>  	const unsigned char *chunk_extra_edges;
>  	const unsigned char *chunk_base_graphs;
> +	const unsigned char *chunk_bloom_indexes;
> +	const unsigned char *chunk_bloom_data;
> +
> +	struct bloom_filter_settings *settings;

Should this be part of `struct commit_graph`?  Shouldn't we free() this
data, or is it a pointer into xmmap-ped file... no it isn't -- we
xalloc() it, so we should free() it.

I think it should be done in 'cleanup:' section of write_commit_graph(),
but I am not entirely sure.

>  };
>  
>  struct commit_graph *load_commit_graph_one_fd_st(int fd, struct stat *st);

Best,
-- 
Jakub Narębski