Some commits modify the same set of paths, and, consequently, have identical modified path Bloom filters. Use a hashmap to find these identical Bloom filters, and omit all duplicates from the Modified Path Bloom Filters chunk, reducing its size. MPBF chunk size without with Time spent deduplication deduping -------------------------------------------------------- android-base 8620198 2618764 -69.6% 0.089s cmssw 10347018 9094468 -12.1% 0.056s cpython 526381 371629 -29.4% 0.013s elasticsearch 4733274 3607106 -23.8% 0.029s gcc 3724544 3394521 -8.9% 0.072s gecko-dev 20591010 17042732 -17.2% 0.235s git 160718 139546 -13.2% 0.004s glibc 759132 699623 -7.8% 0.009s go 458375 421394 -8.1% 0.008s jdk 13213208 12158533 -8.0% 0.034s linux 26575278 25029700 -5.8% 0.169s llvm-project 3988133 3269438 -18.0% 0.085s rails 958691 614528 -35.9% 0.015s rust 1985782 1682919 -15.3% 0.023s tensorflow 4173570 3789768 -9.2% 0.022s webkit 5891751 5394507 -8.4% 0.096s Signed-off-by: SZEDER Gábor <szeder.dev@xxxxxxxxx> --- commit-graph.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 71 insertions(+), 5 deletions(-) diff --git a/commit-graph.c b/commit-graph.c index 56b8f33d10..178bbf0113 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -57,6 +57,7 @@ struct modified_path_bloom_filter_info { struct bloom_filter filter; + struct modified_path_bloom_filter_info *duplicate_of; /* * The offset relative to the start of the Modified Path Bloom * Filters chunk where this Bloom filter has been written, @@ -1099,6 +1100,9 @@ struct write_commit_graph_context { /* Excluding embedded modified path Bloom filters */ uint64_t total_filter_size; + + /* Used to find identical modified path Bloom filters */ + struct hashmap dedup_hashmap; } mpbfctx; }; @@ -1315,7 +1319,11 @@ static int write_graph_chunk_modified_path_bloom_filters(struct hashfile *f, bfi = modified_path_bloom_filters_peek( &modified_path_bloom_filters, commit); - if (!bfi || !bfi->filter.nr_bits) + if (!bfi) + continue; + if (bfi->duplicate_of) + continue; + if (!bfi->filter.nr_bits) continue; if (bfi->filter.nr_bits == GRAPH_MODIFIED_PATH_BLOOM_FILTER_EMBEDDED_NR_BITS) continue; @@ -1364,6 +1372,9 @@ static int write_graph_chunk_modified_path_bloom_index(struct hashfile *f, */ filterdata[0] |= 1 << 7; hashwrite(f, filterdata, sizeof(filterdata)); + } else if (bfi->duplicate_of) { + uint64_t offset = htonll(bfi->duplicate_of->offset); + hashwrite(f, &offset, sizeof(offset)); } else if (bfi->offset != -1) { uint64_t offset = htonll(bfi->offset); hashwrite(f, &offset, sizeof(offset)); @@ -1515,6 +1526,53 @@ static void file_change_cb(struct diff_options *options, handle_modified_file(options->change_fn_data, fullpath); } +struct modified_path_bloom_filter_dedup_entry { + struct hashmap_entry entry; + struct modified_path_bloom_filter_info *bfi; +}; + +static int modified_path_bloom_filter_dedup_cmp(const void *unused_cmp_data, + const struct hashmap_entry *he1, + const struct hashmap_entry *he2, + const void *unused_keydata) +{ + const struct modified_path_bloom_filter_dedup_entry *a, *b; + + a = container_of(he1, const struct modified_path_bloom_filter_dedup_entry, entry); + b = container_of(he2, const struct modified_path_bloom_filter_dedup_entry, entry); + + if (a->bfi->filter.nr_bits != b->bfi->filter.nr_bits) + return 1; + + return memcmp(a->bfi->filter.bits, b->bfi->filter.bits, + bloom_filter_bytes(&a->bfi->filter)); +} + +static int handle_duplicate_modified_path_bloom_filter( + struct modified_path_bloom_filter_context *mpbfctx, + struct modified_path_bloom_filter_info *bfi) +{ + struct modified_path_bloom_filter_dedup_entry *e, stack_entry; + unsigned int h; + + h = memhash(bfi->filter.bits, bloom_filter_bytes(&bfi->filter)); + hashmap_entry_init(&stack_entry.entry, h); + stack_entry.bfi = bfi; + + e = hashmap_get_entry(&mpbfctx->dedup_hashmap, &stack_entry, entry, + NULL); + if (e) { + bfi->duplicate_of = e->bfi; + return 1; + } else { + e = xmalloc(sizeof(*e)); + hashmap_entry_init(&e->entry, h); + e->bfi = bfi; + hashmap_add(&mpbfctx->dedup_hashmap, &e->entry); + return 0; + } +} + static void create_modified_path_bloom_filter( struct modified_path_bloom_filter_context *mpbfctx, struct commit *commit) @@ -1547,17 +1605,20 @@ static void create_modified_path_bloom_filter( bfi = modified_path_bloom_filters_at(&modified_path_bloom_filters, commit); bfi->offset = -1; - if (path_component_count > mpbfctx->embedded_limit) { + if (path_component_count > mpbfctx->embedded_limit) bloom_filter_init(&bfi->filter, mpbfctx->num_hashes, path_component_count); - mpbfctx->total_filter_size += sizeof(uint32_t) + - bloom_filter_bytes(&bfi->filter); - } else + else bloom_filter_init_with_size(&bfi->filter, GRAPH_MODIFIED_PATH_BLOOM_FILTER_EMBEDDED_NR_BITS); bloom_filter_set_bits(&bfi->filter, mpbfctx->hashes, mpbfctx->hashes_nr); + + if (path_component_count > mpbfctx->embedded_limit && + !handle_duplicate_modified_path_bloom_filter(mpbfctx, bfi)) + mpbfctx->total_filter_size += sizeof(uint32_t) + + bloom_filter_bytes(&bfi->filter); } static void add_missing_parents(struct write_commit_graph_context *ctx, struct commit *commit) @@ -2396,6 +2457,8 @@ int write_commit_graph(const char *obj_dir, diff_setup_done(&ctx->mpbfctx.diffopt); strbuf_init(&ctx->mpbfctx.prev_path, PATH_MAX); + hashmap_init(&ctx->mpbfctx.dedup_hashmap, + modified_path_bloom_filter_dedup_cmp, NULL, 0); } if (ctx->split) { @@ -2525,6 +2588,9 @@ int write_commit_graph(const char *obj_dir, strbuf_release(&ctx->mpbfctx.prev_path); free(ctx->mpbfctx.hashed_prefix_lengths); free(ctx->mpbfctx.hashes); + hashmap_free_entries(&ctx->mpbfctx.dedup_hashmap, + struct modified_path_bloom_filter_dedup_entry, + entry); } free(ctx); -- 2.27.0.rc1.431.g5c813f95dc