On Thu, Dec 03, 2020 at 04:16:49PM +0000, Derrick Stolee via GitGitGadget wrote: > From: Derrick Stolee <dstolee@xxxxxxxxxxxxx> > > The chunk-format API allows automatically writing the table of contents > for a chunk-based file format when using an array of "struct > chunk_info"s. Update write_midx_internal() to use this strategy, which > also simplifies the chunk writing loop. This loop will be replaced with > a chunk-format API call in an upcoming change. > > Signed-off-by: Derrick Stolee <dstolee@xxxxxxxxxxxxx> > --- > midx.c | 96 +++++++++++++--------------------------------------------- > 1 file changed, 21 insertions(+), 75 deletions(-) > > diff --git a/midx.c b/midx.c > index ce6d4339bd..0548266bea 100644 > --- a/midx.c > +++ b/midx.c > @@ -11,6 +11,7 @@ > #include "trace2.h" > #include "run-command.h" > #include "repository.h" > +#include "chunk-format.h" > > #define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */ > #define MIDX_VERSION 1 > @@ -799,15 +800,14 @@ static int write_midx_large_offsets(struct hashfile *f, > static int write_midx_internal(const char *object_dir, struct multi_pack_index *m, > struct string_list *packs_to_drop, unsigned flags) > { > - unsigned char cur_chunk, num_chunks = 0; > + unsigned char num_chunks = 0; > char *midx_name; > uint32_t i; > struct hashfile *f = NULL; > struct lock_file lk; > struct write_midx_context ctx = { 0 }; > uint64_t header_size = 0; > - uint32_t chunk_ids[MIDX_MAX_CHUNKS + 1]; > - uint64_t chunk_offsets[MIDX_MAX_CHUNKS + 1]; > + struct chunk_info chunks[MIDX_MAX_CHUNKS]; > int pack_name_concat_len = 0; > int dropped_packs = 0; > int result = 0; > @@ -923,7 +923,6 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * > if (ctx.m) > close_midx(ctx.m); > > - cur_chunk = 0; > num_chunks = ctx.large_offsets_needed ? 5 : 4; > > if (ctx.nr - dropped_packs == 0) { > @@ -934,85 +933,32 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * > > header_size = write_midx_header(f, num_chunks, ctx.nr - dropped_packs); > > - chunk_ids[cur_chunk] = MIDX_CHUNKID_PACKNAMES; > - chunk_offsets[cur_chunk] = header_size + (num_chunks + 1) * MIDX_CHUNKLOOKUP_WIDTH; > + chunks[0].id = MIDX_CHUNKID_PACKNAMES; > + chunks[0].size = pack_name_concat_len; > + chunks[0].write_fn = write_midx_pack_names; [...] Hmm. The caller has to do quite a lot of work in order to feed chunks into the new chunk-format code. I wonder what it would look like if we introduced a new 'struct chunkfile' type. It would know about the underlying hashfile that it's writing to, as well as the chunks it's supposed to write. It would likely support three operations: add a new chunk, write the TOC, and write the contents (the latter two could probably be combined into one). Below is a patch to do just that. It seems to work OK on t5319, but fails some tests in t5318. I haven't looked into the commit-graph test failures, but I'd be happy to do so if you think this is a direction worth going in. Before I show the patch, though, let's take a look at the stat: chunk-format.c | 61 ++++++++++++++++++++++++++++++++++++++---------- chunk-format.h | 25 ++++++++++++-------- commit-graph.c | 90 ++++++++++++++++++++++------------------------------------------------- midx.c | 38 +++++++++++++----------------- 4 files changed, 107 insertions(+), 107 deletions(-) That's a little bit more code in the chunk-format code, which I can live with, but the diff in commit-graph.c and midx.c is a net-negative, as I would expect it to be. So, I'm happy that this seems to make things easier on the callers. OK, enough rambling. Here's the patch: diff --git a/chunk-format.c b/chunk-format.c index 771b6d98d0..14f2fe5c9a 100644 --- a/chunk-format.c +++ b/chunk-format.c @@ -1,26 +1,63 @@ #include "git-compat-util.h" #include "chunk-format.h" #include "csum-file.h" +#include "cache.h" #define CHUNK_LOOKUP_WIDTH 12 -void write_table_of_contents(struct hashfile *f, - uint64_t cur_offset, - struct chunk_info *chunks, - int nr) +void chunkfile_init(struct chunkfile *out, struct hashfile *f) { - int i; + if (!out) + BUG("cannot initialize null chunkfile"); + + memset(out, 0, sizeof(*out)); + out->f = f; +} + +void chunkfile_push_chunk(struct chunkfile *c, + uint32_t id, uint64_t size, chunk_write_fn write_fn) +{ + ALLOC_GROW(c->chunks, c->chunks_nr + 1, c->chunks_alloc); + c->chunks[c->chunks_nr].id = id; + c->chunks[c->chunks_nr].size = size; + c->chunks[c->chunks_nr].write_fn = write_fn; + c->chunks_nr++; +} + +void chunkfile_write_toc(struct chunkfile *c) +{ + size_t i; + off_t cur_offset = hashfile_total(c->f); /* Add the table of contents to the current offset */ - cur_offset += (nr + 1) * CHUNK_LOOKUP_WIDTH; + cur_offset += (c->chunks_nr + 1) * CHUNK_LOOKUP_WIDTH; - for (i = 0; i < nr; i++) { - hashwrite_be32(f, chunks[i].id); - hashwrite_be64(f, cur_offset); + for (i = 0; i < c->chunks_nr; i++) { + hashwrite_be32(c->f, c->chunks[i].id); + hashwrite_be64(c->f, cur_offset); - cur_offset += chunks[i].size; + cur_offset += c->chunks[i].size; } /* Trailing entry marks the end of the chunks */ - hashwrite_be32(f, 0); - hashwrite_be64(f, cur_offset); + hashwrite_be32(c->f, 0); + hashwrite_be64(c->f, cur_offset); +} + +void chunkfile_write_chunks(struct chunkfile *c, void *ctx) +{ + size_t i; + + for (i = 0; i < c->chunks_nr; i++) { + off_t before, after; + struct chunk_info *chunk = &c->chunks[i]; + if (!chunk->write_fn) + continue; + + before = hashfile_total(c->f); + chunk->write_fn(c->f, ctx); + after = hashfile_total(c->f); + + if (after - before != chunk->size) + BUG("unexpected chunk size"); + } } diff --git a/chunk-format.h b/chunk-format.h index 4b9cbeb372..b0fb515425 100644 --- a/chunk-format.h +++ b/chunk-format.h @@ -19,18 +19,23 @@ struct chunk_info { chunk_write_fn write_fn; }; +struct chunkfile { + struct hashfile *f; + + struct chunk_info *chunks; + size_t chunks_nr; + size_t chunks_alloc; +}; + +void chunkfile_init(struct chunkfile *out, struct hashfile *f); +void chunkfile_push_chunk(struct chunkfile *c, + uint32_t id, uint64_t size, chunk_write_fn write_fn); + /* * Write the chunk data into the supplied hashfile. - * - * * 'cur_offset' indicates the number of bytes written to the hashfile - * before the table of contents starts. - * - * * 'nr' is the number of chunks with non-zero IDs, so 'nr + 1' - * chunks are written in total. */ -void write_table_of_contents(struct hashfile *f, - uint64_t cur_offset, - struct chunk_info *chunks, - int nr); +void chunkfile_write_toc(struct chunkfile *c); + +void chunkfile_write_chunks(struct chunkfile *c, void *ctx); #endif diff --git a/commit-graph.c b/commit-graph.c index 5494fda1d3..abc0c9e46f 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -1702,11 +1702,9 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx) uint32_t i; int fd; struct hashfile *f; + struct chunkfile cf; struct lock_file lk = LOCK_INIT; - struct chunk_info chunks[MAX_NUM_CHUNKS + 1]; const unsigned hashsz = the_hash_algo->rawsz; - struct strbuf progress_title = STRBUF_INIT; - int num_chunks = 3; struct object_id file_hash; if (ctx->split) { @@ -1753,76 +1751,42 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx) f = hashfd(lk.tempfile->fd, lk.tempfile->filename.buf); } - chunks[0].id = GRAPH_CHUNKID_OIDFANOUT; - chunks[0].size = GRAPH_FANOUT_SIZE; - chunks[0].write_fn = write_graph_chunk_fanout; - chunks[1].id = GRAPH_CHUNKID_OIDLOOKUP; - chunks[1].size = hashsz * ctx->commits.nr; - chunks[1].write_fn = write_graph_chunk_oids; - chunks[2].id = GRAPH_CHUNKID_DATA; - chunks[2].size = (hashsz + 16) * ctx->commits.nr; - chunks[2].write_fn = write_graph_chunk_data; - if (ctx->num_extra_edges) { - chunks[num_chunks].id = GRAPH_CHUNKID_EXTRAEDGES; - chunks[num_chunks].size = 4 * ctx->num_extra_edges; - chunks[num_chunks].write_fn = write_graph_chunk_extra_edges; - num_chunks++; - } + chunkfile_init(&cf, f); + + chunkfile_push_chunk(&cf, GRAPH_CHUNKID_OIDFANOUT, + GRAPH_FANOUT_SIZE, write_graph_chunk_fanout); + chunkfile_push_chunk(&cf, GRAPH_CHUNKID_OIDLOOKUP, + hashsz * ctx->commits.nr, write_graph_chunk_oids); + chunkfile_push_chunk(&cf, GRAPH_CHUNKID_DATA, + (hashsz + 16) * ctx->commits.nr, write_graph_chunk_data); + if (ctx->num_extra_edges) + chunkfile_push_chunk(&cf, GRAPH_CHUNKID_EXTRAEDGES, + 4 * ctx->num_extra_edges, + write_graph_chunk_extra_edges); if (ctx->changed_paths) { - chunks[num_chunks].id = GRAPH_CHUNKID_BLOOMINDEXES; - chunks[num_chunks].size = sizeof(uint32_t) * ctx->commits.nr; - chunks[num_chunks].write_fn = write_graph_chunk_bloom_indexes; - num_chunks++; - chunks[num_chunks].id = GRAPH_CHUNKID_BLOOMDATA; - chunks[num_chunks].size = sizeof(uint32_t) * 3 - + ctx->total_bloom_filter_data_size; - chunks[num_chunks].write_fn = write_graph_chunk_bloom_data; - num_chunks++; - } - if (ctx->num_commit_graphs_after > 1) { - chunks[num_chunks].id = GRAPH_CHUNKID_BASE; - chunks[num_chunks].size = hashsz * (ctx->num_commit_graphs_after - 1); - chunks[num_chunks].write_fn = write_graph_chunk_base; - num_chunks++; + chunkfile_push_chunk(&cf, GRAPH_CHUNKID_BLOOMINDEXES, + sizeof(uint32_t) * ctx->commits.nr, + write_graph_chunk_bloom_indexes); + chunkfile_push_chunk(&cf, GRAPH_CHUNKID_BLOOMDATA, + sizeof(uint32_t) * 3 + ctx->total_bloom_filter_data_size, + write_graph_chunk_bloom_data); } + if (ctx->num_commit_graphs_after > 1) + chunkfile_push_chunk(&cf, GRAPH_CHUNKID_BASE, + hashsz * (ctx->num_commit_graphs_after - 1), + write_graph_chunk_base); - chunks[num_chunks].id = 0; - chunks[num_chunks].size = 0; + chunkfile_push_chunk(&cf, 0, 0, NULL); hashwrite_be32(f, GRAPH_SIGNATURE); hashwrite_u8(f, GRAPH_VERSION); hashwrite_u8(f, oid_version()); - hashwrite_u8(f, num_chunks); + hashwrite_u8(f, cf.chunks_nr - 1); hashwrite_u8(f, ctx->num_commit_graphs_after - 1); - write_table_of_contents(f, /* cur_offset */ 8, chunks, num_chunks); - - if (ctx->report_progress) { - strbuf_addf(&progress_title, - Q_("Writing out commit graph in %d pass", - "Writing out commit graph in %d passes", - num_chunks), - num_chunks); - ctx->progress = start_delayed_progress( - progress_title.buf, - num_chunks * ctx->commits.nr); - } - - for (i = 0; i < num_chunks; i++) { - uint64_t start_offset = f->total + f->offset; - - if (chunks[i].write_fn(f, ctx)) - return -1; - - if (f->total + f->offset != start_offset + chunks[i].size) - BUG("expected to write %"PRId64" bytes to chunk %"PRIx32", but wrote %"PRId64" instead", - chunks[i].size, chunks[i].id, - f->total + f->offset - start_offset); - } - - stop_progress(&ctx->progress); - strbuf_release(&progress_title); + chunkfile_write_toc(&cf); + chunkfile_write_chunks(&cf, ctx); if (ctx->split && ctx->base_graph_name && ctx->num_commit_graphs_after > 1) { char *new_base_hash = xstrdup(oid_to_hex(&ctx->new_base_graph->oid)); diff --git a/midx.c b/midx.c index 0548266bea..6315ea7555 100644 --- a/midx.c +++ b/midx.c @@ -807,7 +807,7 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * struct lock_file lk; struct write_midx_context ctx = { 0 }; uint64_t header_size = 0; - struct chunk_info chunks[MIDX_MAX_CHUNKS]; + struct chunkfile cf; int pack_name_concat_len = 0; int dropped_packs = 0; int result = 0; @@ -933,33 +933,27 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * header_size = write_midx_header(f, num_chunks, ctx.nr - dropped_packs); - chunks[0].id = MIDX_CHUNKID_PACKNAMES; - chunks[0].size = pack_name_concat_len; - chunks[0].write_fn = write_midx_pack_names; + chunkfile_init(&cf, f); - chunks[1].id = MIDX_CHUNKID_OIDFANOUT; - chunks[1].size = MIDX_CHUNK_FANOUT_SIZE; - chunks[1].write_fn = write_midx_oid_fanout; - - chunks[2].id = MIDX_CHUNKID_OIDLOOKUP; - chunks[2].size = ctx.entries_nr * the_hash_algo->rawsz; - chunks[2].write_fn = write_midx_oid_lookup; - - chunks[3].id = MIDX_CHUNKID_OBJECTOFFSETS; - chunks[3].size = ctx.entries_nr * MIDX_CHUNK_OFFSET_WIDTH; - chunks[3].write_fn = write_midx_object_offsets; + chunkfile_push_chunk(&cf, MIDX_CHUNKID_PACKNAMES, + pack_name_concat_len, write_midx_pack_names); + chunkfile_push_chunk(&cf, MIDX_CHUNKID_OIDFANOUT, + MIDX_CHUNK_FANOUT_SIZE, write_midx_oid_fanout); + chunkfile_push_chunk(&cf, MIDX_CHUNKID_OIDLOOKUP, + ctx.entries_nr * the_hash_algo->rawsz, write_midx_oid_lookup); + chunkfile_push_chunk(&cf, MIDX_CHUNKID_OBJECTOFFSETS, + ctx.entries_nr * MIDX_CHUNK_OFFSET_WIDTH, write_midx_object_offsets); if (ctx.large_offsets_needed) { - chunks[4].id = MIDX_CHUNKID_LARGEOFFSETS; - chunks[4].size = ctx.num_large_offsets * MIDX_CHUNK_LARGE_OFFSET_WIDTH; - chunks[4].write_fn = write_midx_large_offsets; + chunkfile_push_chunk(&cf, MIDX_CHUNKID_LARGEOFFSETS, + ctx.num_large_offsets * MIDX_CHUNK_LARGE_OFFSET_WIDTH, + write_midx_large_offsets); } - write_table_of_contents(f, header_size, chunks, num_chunks); - - for (i = 0; i < num_chunks; i++) - chunks[i].write_fn(f, &ctx); + chunkfile_write_toc(&cf); + chunkfile_write_chunks(&cf, &ctx); + /* maybe move this into chunkfile??? */ finalize_hashfile(f, NULL, CSUM_FSYNC | CSUM_HASH_IN_STREAM); commit_lock_file(&lk);