On Mon, Oct 23, 2023 at 06:44:49PM -0400, Taylor Blau wrote: > (Rebased onto the current tip of 'master', which is ceadf0f3cf (The > twentieth batch, 2023-10-20)). > > This series implements support for a new merge-tree option, > `--write-pack`, which causes any newly-written objects to be packed > together instead of being stored individually as loose. > > This is a minor follow-up that could be taken instead of v4 (though the > changes between these two most recent rounds are stylistic and a matter > of subjective opinion). > > This moves the bulk_checkin_source structure introduced in response > to Junio's suggestion during the last round further in the OOP > direction. Instead of switching on the enum type of the source, have > function pointers for read() and seek() respectively. > > The functionality at the end is the same, but this avoids some of the > namespacing issues that Peff pointed out while looking at v4. But I > think that this approach ended up being less heavy-weight than I had > originally imagined, so I think that this version is a worthwhile > improvement over v4. > > Beyond that, the changes since last time can be viewed in the range-diff > below. Thanks in advance for any review! Overall this version looks good to me. I've only got two smallish nits and one question regarding the tests. Thanks! 
Patrick > [1]: https://lore.kernel.org/git/xmqq34y7plj4.fsf@gitster.g/ > > Taylor Blau (5): > bulk-checkin: extract abstract `bulk_checkin_source` > bulk-checkin: generify `stream_blob_to_pack()` for arbitrary types > bulk-checkin: introduce `index_blob_bulk_checkin_incore()` > bulk-checkin: introduce `index_tree_bulk_checkin_incore()` > builtin/merge-tree.c: implement support for `--write-pack` > > Documentation/git-merge-tree.txt | 4 + > builtin/merge-tree.c | 5 + > bulk-checkin.c | 197 +++++++++++++++++++++++++++---- > bulk-checkin.h | 8 ++ > merge-ort.c | 42 +++++-- > merge-recursive.h | 1 + > t/t4301-merge-tree-write-tree.sh | 93 +++++++++++++++ > 7 files changed, 316 insertions(+), 34 deletions(-) > > Range-diff against v4: > 1: 97bb6e9f59 ! 1: 696aa027e4 bulk-checkin: extract abstract `bulk_checkin_source` > @@ bulk-checkin.c: static int already_written(struct bulk_checkin_packfile *state, > } > > +struct bulk_checkin_source { > -+ enum { SOURCE_FILE } type; > ++ off_t (*read)(struct bulk_checkin_source *, void *, size_t); > ++ off_t (*seek)(struct bulk_checkin_source *, off_t); > + > -+ /* SOURCE_FILE fields */ > -+ int fd; > ++ union { > ++ struct { > ++ int fd; > ++ } from_fd; > ++ } data; > + > -+ /* common fields */ > + size_t size; > + const char *path; > +}; > + > -+static off_t bulk_checkin_source_seek_to(struct bulk_checkin_source *source, > -+ off_t offset) > ++static off_t bulk_checkin_source_read_from_fd(struct bulk_checkin_source *source, > ++ void *buf, size_t nr) > +{ > -+ switch (source->type) { > -+ case SOURCE_FILE: > -+ return lseek(source->fd, offset, SEEK_SET); > -+ default: > -+ BUG("unknown bulk-checkin source: %d", source->type); > -+ } > ++ return read_in_full(source->data.from_fd.fd, buf, nr); > +} > + > -+static ssize_t bulk_checkin_source_read(struct bulk_checkin_source *source, > -+ void *buf, size_t nr) > ++static off_t bulk_checkin_source_seek_from_fd(struct bulk_checkin_source *source, > ++ off_t offset) > +{ > -+ switch 
(source->type) { > -+ case SOURCE_FILE: > -+ return read_in_full(source->fd, buf, nr); > -+ default: > -+ BUG("unknown bulk-checkin source: %d", source->type); > -+ } > ++ return lseek(source->data.from_fd.fd, offset, SEEK_SET); > ++} > ++ > ++static void init_bulk_checkin_source_from_fd(struct bulk_checkin_source *source, > ++ int fd, size_t size, > ++ const char *path) > ++{ > ++ memset(source, 0, sizeof(struct bulk_checkin_source)); > ++ > ++ source->read = bulk_checkin_source_read_from_fd; > ++ source->seek = bulk_checkin_source_seek_from_fd; > ++ > ++ source->data.from_fd.fd = fd; > ++ > ++ source->size = size; > ++ source->path = path; > +} > + > /* > @@ bulk-checkin.c: static int stream_blob_to_pack(struct bulk_checkin_packfile *sta > - ssize_t read_result = read_in_full(fd, ibuf, rsize); > + ssize_t read_result; > + > -+ read_result = bulk_checkin_source_read(source, ibuf, > -+ rsize); > ++ read_result = source->read(source, ibuf, rsize); > if (read_result < 0) > - die_errno("failed to read from '%s'", path); > + die_errno("failed to read from '%s'", > @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *st > unsigned header_len; > struct hashfile_checkpoint checkpoint = {0}; > struct pack_idx_entry *idx = NULL; > -+ struct bulk_checkin_source source = { > -+ .type = SOURCE_FILE, > -+ .fd = fd, > -+ .size = size, > -+ .path = path, > -+ }; > ++ struct bulk_checkin_source source; > ++ > ++ init_bulk_checkin_source_from_fd(&source, fd, size, path); > > seekback = lseek(fd, 0, SEEK_CUR); > if (seekback == (off_t) -1) > @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *st > state->offset = checkpoint.offset; > flush_bulk_checkin_packfile(state); > - if (lseek(fd, seekback, SEEK_SET) == (off_t) -1) > -+ if (bulk_checkin_source_seek_to(&source, seekback) == (off_t)-1) > ++ if (source.seek(&source, seekback) == (off_t)-1) > return error("cannot seek back"); > } > the_hash_algo->final_oid_fn(result_oid, 
&ctx); > 2: 9d633df339 < -: ---------- bulk-checkin: generify `stream_blob_to_pack()` for arbitrary types > 3: d5bbd7810e ! 2: 596bd028a7 bulk-checkin: refactor deflate routine to accept a `bulk_checkin_source` > @@ Metadata > Author: Taylor Blau <me@xxxxxxxxxxxx> > > ## Commit message ## > - bulk-checkin: refactor deflate routine to accept a `bulk_checkin_source` > + bulk-checkin: generify `stream_blob_to_pack()` for arbitrary types > > - Prepare for a future change where we will want to use a routine very > - similar to the existing `deflate_blob_to_pack()` but over arbitrary > - sources (i.e. either open file-descriptors, or a location in memory). > + The existing `stream_blob_to_pack()` function is named based on the fact > + that it knows only how to stream blobs into a bulk-checkin pack. > > - Extract out a common "deflate_obj_to_pack()" routine that acts on a > - bulk_checkin_source, instead of a (int, size_t) pair. Then rewrite > - `deflate_blob_to_pack()` in terms of it. > + But there is no longer anything in this function which prevents us from > + writing objects of arbitrary types to the bulk-checkin pack. Prepare to > + write OBJ_TREEs by removing this assumption, adding an `enum > + object_type` parameter to this function's argument list, and renaming it > + to `stream_obj_to_pack()` accordingly. > > Signed-off-by: Taylor Blau <me@xxxxxxxxxxxx> > > ## bulk-checkin.c ## > +@@ bulk-checkin.c: static void init_bulk_checkin_source_from_fd(struct bulk_checkin_source *source, > + * status before calling us just in case we ask it to call us again > + * with a new pack. 
> + */ > +-static int stream_blob_to_pack(struct bulk_checkin_packfile *state, > +- git_hash_ctx *ctx, off_t *already_hashed_to, > +- struct bulk_checkin_source *source, > +- unsigned flags) > ++static int stream_obj_to_pack(struct bulk_checkin_packfile *state, > ++ git_hash_ctx *ctx, off_t *already_hashed_to, > ++ struct bulk_checkin_source *source, > ++ enum object_type type, unsigned flags) > + { > + git_zstream s; > + unsigned char ibuf[16384]; > +@@ bulk-checkin.c: static int stream_blob_to_pack(struct bulk_checkin_packfile *state, > + > + git_deflate_init(&s, pack_compression_level); > + > +- hdrlen = encode_in_pack_object_header(obuf, sizeof(obuf), OBJ_BLOB, > +- size); > ++ hdrlen = encode_in_pack_object_header(obuf, sizeof(obuf), type, size); > + s.next_out = obuf + hdrlen; > + s.avail_out = sizeof(obuf) - hdrlen; > + > @@ bulk-checkin.c: static void prepare_to_stream(struct bulk_checkin_packfile *state, > die_errno("unable to write pack header"); > } > @@ bulk-checkin.c: static void prepare_to_stream(struct bulk_checkin_packfile *stat > unsigned header_len; > struct hashfile_checkpoint checkpoint = {0}; > struct pack_idx_entry *idx = NULL; > -- struct bulk_checkin_source source = { > -- .type = SOURCE_FILE, > -- .fd = fd, > -- .size = size, > -- .path = path, > -- }; > +- struct bulk_checkin_source source; > > +- init_bulk_checkin_source_from_fd(&source, fd, size, path); > +- > - seekback = lseek(fd, 0, SEEK_CUR); > - if (seekback == (off_t) -1) > - return error("cannot find the current offset"); > @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *st > prepare_to_stream(state, flags); > if (idx) { > @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *state, > + idx->offset = state->offset; > crc32_begin(state->f); > } > - if (!stream_obj_to_pack(state, &ctx, &already_hashed_to, > -- &source, OBJ_BLOB, flags)) > +- if (!stream_blob_to_pack(state, &ctx, &already_hashed_to, > +- &source, flags)) > 
++ if (!stream_obj_to_pack(state, &ctx, &already_hashed_to, > + source, type, flags)) > break; > /* > @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *st > hashfile_truncate(state->f, &checkpoint); > state->offset = checkpoint.offset; > flush_bulk_checkin_packfile(state); > -- if (bulk_checkin_source_seek_to(&source, seekback) == (off_t)-1) > -+ if (bulk_checkin_source_seek_to(source, seekback) == (off_t)-1) > +- if (source.seek(&source, seekback) == (off_t)-1) > ++ if (source->seek(source, seekback) == (off_t)-1) > return error("cannot seek back"); > } > the_hash_algo->final_oid_fn(result_oid, &ctx); > @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *st > + int fd, size_t size, > + const char *path, unsigned flags) > +{ > -+ struct bulk_checkin_source source = { > -+ .type = SOURCE_FILE, > -+ .fd = fd, > -+ .size = size, > -+ .path = path, > -+ }; > -+ off_t seekback = lseek(fd, 0, SEEK_CUR); > ++ struct bulk_checkin_source source; > ++ off_t seekback; > ++ > ++ init_bulk_checkin_source_from_fd(&source, fd, size, path); > ++ > ++ seekback = lseek(fd, 0, SEEK_CUR); > + if (seekback == (off_t) -1) > + return error("cannot find the current offset"); > + > 4: e427fe6ad3 < -: ---------- bulk-checkin: implement `SOURCE_INCORE` mode for `bulk_checkin_source` > 5: 48095afe80 ! 3: d8cf8e4395 bulk-checkin: introduce `index_blob_bulk_checkin_incore()` > @@ Metadata > ## Commit message ## > bulk-checkin: introduce `index_blob_bulk_checkin_incore()` > > - Now that we have factored out many of the common routines necessary to > - index a new object into a pack created by the bulk-checkin machinery, we > - can introduce a variant of `index_blob_bulk_checkin()` that acts on > - blobs whose contents we can fit in memory. > + Introduce `index_blob_bulk_checkin_incore()` which allows streaming > + arbitrary blob contents from memory into the bulk-checkin pack. 
> + > + In order to support streaming from a location in memory, we must > + implement a new kind of bulk_checkin_source that does just that. The > + implementation is spread out across: > + > + - init_bulk_checkin_source_incore() > + - bulk_checkin_source_read_incore() > + - bulk_checkin_source_seek_incore() > + > + Note that, unlike file descriptors, which manage their own offset > + internally, we have to keep track of how many bytes we've read out of > + the buffer, and make sure we don't read past the end of the buffer. > > This will be useful in a couple more commits in order to provide the > `merge-tree` builtin with a mechanism to create a new pack containing > @@ Commit message > Signed-off-by: Taylor Blau <me@xxxxxxxxxxxx> > > ## bulk-checkin.c ## > +@@ bulk-checkin.c: struct bulk_checkin_source { > + struct { > + int fd; > + } from_fd; > ++ struct { > ++ const void *buf; > ++ size_t nr_read; > ++ } incore; > + } data; > + > + size_t size; > +@@ bulk-checkin.c: static off_t bulk_checkin_source_seek_from_fd(struct bulk_checkin_source *source > + return lseek(source->data.from_fd.fd, offset, SEEK_SET); > + } > + > ++static off_t bulk_checkin_source_read_incore(struct bulk_checkin_source *source, > ++ void *buf, size_t nr) > ++{ > ++ const unsigned char *src = source->data.incore.buf; > ++ > ++ if (source->data.incore.nr_read > source->size) > ++ BUG("read beyond bulk-checkin source buffer end " > ++ "(%"PRIuMAX" > %"PRIuMAX")", > ++ (uintmax_t)source->data.incore.nr_read, > ++ (uintmax_t)source->size); > ++ > ++ if (nr > source->size - source->data.incore.nr_read) > ++ nr = source->size - source->data.incore.nr_read; > ++ > ++ src += source->data.incore.nr_read; > ++ > ++ memcpy(buf, src, nr); > ++ source->data.incore.nr_read += nr; > ++ return nr; > ++} > ++ > ++static off_t bulk_checkin_source_seek_incore(struct bulk_checkin_source *source, > ++ off_t offset) > ++{ > ++ if (!(0 <= offset && offset < source->size)) > ++ return (off_t)-1; > ++ 
source->data.incore.nr_read = offset; > ++ return source->data.incore.nr_read; > ++} > ++ > + static void init_bulk_checkin_source_from_fd(struct bulk_checkin_source *source, > + int fd, size_t size, > + const char *path) > +@@ bulk-checkin.c: static void init_bulk_checkin_source_from_fd(struct bulk_checkin_source *source, > + source->path = path; > + } > + > ++static void init_bulk_checkin_source_incore(struct bulk_checkin_source *source, > ++ const void *buf, size_t size, > ++ const char *path) > ++{ > ++ memset(source, 0, sizeof(struct bulk_checkin_source)); > ++ > ++ source->read = bulk_checkin_source_read_incore; > ++ source->seek = bulk_checkin_source_seek_incore; > ++ > ++ source->data.incore.buf = buf; > ++ source->data.incore.nr_read = 0; > ++ > ++ source->size = size; > ++ source->path = path; > ++} > ++ > + /* > + * Read the contents from 'source' for 'size' bytes, streaming it to the > + * packfile in state while updating the hash in ctx. Signal a failure > @@ bulk-checkin.c: static int deflate_obj_to_pack(struct bulk_checkin_packfile *state, > return 0; > } > @@ bulk-checkin.c: static int deflate_obj_to_pack(struct bulk_checkin_packfile *sta > + const char *path, enum object_type type, > + unsigned flags) > +{ > -+ struct bulk_checkin_source source = { > -+ .type = SOURCE_INCORE, > -+ .buf = buf, > -+ .size = size, > -+ .read = 0, > -+ .path = path, > -+ }; > ++ struct bulk_checkin_source source; > ++ > ++ init_bulk_checkin_source_incore(&source, buf, size, path); > + > + return deflate_obj_to_pack(state, result_oid, &source, type, 0, flags); > +} > 6: 60568f9281 = 4: 2670192802 bulk-checkin: introduce `index_tree_bulk_checkin_incore()` > 7: b9be9df122 ! 
5: 3595db76a5 builtin/merge-tree.c: implement support for `--write-pack` > @@ Documentation/git-merge-tree.txt: OPTIONS > > ## builtin/merge-tree.c ## > @@ > - #include "quote.h" > #include "tree.h" > #include "config.h" > + #include "strvec.h" > +#include "bulk-checkin.h" > > static int line_termination = '\n'; > > @@ builtin/merge-tree.c: struct merge_tree_options { > - int show_messages; > int name_only; > int use_stdin; > + struct merge_options merge_options; > + int write_pack; > }; > > static int real_merge(struct merge_tree_options *o, > @@ builtin/merge-tree.c: static int real_merge(struct merge_tree_options *o, > - init_merge_options(&opt, the_repository); > + _("not something we can merge")); > > opt.show_rename_progress = 0; > + opt.write_pack = o->write_pack; > @@ builtin/merge-tree.c: static int real_merge(struct merge_tree_options *o, > opt.branch1 = branch1; > opt.branch2 = branch2; > @@ builtin/merge-tree.c: int cmd_merge_tree(int argc, const char **argv, const char *prefix) > - &merge_base, > - N_("commit"), > N_("specify a merge-base for the merge")), > + OPT_STRVEC('X', "strategy-option", &xopts, N_("option=value"), > + N_("option for selected merge strategy")), > + OPT_BOOL(0, "write-pack", &o.write_pack, > + N_("write new objects to a pack instead of as loose")), > OPT_END() > > base-commit: ceadf0f3cf51550166a387ec8508bb55e7883057 > -- > 2.42.0.425.g963d08ddb3.dirty
Attachment:
signature.asc
Description: PGP signature