[PATCH v5 0/5] merge-ort: implement support for packing objects together

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



(Rebased onto the current tip of 'master', which is ceadf0f3cf (The
twentieth batch, 2023-10-20)).

This series implements support for a new merge-tree option,
`--write-pack`, which causes any newly-written objects to be packed
together instead of being stored individually as loose.

This is a minor follow-up that could be taken instead of v4 (though the
changes between these two most recent rounds are stylistic and a matter
of subjective opinion).

This moves the bulk_checkin_source structure (introduced in response to
Junio's suggestion[1] during the last round) further in the OOP
direction. Instead of switching on the enum type of the source, it now
uses function pointers for read() and seek() respectively.

The functionality at the end is the same, but this avoids some of the
namespacing issues that Peff pointed out while looking at v4. But I
think that this approach ended up being less heavy-weight than I had
originally imagined, so I think that this version is a worthwhile
improvement over v4.

Beyond that, the changes since last time can be viewed in the range-diff
below. Thanks in advance for any review!

[1]: https://lore.kernel.org/git/xmqq34y7plj4.fsf@gitster.g/

Taylor Blau (5):
  bulk-checkin: extract abstract `bulk_checkin_source`
  bulk-checkin: generify `stream_blob_to_pack()` for arbitrary types
  bulk-checkin: introduce `index_blob_bulk_checkin_incore()`
  bulk-checkin: introduce `index_tree_bulk_checkin_incore()`
  builtin/merge-tree.c: implement support for `--write-pack`

 Documentation/git-merge-tree.txt |   4 +
 builtin/merge-tree.c             |   5 +
 bulk-checkin.c                   | 197 +++++++++++++++++++++++++++----
 bulk-checkin.h                   |   8 ++
 merge-ort.c                      |  42 +++++--
 merge-recursive.h                |   1 +
 t/t4301-merge-tree-write-tree.sh |  93 +++++++++++++++
 7 files changed, 316 insertions(+), 34 deletions(-)

Range-diff against v4:
1:  97bb6e9f59 ! 1:  696aa027e4 bulk-checkin: extract abstract `bulk_checkin_source`
    @@ bulk-checkin.c: static int already_written(struct bulk_checkin_packfile *state,
      }
      
     +struct bulk_checkin_source {
    -+	enum { SOURCE_FILE } type;
    ++	off_t (*read)(struct bulk_checkin_source *, void *, size_t);
    ++	off_t (*seek)(struct bulk_checkin_source *, off_t);
     +
    -+	/* SOURCE_FILE fields */
    -+	int fd;
    ++	union {
    ++		struct {
    ++			int fd;
    ++		} from_fd;
    ++	} data;
     +
    -+	/* common fields */
     +	size_t size;
     +	const char *path;
     +};
     +
    -+static off_t bulk_checkin_source_seek_to(struct bulk_checkin_source *source,
    -+					 off_t offset)
    ++static off_t bulk_checkin_source_read_from_fd(struct bulk_checkin_source *source,
    ++					      void *buf, size_t nr)
     +{
    -+	switch (source->type) {
    -+	case SOURCE_FILE:
    -+		return lseek(source->fd, offset, SEEK_SET);
    -+	default:
    -+		BUG("unknown bulk-checkin source: %d", source->type);
    -+	}
    ++	return read_in_full(source->data.from_fd.fd, buf, nr);
     +}
     +
    -+static ssize_t bulk_checkin_source_read(struct bulk_checkin_source *source,
    -+					void *buf, size_t nr)
    ++static off_t bulk_checkin_source_seek_from_fd(struct bulk_checkin_source *source,
    ++					      off_t offset)
     +{
    -+	switch (source->type) {
    -+	case SOURCE_FILE:
    -+		return read_in_full(source->fd, buf, nr);
    -+	default:
    -+		BUG("unknown bulk-checkin source: %d", source->type);
    -+	}
    ++	return lseek(source->data.from_fd.fd, offset, SEEK_SET);
    ++}
    ++
    ++static void init_bulk_checkin_source_from_fd(struct bulk_checkin_source *source,
    ++					     int fd, size_t size,
    ++					     const char *path)
    ++{
    ++	memset(source, 0, sizeof(struct bulk_checkin_source));
    ++
    ++	source->read = bulk_checkin_source_read_from_fd;
    ++	source->seek = bulk_checkin_source_seek_from_fd;
    ++
    ++	source->data.from_fd.fd = fd;
    ++
    ++	source->size = size;
    ++	source->path = path;
     +}
     +
      /*
    @@ bulk-checkin.c: static int stream_blob_to_pack(struct bulk_checkin_packfile *sta
     -			ssize_t read_result = read_in_full(fd, ibuf, rsize);
     +			ssize_t read_result;
     +
    -+			read_result = bulk_checkin_source_read(source, ibuf,
    -+							       rsize);
    ++			read_result = source->read(source, ibuf, rsize);
      			if (read_result < 0)
     -				die_errno("failed to read from '%s'", path);
     +				die_errno("failed to read from '%s'",
    @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *st
      	unsigned header_len;
      	struct hashfile_checkpoint checkpoint = {0};
      	struct pack_idx_entry *idx = NULL;
    -+	struct bulk_checkin_source source = {
    -+		.type = SOURCE_FILE,
    -+		.fd = fd,
    -+		.size = size,
    -+		.path = path,
    -+	};
    ++	struct bulk_checkin_source source;
    ++
    ++	init_bulk_checkin_source_from_fd(&source, fd, size, path);
      
      	seekback = lseek(fd, 0, SEEK_CUR);
      	if (seekback == (off_t) -1)
    @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *st
      		state->offset = checkpoint.offset;
      		flush_bulk_checkin_packfile(state);
     -		if (lseek(fd, seekback, SEEK_SET) == (off_t) -1)
    -+		if (bulk_checkin_source_seek_to(&source, seekback) == (off_t)-1)
    ++		if (source.seek(&source, seekback) == (off_t)-1)
      			return error("cannot seek back");
      	}
      	the_hash_algo->final_oid_fn(result_oid, &ctx);
2:  9d633df339 < -:  ---------- bulk-checkin: generify `stream_blob_to_pack()` for arbitrary types
3:  d5bbd7810e ! 2:  596bd028a7 bulk-checkin: refactor deflate routine to accept a `bulk_checkin_source`
    @@ Metadata
     Author: Taylor Blau <me@xxxxxxxxxxxx>
     
      ## Commit message ##
    -    bulk-checkin: refactor deflate routine to accept a `bulk_checkin_source`
    +    bulk-checkin: generify `stream_blob_to_pack()` for arbitrary types
     
    -    Prepare for a future change where we will want to use a routine very
    -    similar to the existing `deflate_blob_to_pack()` but over arbitrary
    -    sources (i.e. either open file-descriptors, or a location in memory).
    +    The existing `stream_blob_to_pack()` function is named based on the fact
    +    that it knows only how to stream blobs into a bulk-checkin pack.
     
    -    Extract out a common "deflate_obj_to_pack()" routine that acts on a
    -    bulk_checkin_source, instead of a (int, size_t) pair. Then rewrite
    -    `deflate_blob_to_pack()` in terms of it.
    +    But there is no longer anything in this function which prevents us from
    +    writing objects of arbitrary types to the bulk-checkin pack. Prepare to
    +    write OBJ_TREEs by removing this assumption, adding an `enum
    +    object_type` parameter to this function's argument list, and renaming it
    +    to `stream_obj_to_pack()` accordingly.
     
         Signed-off-by: Taylor Blau <me@xxxxxxxxxxxx>
     
      ## bulk-checkin.c ##
    +@@ bulk-checkin.c: static void init_bulk_checkin_source_from_fd(struct bulk_checkin_source *source,
    +  * status before calling us just in case we ask it to call us again
    +  * with a new pack.
    +  */
    +-static int stream_blob_to_pack(struct bulk_checkin_packfile *state,
    +-			       git_hash_ctx *ctx, off_t *already_hashed_to,
    +-			       struct bulk_checkin_source *source,
    +-			       unsigned flags)
    ++static int stream_obj_to_pack(struct bulk_checkin_packfile *state,
    ++			      git_hash_ctx *ctx, off_t *already_hashed_to,
    ++			      struct bulk_checkin_source *source,
    ++			      enum object_type type, unsigned flags)
    + {
    + 	git_zstream s;
    + 	unsigned char ibuf[16384];
    +@@ bulk-checkin.c: static int stream_blob_to_pack(struct bulk_checkin_packfile *state,
    + 
    + 	git_deflate_init(&s, pack_compression_level);
    + 
    +-	hdrlen = encode_in_pack_object_header(obuf, sizeof(obuf), OBJ_BLOB,
    +-					      size);
    ++	hdrlen = encode_in_pack_object_header(obuf, sizeof(obuf), type, size);
    + 	s.next_out = obuf + hdrlen;
    + 	s.avail_out = sizeof(obuf) - hdrlen;
    + 
     @@ bulk-checkin.c: static void prepare_to_stream(struct bulk_checkin_packfile *state,
      		die_errno("unable to write pack header");
      }
    @@ bulk-checkin.c: static void prepare_to_stream(struct bulk_checkin_packfile *stat
      	unsigned header_len;
      	struct hashfile_checkpoint checkpoint = {0};
      	struct pack_idx_entry *idx = NULL;
    --	struct bulk_checkin_source source = {
    --		.type = SOURCE_FILE,
    --		.fd = fd,
    --		.size = size,
    --		.path = path,
    --	};
    +-	struct bulk_checkin_source source;
      
    +-	init_bulk_checkin_source_from_fd(&source, fd, size, path);
    +-
     -	seekback = lseek(fd, 0, SEEK_CUR);
     -	if (seekback == (off_t) -1)
     -		return error("cannot find the current offset");
    @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *st
      		prepare_to_stream(state, flags);
      		if (idx) {
     @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
    + 			idx->offset = state->offset;
      			crc32_begin(state->f);
      		}
    - 		if (!stream_obj_to_pack(state, &ctx, &already_hashed_to,
    --					&source, OBJ_BLOB, flags))
    +-		if (!stream_blob_to_pack(state, &ctx, &already_hashed_to,
    +-					 &source, flags))
    ++		if (!stream_obj_to_pack(state, &ctx, &already_hashed_to,
     +					source, type, flags))
      			break;
      		/*
    @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *st
      		hashfile_truncate(state->f, &checkpoint);
      		state->offset = checkpoint.offset;
      		flush_bulk_checkin_packfile(state);
    --		if (bulk_checkin_source_seek_to(&source, seekback) == (off_t)-1)
    -+		if (bulk_checkin_source_seek_to(source, seekback) == (off_t)-1)
    +-		if (source.seek(&source, seekback) == (off_t)-1)
    ++		if (source->seek(source, seekback) == (off_t)-1)
      			return error("cannot seek back");
      	}
      	the_hash_algo->final_oid_fn(result_oid, &ctx);
    @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *st
     +				int fd, size_t size,
     +				const char *path, unsigned flags)
     +{
    -+	struct bulk_checkin_source source = {
    -+		.type = SOURCE_FILE,
    -+		.fd = fd,
    -+		.size = size,
    -+		.path = path,
    -+	};
    -+	off_t seekback = lseek(fd, 0, SEEK_CUR);
    ++	struct bulk_checkin_source source;
    ++	off_t seekback;
    ++
    ++	init_bulk_checkin_source_from_fd(&source, fd, size, path);
    ++
    ++	seekback = lseek(fd, 0, SEEK_CUR);
     +	if (seekback == (off_t) -1)
     +		return error("cannot find the current offset");
     +
4:  e427fe6ad3 < -:  ---------- bulk-checkin: implement `SOURCE_INCORE` mode for `bulk_checkin_source`
5:  48095afe80 ! 3:  d8cf8e4395 bulk-checkin: introduce `index_blob_bulk_checkin_incore()`
    @@ Metadata
      ## Commit message ##
         bulk-checkin: introduce `index_blob_bulk_checkin_incore()`
     
    -    Now that we have factored out many of the common routines necessary to
    -    index a new object into a pack created by the bulk-checkin machinery, we
    -    can introduce a variant of `index_blob_bulk_checkin()` that acts on
    -    blobs whose contents we can fit in memory.
    +    Introduce `index_blob_bulk_checkin_incore()` which allows streaming
    +    arbitrary blob contents from memory into the bulk-checkin pack.
    +
    +    In order to support streaming from a location in memory, we must
    +    implement a new kind of bulk_checkin_source that does just that. This
    +    implementation is spread out across:
    +
    +      - init_bulk_checkin_source_incore()
    +      - bulk_checkin_source_read_incore()
    +      - bulk_checkin_source_seek_incore()
    +
    +    Note that, unlike file descriptors, which manage their own offset
    +    internally, we have to keep track of how many bytes we've read out of
    +    the buffer, and make sure we don't read past the end of the buffer.
     
         This will be useful in a couple of more commits in order to provide the
         `merge-tree` builtin with a mechanism to create a new pack containing
    @@ Commit message
         Signed-off-by: Taylor Blau <me@xxxxxxxxxxxx>
     
      ## bulk-checkin.c ##
    +@@ bulk-checkin.c: struct bulk_checkin_source {
    + 		struct {
    + 			int fd;
    + 		} from_fd;
    ++		struct {
    ++			const void *buf;
    ++			size_t nr_read;
    ++		} incore;
    + 	} data;
    + 
    + 	size_t size;
    +@@ bulk-checkin.c: static off_t bulk_checkin_source_seek_from_fd(struct bulk_checkin_source *source
    + 	return lseek(source->data.from_fd.fd, offset, SEEK_SET);
    + }
    + 
    ++static off_t bulk_checkin_source_read_incore(struct bulk_checkin_source *source,
    ++					     void *buf, size_t nr)
    ++{
    ++	const unsigned char *src = source->data.incore.buf;
    ++
    ++	if (source->data.incore.nr_read > source->size)
    ++		BUG("read beyond bulk-checkin source buffer end "
    ++		    "(%"PRIuMAX" > %"PRIuMAX")",
    ++		    (uintmax_t)source->data.incore.nr_read,
    ++		    (uintmax_t)source->size);
    ++
    ++	if (nr > source->size - source->data.incore.nr_read)
    ++		nr = source->size - source->data.incore.nr_read;
    ++
    ++	src += source->data.incore.nr_read;
    ++
    ++	memcpy(buf, src, nr);
    ++	source->data.incore.nr_read += nr;
    ++	return nr;
    ++}
    ++
    ++static off_t bulk_checkin_source_seek_incore(struct bulk_checkin_source *source,
    ++					     off_t offset)
    ++{
    ++	if (!(0 <= offset && offset < source->size))
    ++		return (off_t)-1;
    ++	source->data.incore.nr_read = offset;
    ++	return source->data.incore.nr_read;
    ++}
    ++
    + static void init_bulk_checkin_source_from_fd(struct bulk_checkin_source *source,
    + 					     int fd, size_t size,
    + 					     const char *path)
    +@@ bulk-checkin.c: static void init_bulk_checkin_source_from_fd(struct bulk_checkin_source *source,
    + 	source->path = path;
    + }
    + 
    ++static void init_bulk_checkin_source_incore(struct bulk_checkin_source *source,
    ++					    const void *buf, size_t size,
    ++					    const char *path)
    ++{
    ++	memset(source, 0, sizeof(struct bulk_checkin_source));
    ++
    ++	source->read = bulk_checkin_source_read_incore;
    ++	source->seek = bulk_checkin_source_seek_incore;
    ++
    ++	source->data.incore.buf = buf;
    ++	source->data.incore.nr_read = 0;
    ++
    ++	source->size = size;
    ++	source->path = path;
    ++}
    ++
    + /*
    +  * Read the contents from 'source' for 'size' bytes, streaming it to the
    +  * packfile in state while updating the hash in ctx. Signal a failure
     @@ bulk-checkin.c: static int deflate_obj_to_pack(struct bulk_checkin_packfile *state,
      	return 0;
      }
    @@ bulk-checkin.c: static int deflate_obj_to_pack(struct bulk_checkin_packfile *sta
     +				       const char *path, enum object_type type,
     +				       unsigned flags)
     +{
    -+	struct bulk_checkin_source source = {
    -+		.type = SOURCE_INCORE,
    -+		.buf = buf,
    -+		.size = size,
    -+		.read = 0,
    -+		.path = path,
    -+	};
    ++	struct bulk_checkin_source source;
    ++
    ++	init_bulk_checkin_source_incore(&source, buf, size, path);
     +
     +	return deflate_obj_to_pack(state, result_oid, &source, type, 0, flags);
     +}
6:  60568f9281 = 4:  2670192802 bulk-checkin: introduce `index_tree_bulk_checkin_incore()`
7:  b9be9df122 ! 5:  3595db76a5 builtin/merge-tree.c: implement support for `--write-pack`
    @@ Documentation/git-merge-tree.txt: OPTIONS
     
      ## builtin/merge-tree.c ##
     @@
    - #include "quote.h"
      #include "tree.h"
      #include "config.h"
    + #include "strvec.h"
     +#include "bulk-checkin.h"
      
      static int line_termination = '\n';
      
     @@ builtin/merge-tree.c: struct merge_tree_options {
    - 	int show_messages;
      	int name_only;
      	int use_stdin;
    + 	struct merge_options merge_options;
     +	int write_pack;
      };
      
      static int real_merge(struct merge_tree_options *o,
     @@ builtin/merge-tree.c: static int real_merge(struct merge_tree_options *o,
    - 	init_merge_options(&opt, the_repository);
    + 				 _("not something we can merge"));
      
      	opt.show_rename_progress = 0;
     +	opt.write_pack = o->write_pack;
    @@ builtin/merge-tree.c: static int real_merge(struct merge_tree_options *o,
      	opt.branch1 = branch1;
      	opt.branch2 = branch2;
     @@ builtin/merge-tree.c: int cmd_merge_tree(int argc, const char **argv, const char *prefix)
    - 			   &merge_base,
    - 			   N_("commit"),
      			   N_("specify a merge-base for the merge")),
    + 		OPT_STRVEC('X', "strategy-option", &xopts, N_("option=value"),
    + 			N_("option for selected merge strategy")),
     +		OPT_BOOL(0, "write-pack", &o.write_pack,
     +			 N_("write new objects to a pack instead of as loose")),
      		OPT_END()

base-commit: ceadf0f3cf51550166a387ec8508bb55e7883057
-- 
2.42.0.425.g963d08ddb3.dirty




[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]

  Powered by Linux