Re: [PATCH v5 5/5] builtin/merge-tree.c: implement support for `--write-pack`

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Mon, Oct 23, 2023 at 06:45:06PM -0400, Taylor Blau wrote:
> When using merge-tree often within a repository[^1], it is possible to
> generate a relatively large number of loose objects, which can result in
> degraded performance, and inode exhaustion in extreme cases.
> 
> Building on the functionality introduced in previous commits, the
> bulk-checkin machinery now has support to write arbitrary blob and tree
> objects which are small enough to be held in-core. We can use this to
> write any blob/tree objects generated by ORT into a separate pack
> instead of writing them out individually as loose objects.
> 
> This functionality is gated behind a new `--write-pack` option to
> `merge-tree` that works with the (non-deprecated) `--write-tree` mode.
> 
> The implementation is relatively straightforward. There are two spots
> within the ORT mechanism where we call `write_object_file()`, one for
> content differences within blobs, and another to assemble any new trees
> necessary to construct the merge. In each of those locations,
> conditionally replace calls to `write_object_file()` with
> `index_blob_bulk_checkin_incore()` or `index_tree_bulk_checkin_incore()`
> depending on which kind of object we are writing.
> 
> The only remaining task is to begin and end the transaction necessary to
> initialize the bulk-checkin machinery, and move any new pack(s) it
> created into the main object store.
> 
> [^1]: Such is the case at GitHub, where we run presumptive "test merges"
>   on open pull requests to see whether or not we can light up the merge
>   button green depending on whether or not the presumptive merge was
>   conflicted.
> 
>   This is done in response to a number of user-initiated events,
>   including viewing an open pull request whose last test merge is stale
>   with respect to the current base and tip of the pull request. As a
>   result, merge-tree can be run very frequently on large, active
>   repositories.
> 
> Signed-off-by: Taylor Blau <me@xxxxxxxxxxxx>
> ---
>  Documentation/git-merge-tree.txt |  4 ++
>  builtin/merge-tree.c             |  5 ++
>  merge-ort.c                      | 42 +++++++++++----
>  merge-recursive.h                |  1 +
>  t/t4301-merge-tree-write-tree.sh | 93 ++++++++++++++++++++++++++++++++
>  5 files changed, 136 insertions(+), 9 deletions(-)
> 
> diff --git a/Documentation/git-merge-tree.txt b/Documentation/git-merge-tree.txt
> index ffc4fbf7e8..9d37609ef1 100644
> --- a/Documentation/git-merge-tree.txt
> +++ b/Documentation/git-merge-tree.txt
> @@ -69,6 +69,10 @@ OPTIONS
>  	specify a merge-base for the merge, and specifying multiple bases is
>  	currently not supported. This option is incompatible with `--stdin`.
>  
> +--write-pack::
> +	Write any new objects into a separate packfile instead of as
> +	individual loose objects.
> +
>  [[OUTPUT]]
>  OUTPUT
>  ------
> diff --git a/builtin/merge-tree.c b/builtin/merge-tree.c
> index a35e0452d6..218442ac9b 100644
> --- a/builtin/merge-tree.c
> +++ b/builtin/merge-tree.c
> @@ -19,6 +19,7 @@
>  #include "tree.h"
>  #include "config.h"
>  #include "strvec.h"
> +#include "bulk-checkin.h"
>  
>  static int line_termination = '\n';
>  
> @@ -416,6 +417,7 @@ struct merge_tree_options {
>  	int name_only;
>  	int use_stdin;
>  	struct merge_options merge_options;
> +	int write_pack;
>  };
>  
>  static int real_merge(struct merge_tree_options *o,
> @@ -441,6 +443,7 @@ static int real_merge(struct merge_tree_options *o,
>  				 _("not something we can merge"));
>  
>  	opt.show_rename_progress = 0;
> +	opt.write_pack = o->write_pack;
>  
>  	opt.branch1 = branch1;
>  	opt.branch2 = branch2;
> @@ -553,6 +556,8 @@ int cmd_merge_tree(int argc, const char **argv, const char *prefix)
>  			   N_("specify a merge-base for the merge")),
>  		OPT_STRVEC('X', "strategy-option", &xopts, N_("option=value"),
>  			N_("option for selected merge strategy")),
> +		OPT_BOOL(0, "write-pack", &o.write_pack,
> +			 N_("write new objects to a pack instead of as loose")),
>  		OPT_END()
>  	};
>  
> diff --git a/merge-ort.c b/merge-ort.c
> index 3653725661..523577d71e 100644
> --- a/merge-ort.c
> +++ b/merge-ort.c
> @@ -48,6 +48,7 @@
>  #include "tree.h"
>  #include "unpack-trees.h"
>  #include "xdiff-interface.h"
> +#include "bulk-checkin.h"
>  
>  /*
>   * We have many arrays of size 3.  Whenever we have such an array, the
> @@ -2108,10 +2109,19 @@ static int handle_content_merge(struct merge_options *opt,
>  		if ((merge_status < 0) || !result_buf.ptr)
>  			ret = error(_("failed to execute internal merge"));
>  
> -		if (!ret &&
> -		    write_object_file(result_buf.ptr, result_buf.size,
> -				      OBJ_BLOB, &result->oid))
> -			ret = error(_("unable to add %s to database"), path);
> +		if (!ret) {
> +			ret = opt->write_pack
> +				? index_blob_bulk_checkin_incore(&result->oid,
> +								 result_buf.ptr,
> +								 result_buf.size,
> +								 path, 1)
> +				: write_object_file(result_buf.ptr,
> +						    result_buf.size,
> +						    OBJ_BLOB, &result->oid);
> +			if (ret)
> +				ret = error(_("unable to add %s to database"),
> +					    path);
> +		}
>  
>  		free(result_buf.ptr);
>  		if (ret)
> @@ -3597,7 +3607,8 @@ static int tree_entry_order(const void *a_, const void *b_)
>  				 b->string, strlen(b->string), bmi->result.mode);
>  }
>  
> -static int write_tree(struct object_id *result_oid,
> +static int write_tree(struct merge_options *opt,
> +		      struct object_id *result_oid,
>  		      struct string_list *versions,
>  		      unsigned int offset,
>  		      size_t hash_size)
> @@ -3631,8 +3642,14 @@ static int write_tree(struct object_id *result_oid,
>  	}
>  
>  	/* Write this object file out, and record in result_oid */
> -	if (write_object_file(buf.buf, buf.len, OBJ_TREE, result_oid))
> +	ret = opt->write_pack
> +		? index_tree_bulk_checkin_incore(result_oid,
> +						 buf.buf, buf.len, "", 1)
> +		: write_object_file(buf.buf, buf.len, OBJ_TREE, result_oid);
> +
> +	if (ret)
>  		ret = -1;
> +
>  	strbuf_release(&buf);
>  	return ret;
>  }
> @@ -3797,8 +3814,8 @@ static int write_completed_directory(struct merge_options *opt,
>  		 */
>  		dir_info->is_null = 0;
>  		dir_info->result.mode = S_IFDIR;
> -		if (write_tree(&dir_info->result.oid, &info->versions, offset,
> -			       opt->repo->hash_algo->rawsz) < 0)
> +		if (write_tree(opt, &dir_info->result.oid, &info->versions,
> +			       offset, opt->repo->hash_algo->rawsz) < 0)
>  			ret = -1;
>  	}
>  
> @@ -4332,9 +4349,13 @@ static int process_entries(struct merge_options *opt,
>  		fflush(stdout);
>  		BUG("dir_metadata accounting completely off; shouldn't happen");
>  	}
> -	if (write_tree(result_oid, &dir_metadata.versions, 0,
> +	if (write_tree(opt, result_oid, &dir_metadata.versions, 0,
>  		       opt->repo->hash_algo->rawsz) < 0)
>  		ret = -1;
> +
> +	if (opt->write_pack)
> +		end_odb_transaction();
> +
>  cleanup:
>  	string_list_clear(&plist, 0);
>  	string_list_clear(&dir_metadata.versions, 0);
> @@ -4878,6 +4899,9 @@ static void merge_start(struct merge_options *opt, struct merge_result *result)
>  	 */
>  	strmap_init(&opt->priv->conflicts);
>  
> +	if (opt->write_pack)
> +		begin_odb_transaction();
> +
>  	trace2_region_leave("merge", "allocate/init", opt->repo);
>  }
>  
> diff --git a/merge-recursive.h b/merge-recursive.h
> index 3d3b3e3c29..5c5ff380a8 100644
> --- a/merge-recursive.h
> +++ b/merge-recursive.h
> @@ -48,6 +48,7 @@ struct merge_options {
>  	unsigned renormalize : 1;
>  	unsigned record_conflict_msgs_as_headers : 1;
>  	const char *msg_header_prefix;
> +	unsigned write_pack : 1;
>  
>  	/* internal fields used by the implementation */
>  	struct merge_options_internal *priv;
> diff --git a/t/t4301-merge-tree-write-tree.sh b/t/t4301-merge-tree-write-tree.sh
> index b2c8a43fce..d2a8634523 100755
> --- a/t/t4301-merge-tree-write-tree.sh
> +++ b/t/t4301-merge-tree-write-tree.sh
> @@ -945,4 +945,97 @@ test_expect_success 'check the input format when --stdin is passed' '
>  	test_cmp expect actual
>  '
>  
> +packdir=".git/objects/pack"
> +
> +test_expect_success 'merge-tree can pack its result with --write-pack' '
> +	test_when_finished "rm -rf repo" &&
> +	git init repo &&
> +
> +	# base has lines [3, 4, 5]
> +	#   - side adds to the beginning, resulting in [1, 2, 3, 4, 5]
> +	#   - other adds to the end, resulting in [3, 4, 5, 6, 7]
> +	#
> +	# merging the two should result in a new blob object containing
> +	# [1, 2, 3, 4, 5, 6, 7], along with a new tree.
> +	test_commit -C repo base file "$(test_seq 3 5)" &&
> +	git -C repo branch -M main &&
> +	git -C repo checkout -b side main &&
> +	test_commit -C repo side file "$(test_seq 1 5)" &&
> +	git -C repo checkout -b other main &&
> +	test_commit -C repo other file "$(test_seq 3 7)" &&
> +
> +	find repo/$packdir -type f -name "pack-*.idx" >packs.before &&
> +	tree="$(git -C repo merge-tree --write-pack \
> +		refs/tags/side refs/tags/other)" &&
> +	blob="$(git -C repo rev-parse $tree:file)" &&
> +	find repo/$packdir -type f -name "pack-*.idx" >packs.after &&

While we do assert that we write a new packfile, we don't assert that
none of the newly written objects were written as loose objects instead.
Do we want to tighten the checks to verify that?

Patrick

> +	test_must_be_empty packs.before &&
> +	test_line_count = 1 packs.after &&
> +
> +	git show-index <$(cat packs.after) >objects &&
> +	test_line_count = 2 objects &&
> +	grep "^[1-9][0-9]* $tree" objects &&
> +	grep "^[1-9][0-9]* $blob" objects
> +'
> +
> +test_expect_success 'merge-tree can write multiple packs with --write-pack' '
> +	test_when_finished "rm -rf repo" &&
> +	git init repo &&
> +	(
> +		cd repo &&
> +
> +		git config pack.packSizeLimit 512 &&
> +
> +		test_seq 512 >f &&
> +
> +		# "f" contains roughly 2,000 bytes.
> +		#
> +		# Each side ("foo" and "bar") adds a small amount of data at the
> +		# beginning and end of "base", respectively.
> +		git add f &&
> +		test_tick &&
> +		git commit -m base &&
> +		git branch -M main &&
> +
> +		git checkout -b foo main &&
> +		{
> +			echo foo && cat f
> +		} >f.tmp &&
> +		mv f.tmp f &&
> +		git add f &&
> +		test_tick &&
> +		git commit -m foo &&
> +
> +		git checkout -b bar main &&
> +		echo bar >>f &&
> +		git add f &&
> +		test_tick &&
> +		git commit -m bar &&
> +
> +		find $packdir -type f -name "pack-*.idx" >packs.before &&
> +		# Merging the two sides should result in a new blob which is
> +		# larger than the 512-byte pack.packSizeLimit configured above,
> +		# thus the result should be split into two separate packs.
> +		tree="$(git merge-tree --write-pack \
> +			refs/heads/foo refs/heads/bar)" &&
> +		blob="$(git rev-parse $tree:f)" &&
> +		find $packdir -type f -name "pack-*.idx" >packs.after &&
> +
> +		test_must_be_empty packs.before &&
> +		test_line_count = 2 packs.after &&
> +		for idx in $(cat packs.after)
> +		do
> +			git show-index <$idx || return 1
> +		done >objects &&
> +
> +		# The resulting set of packs should contain one copy of both
> +		# objects, each in a separate pack.
> +		test_line_count = 2 objects &&
> +		grep "^[1-9][0-9]* $tree" objects &&
> +		grep "^[1-9][0-9]* $blob" objects
> +
> +	)
> +'
> +
>  test_done
> -- 
> 2.42.0.425.g963d08ddb3.dirty

Attachment: signature.asc
Description: PGP signature


[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Announce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]

  Powered by Linux