Re: [PATCH v4 2/3] repack: add --filter=<filter-spec> option

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wed, Dec 21, 2022 at 05:04:45AM +0100, Christian Couder wrote:
> From: Christian Couder <chriscool@xxxxxxxxxxxxx>
> 
> After cloning with --filter=<filter-spec>, for example to avoid
> getting unneeded large files on a user machine, it's possible
> that some of these large files still get fetched for some reasons
> (like checking out old branches) over time.
> 
> In this case the repo size could grow too much for no good reason
> and `git repack --filter=<filter-spec>` would be useful to remove
> the unneeded large files.
> 
> This command could be dangerous to use though, as it might remove
> local objects that haven't been pushed which would lose data and
> corrupt the repo. On a server, this command could also corrupt a
> repo unless ALL the removed objects aren't already available in
> another remote that clients can access.
> 
> To mitigate that risk, we check that a promisor remote has at
> least been configured.

While this is a nice safeguard, I wonder whether it is sufficient.
Suppose you for example have a non-bare repository that already has
blobs checked out that would become removed by the filtering repack --
does Git handle this situation gracefully?

A quick check seems to indicate that it does. But not quite as well as
I'd have hoped: when I switch to a detached HEAD with an arbitrary
commit and then execute `git repack --filter=blob:none` then it also
removes blobs that are referenced by the currently checked-out commit.
This may or may not be what the user is asking for, but I'd rather lean
towards this behaviour being surprising.

Patrick

> Signed-off-by: John Cai <johncai86@xxxxxxxxx>
> Signed-off-by: Christian Couder <chriscool@xxxxxxxxxxxxx>
> ---
>  Documentation/git-repack.txt |  8 ++++++++
>  builtin/repack.c             | 28 +++++++++++++++++++++-------
>  t/t7700-repack.sh            | 15 +++++++++++++++
>  3 files changed, 44 insertions(+), 7 deletions(-)
> 
> diff --git a/Documentation/git-repack.txt b/Documentation/git-repack.txt
> index 4017157949..2539ee0a02 100644
> --- a/Documentation/git-repack.txt
> +++ b/Documentation/git-repack.txt
> @@ -143,6 +143,14 @@ depth is 4095.
>  	a larger and slower repository; see the discussion in
>  	`pack.packSizeLimit`.
>  
> +--filter=<filter-spec>::
> +	Omits certain objects (usually blobs) from the resulting
> +	packfile. WARNING: this could easily corrupt the current repo
> +	and lose data if ANY of the omitted objects hasn't been already
> +	pushed to a remote. Be very careful about objects that might
> +	have been created locally! See linkgit:git-rev-list[1] for valid
> +	`<filter-spec>` forms.
> +
>  -b::
>  --write-bitmap-index::
>  	Write a reachability bitmap index as part of the repack. This
> diff --git a/builtin/repack.c b/builtin/repack.c
> index c1402ad038..8e5ac9c171 100644
> --- a/builtin/repack.c
> +++ b/builtin/repack.c
> @@ -49,6 +49,7 @@ struct pack_objects_args {
>  	const char *depth;
>  	const char *threads;
>  	const char *max_pack_size;
> +	const char *filter;
>  	int no_reuse_delta;
>  	int no_reuse_object;
>  	int quiet;
> @@ -163,6 +164,8 @@ static void prepare_pack_objects(struct child_process *cmd,
>  		strvec_pushf(&cmd->args, "--threads=%s", args->threads);
>  	if (args->max_pack_size)
>  		strvec_pushf(&cmd->args, "--max-pack-size=%s", args->max_pack_size);
> +	if (args->filter)
> +		strvec_pushf(&cmd->args, "--filter=%s", args->filter);
>  	if (args->no_reuse_delta)
>  		strvec_pushf(&cmd->args, "--no-reuse-delta");
>  	if (args->no_reuse_object)
> @@ -234,6 +237,13 @@ static struct generated_pack_data *populate_pack_exts(const char *name)
>  	return data;
>  }
>  
> +static void write_promisor_file_1(char *p)
> +{
> +	char *promisor_name = mkpathdup("%s-%s.promisor", packtmp, p);
> +	write_promisor_file(promisor_name, NULL, 0);
> +	free(promisor_name);
> +}
> +
>  static void repack_promisor_objects(const struct pack_objects_args *args,
>  				    struct string_list *names)
>  {
> @@ -265,7 +275,6 @@ static void repack_promisor_objects(const struct pack_objects_args *args,
>  	out = xfdopen(cmd.out, "r");
>  	while (strbuf_getline_lf(&line, out) != EOF) {
>  		struct string_list_item *item;
> -		char *promisor_name;
>  
>  		if (line.len != the_hash_algo->hexsz)
>  			die(_("repack: Expecting full hex object ID lines only from pack-objects."));
> @@ -282,13 +291,8 @@ static void repack_promisor_objects(const struct pack_objects_args *args,
>  		 * concatenate the contents of all .promisor files instead of
>  		 * just creating a new empty file.
>  		 */
> -		promisor_name = mkpathdup("%s-%s.promisor", packtmp,
> -					  line.buf);
> -		write_promisor_file(promisor_name, NULL, 0);
> -
> +		write_promisor_file_1(line.buf);
>  		item->util = populate_pack_exts(item->string);
> -
> -		free(promisor_name);
>  	}
>  	fclose(out);
>  	if (finish_command(&cmd))
> @@ -800,6 +804,8 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
>  				N_("limits the maximum number of threads")),
>  		OPT_STRING(0, "max-pack-size", &po_args.max_pack_size, N_("bytes"),
>  				N_("maximum size of each packfile")),
> +		OPT_STRING(0, "filter", &po_args.filter, N_("args"),
> +				N_("object filtering")),
>  		OPT_BOOL(0, "pack-kept-objects", &pack_kept_objects,
>  				N_("repack objects in packs marked with .keep")),
>  		OPT_STRING_LIST(0, "keep-pack", &keep_pack_list, N_("name"),
> @@ -834,6 +840,12 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
>  			die(_("options '%s' and '%s' cannot be used together"), "--cruft", "-k");
>  	}
>  
> +	if (po_args.filter && !has_promisor_remote())
> +		die("a promisor remote must be setup\n"
> +		    "Also please push all the objects "
> +		    "that might be filtered to that remote!\n"
> +		    "Otherwise they will be lost!");
> +

>  	if (write_bitmaps < 0) {
>  		if (!write_midx &&
>  		    (!(pack_everything & ALL_INTO_ONE) || !is_bare_repository()))
> @@ -971,6 +983,8 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
>  		if (line.len != the_hash_algo->hexsz)
>  			die(_("repack: Expecting full hex object ID lines only from pack-objects."));
>  		item = string_list_append(&names, line.buf);
> +		if (po_args.filter)
> +			write_promisor_file_1(line.buf);
>  		item->util = populate_pack_exts(item->string);
>  	}
>  	strbuf_release(&line);
> diff --git a/t/t7700-repack.sh b/t/t7700-repack.sh
> index 4aabe98139..3a6ad9f623 100755
> --- a/t/t7700-repack.sh
> +++ b/t/t7700-repack.sh
> @@ -253,6 +253,21 @@ test_expect_success 'auto-bitmaps do not complain if unavailable' '
>  	test_must_be_empty actual
>  '
>  
> +test_expect_success 'repacking with a filter works' '
> +	test_when_finished "rm -rf server client" &&
> +	test_create_repo server &&
> +	git -C server config uploadpack.allowFilter true &&
> +	git -C server config uploadpack.allowAnySHA1InWant true &&
> +	test_commit -C server 1 &&
> +	git clone --bare --no-local server client &&
> +	git -C client config remote.origin.promisor true &&
> +	git -C client rev-list --objects --all --missing=print >objects &&
> +	test $(grep -c "^?" objects) = 0 &&
> +	git -C client -c repack.writebitmaps=false repack -a -d --filter=blob:none &&
> +	git -C client rev-list --objects --all --missing=print >objects &&
> +	test $(grep -c "^?" objects) = 1
> +'
> +
>  objdir=.git/objects
>  midx=$objdir/pack/multi-pack-index
>  
> -- 
> 2.39.0.59.g395bcb85bc.dirty
> 

Attachment: signature.asc
Description: PGP signature


[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]

  Powered by Linux