On Thu, Sep 07, 2023 at 05:52:04PM -0400, Taylor Blau wrote: [snip] > @@ -125,17 +133,39 @@ static void mark_packs_for_deletion_1(struct string_list *names, > if (len < hexsz) > continue; > sha1 = item->string + len - hexsz; > - /* > - * Mark this pack for deletion, which ensures that this > - * pack won't be included in a MIDX (if `--write-midx` > - * was given) and that we will actually delete this pack > - * (if `-d` was given). > - */ > - if (!string_list_has_string(names, sha1)) > - item->util = (void*)1; > + > + if (pack_is_retained(item)) { > + item->util = NULL; > + } else if (!string_list_has_string(names, sha1)) { > + /* > + * Mark this pack for deletion, which ensures > + * that this pack won't be included in a MIDX > + * (if `--write-midx` was given) and that we > + * will actually delete this pack (if `-d` was > + * given). > + */ > + item->util = DELETE_PACK; > + } I find the behaviour of this function a tad surprising as it doesn't only mark a pack for deletion, but it also marks a pack as not being retained anymore. Shouldn't we rather: if (pack_is_retained(item)) { // Theoretically speaking we shouldn't even do this bit here as // we _un_mark the pack for deletion. But at least we shouldn't // be removing the `RETAIN_PACK` bit, I'd think. item->util &= ~DELETE_PACK; } else if (!string_list_has_string(names, sha1)) { // And here we shouldn't discard the `RETAIN_PACK` bit either. item->util |= DELETE_PACK; } > } > } > > +static void retain_cruft_pack(struct existing_packs *existing, > + struct packed_git *cruft) > +{ > + struct strbuf buf = STRBUF_INIT; > + struct string_list_item *item; > + > + strbuf_addstr(&buf, pack_basename(cruft)); > + strbuf_strip_suffix(&buf, ".pack"); > + > + item = string_list_lookup(&existing->cruft_packs, buf.buf); > + if (!item) > + BUG("could not find cruft pack '%s'", pack_basename(cruft)); > + > + item->util = (void*)RETAIN_PACK; > + strbuf_release(&buf); > +} > + Similarly, should we handle potentially pre-existing bits gracefully and `item->util |= RETAIN_PACK`? > static void mark_packs_for_deletion(struct existing_packs *existing, > struct string_list *names) > > @@ -217,6 +247,8 @@ static void collect_pack_filenames(struct existing_packs *existing, > } > > string_list_sort(&existing->kept_packs); > + string_list_sort(&existing->non_kept_packs); > + string_list_sort(&existing->cruft_packs); > strbuf_release(&buf); > } > > @@ -799,6 +831,72 @@ static void remove_redundant_bitmaps(struct string_list *include, > strbuf_release(&path); > } > > +static int existing_cruft_pack_cmp(const void *va, const void *vb) > +{ > + struct packed_git *a = *(struct packed_git **)va; > + struct packed_git *b = *(struct packed_git **)vb; > + > + if (a->pack_size < b->pack_size) > + return -1; > + if (a->pack_size > b->pack_size) > + return 1; > + return 0; > +} > + > +static void collapse_small_cruft_packs(FILE *in, unsigned long max_size, We might want to use `size_t` to denote file sizes instead of `unsigned long`. > + struct existing_packs *existing) > +{ > + struct packed_git **existing_cruft, *p; > + struct strbuf buf = STRBUF_INIT; > + unsigned long total_size = 0; Here, as well. > + size_t existing_cruft_nr = 0; > + size_t i; > + > + ALLOC_ARRAY(existing_cruft, existing->cruft_packs.nr); > + > + for (p = get_all_packs(the_repository); p; p = p->next) { > + if (!(p->is_cruft && p->pack_local)) > + continue; > + > + strbuf_reset(&buf); > + strbuf_addstr(&buf, pack_basename(p)); > + strbuf_strip_suffix(&buf, ".pack"); > + > + if (!string_list_has_string(&existing->cruft_packs, buf.buf)) > + continue; > + > + if (existing_cruft_nr >= existing->cruft_packs.nr) > + BUG("too many cruft packs (found %"PRIuMAX", but knew " > + "of %"PRIuMAX")", > + (uintmax_t)existing_cruft_nr + 1, > + (uintmax_t)existing->cruft_packs.nr); > + existing_cruft[existing_cruft_nr++] = p; > + } > + > + QSORT(existing_cruft, existing_cruft_nr, existing_cruft_pack_cmp); > + > + for (i = 0; i < existing_cruft_nr; i++) { > + off_t proposed; This should also be `size_t` given that `st_add` returns `size_t` and not `off_t`. > + p = existing_cruft[i]; > + proposed = st_add(total_size, p->pack_size); > + > + if (proposed <= max_size) { > + total_size = proposed; > + fprintf(in, "-%s\n", pack_basename(p)); > + } else { > + retain_cruft_pack(existing, p); > + fprintf(in, "%s\n", pack_basename(p)); > + } It's a bit funny that we re-check whether we have exceeded the maximum size in subsequente iterations once we hit the limit, but it arguably makes the logic a bit simpler. > + } > + > + for (i = 0; i < existing->non_kept_packs.nr; i++) > + fprintf(in, "-%s.pack\n", > + existing->non_kept_packs.items[i].string); > + > + strbuf_release(&buf); > +} > + > static int write_cruft_pack(const struct pack_objects_args *args, > const char *destination, > const char *pack_prefix, > @@ -846,10 +944,18 @@ static int write_cruft_pack(const struct pack_objects_args *args, > in = xfdopen(cmd.in, "w"); > for_each_string_list_item(item, names) > fprintf(in, "%s-%s.pack\n", pack_prefix, item->string); > - for_each_string_list_item(item, &existing->non_kept_packs) > - fprintf(in, "-%s.pack\n", item->string); > - for_each_string_list_item(item, &existing->cruft_packs) > - fprintf(in, "-%s.pack\n", item->string); > + if (args->max_pack_size && !cruft_expiration) { > + unsigned long max_pack_size; > + if (!git_parse_ulong(args->max_pack_size, &max_pack_size)) > + return error(_("could not parse --cruft-max-size: '%s'"), > + args->max_pack_size); > + collapse_small_cruft_packs(in, max_pack_size, existing); > + } else { > + for_each_string_list_item(item, &existing->non_kept_packs) > + fprintf(in, "-%s.pack\n", item->string); > + for_each_string_list_item(item, &existing->cruft_packs) > + fprintf(in, "-%s.pack\n", item->string); > + } If I understand correctly, we only collapse small cruft packs in case we're not expiring any objects at the same time. Is there an inherent reason why? I would imagine that it can indeed be useful to expire objects contained in cruft packs and then have git-repack(1) recombine whatever is left into larger packs. If the reason is basically "it's complicated" then that is fine with me, we can still implement the functionality at a later point in time. But until then I think that we should let callers know that the two options are incompatible with each other by producing an error when both are passed. Patrick
Attachment:
signature.asc
Description: PGP signature