On 12/08/18 06:11, Christian Couder wrote: > From: Jeff King <peff@xxxxxxxx> > > Hosting providers that allow users to "fork" existing > repos want those forks to share as much disk space as > possible. > > Alternates are an existing solution to keep all the > objects from all the forks into a unique central repo, > but this can have some drawbacks. Especially when > packing the central repo, deltas will be created > between objects from different forks. > > This can make cloning or fetching a fork much slower > and much more CPU intensive as Git might have to > compute new deltas for many objects to avoid sending > objects from a different fork. > > Because the inefficiency primarily arises when an > object is delitified against another object that does s/delitified/deltified/ ? > not exist in the same fork, we partition objects into > sets that appear in the same fork, and define > "delta islands". When finding delta base, we do not > allow an object outside the same island to be > considered as its base. > > So "delta islands" is a way to store objects from > different forks in the same repo and packfile without > having deltas between objects from different forks. > > This patch implements the delta islands mechanism in > "delta-islands.{c,h}", but does not yet make use of it. > > A few new fields are added in 'struct object_entry' > in "pack-objects.h" though. > > The documentation will follow in a patch that actually > uses delta islands in "builtin/pack-objects.c". > > Signed-off-by: Jeff King <peff@xxxxxxxx> > Signed-off-by: Christian Couder <chriscool@xxxxxxxxxxxxx> > --- > Makefile | 1 + > delta-islands.c | 493 ++++++++++++++++++++++++++++++++++++++++++++++++ > delta-islands.h | 11 ++ > pack-objects.h | 4 + > 4 files changed, 509 insertions(+) > create mode 100644 delta-islands.c > create mode 100644 delta-islands.h > > diff --git a/Makefile b/Makefile > index bc4fc8eeab..e7994888e8 100644 > --- a/Makefile > +++ b/Makefile > @@ -841,6 +841,7 @@ LIB_OBJS += csum-file.o > LIB_OBJS += ctype.o > LIB_OBJS += date.o > LIB_OBJS += decorate.o > +LIB_OBJS += delta-islands.o > LIB_OBJS += diffcore-break.o > LIB_OBJS += diffcore-delta.o > LIB_OBJS += diffcore-order.o > diff --git a/delta-islands.c b/delta-islands.c > new file mode 100644 > index 0000000000..c0b0da6837 > --- /dev/null > +++ b/delta-islands.c > @@ -0,0 +1,493 @@ > +#include "cache.h" > +#include "attr.h" > +#include "object.h" > +#include "blob.h" > +#include "commit.h" > +#include "tag.h" > +#include "tree.h" > +#include "delta.h" > +#include "pack.h" > +#include "tree-walk.h" > +#include "diff.h" > +#include "revision.h" > +#include "list-objects.h" > +#include "progress.h" > +#include "refs.h" > +#include "khash.h" I was wondering how many copies of the inline functions introduced by this header we had, so: $ nm git | grep ' t ' | cut -d' ' -f3 | sort | uniq -c | sort -rn | grep kh_ 3 kh_resize_sha1 3 kh_put_sha1 3 kh_init_sha1 3 kh_get_sha1 1 kh_resize_str 1 kh_resize_sha1_pos 1 kh_put_str 1 kh_put_sha1_pos 1 kh_init_str 1 kh_init_sha1_pos 1 kh_get_str 1 kh_get_sha1_pos 1 kh_destroy_sha1 $ Looking at the individual object files, we see: $ nm pack-bitmap-write.o | grep ' t ' | grep kh_ 00000000000001cc t kh_get_sha1 00000000000001b7 t kh_init_sha1 000000000000085e t kh_put_sha1 0000000000000310 t kh_resize_sha1 $ So, the two instances of the sha1 hash-map are never destroyed (kh_destroy_sha1 is not present in the object file). $ nm pack-bitmap.o | grep ' t ' | grep kh_ 00000000000002d9 t kh_destroy_sha1 000000000000032b t kh_get_sha1 0000000000000daa t kh_get_sha1_pos 00000000000002c4 t kh_init_sha1 0000000000000d95 t kh_init_sha1_pos 00000000000009bd t kh_put_sha1 0000000000001432 t kh_put_sha1_pos 000000000000046f t kh_resize_sha1 0000000000000eee t kh_resize_sha1_pos $ The sha1_pos hash-map is not destroyed here. $ nm delta-islands.o | grep ' t ' | grep kh_ 00000000000002be t kh_get_sha1 0000000000000e52 t kh_get_str 00000000000002a9 t kh_init_sha1 0000000000000e3d t kh_init_str 0000000000000950 t kh_put_sha1 00000000000014e4 t kh_put_str 0000000000000402 t kh_resize_sha1 0000000000000f96 t kh_resize_str $ And neither the sha1 or str hash-maps are destroyed here. (That is not necessarily a problem, of course! ;-) ) > +#include "pack-bitmap.h" > +#include "pack-objects.h" > +#include "delta-islands.h" > +#include "sha1-array.h" > +#include "config.h" > + > +KHASH_INIT(str, const char *, void *, 1, kh_str_hash_func, kh_str_hash_equal) > + > +static khash_sha1 *island_marks; > +static unsigned island_counter; > +static unsigned island_counter_core; > + > +static kh_str_t *remote_islands; > + > +struct remote_island { > + uint64_t hash; > + struct oid_array oids; > +}; > + > +struct island_bitmap { > + uint32_t refcount; > + uint32_t bits[]; Use FLEX_ARRAY here? We are slowly moving toward requiring certain C99 features, but I can't remember a flex array weather-balloon patch. > +}; > + > +static uint32_t island_bitmap_size; > + > +/* > + * Allocate a new bitmap; if "old" is not NULL, the new bitmap will be a copy > + * of "old". Otherwise, the new bitmap is empty. > + */ > +static struct island_bitmap *island_bitmap_new(const struct island_bitmap *old) > +{ > + size_t size = sizeof(struct island_bitmap) + (island_bitmap_size * 4); > + struct island_bitmap *b = xcalloc(1, size); > + > + if (old) > + memcpy(b, old, size); > + > + b->refcount = 1; > + return b; > +} > + > +static void island_bitmap_or(struct island_bitmap *a, const struct island_bitmap *b) > +{ > + uint32_t i; > + > + for (i = 0; i < island_bitmap_size; ++i) > + a->bits[i] |= b->bits[i]; > +} > + > +static int island_bitmap_is_subset(struct island_bitmap *self, > + struct island_bitmap *super) > +{ > + uint32_t i; > + > + if (self == super) > + return 1; > + > + for (i = 0; i < island_bitmap_size; ++i) { > + if ((self->bits[i] & super->bits[i]) != self->bits[i]) > + return 0; > + } > + > + return 1; > +} > + > +#define ISLAND_BITMAP_BLOCK(x) (x / 32) > +#define ISLAND_BITMAP_MASK(x) (1 << (x % 32)) > + > +static void island_bitmap_set(struct island_bitmap *self, uint32_t i) > +{ > + self->bits[ISLAND_BITMAP_BLOCK(i)] |= ISLAND_BITMAP_MASK(i); > +} > + > +static int island_bitmap_get(struct island_bitmap *self, uint32_t i) > +{ > + return (self->bits[ISLAND_BITMAP_BLOCK(i)] & ISLAND_BITMAP_MASK(i)) != 0; > +} > + > +int in_same_island(const struct object_id *trg_oid, const struct object_id *src_oid) Hmm, what does the trg_ prefix stand for? > +{ > + khiter_t trg_pos, src_pos; > + > + /* If we aren't using islands, assume everything goes together. */ > + if (!island_marks) > + return 1; > + > + /* > + * If we don't have a bitmap for the target, we can delta it ... Ah, OK, trg_ => target. > + * against anything -- it's not an important object > + */ > + trg_pos = kh_get_sha1(island_marks, trg_oid->hash); > + if (trg_pos >= kh_end(island_marks)) > + return 1; > + > + /* > + * if the source (our delta base) doesn't have a bitmap, > + * we don't want to base any deltas on it! > + */ > + src_pos = kh_get_sha1(island_marks, src_oid->hash); > + if (src_pos >= kh_end(island_marks)) > + return 0; > + > + return island_bitmap_is_subset(kh_value(island_marks, trg_pos), > + kh_value(island_marks, src_pos)); > +} > + > +int island_delta_cmp(const struct object_id *a, const struct object_id *b) > +{ > + khiter_t a_pos, b_pos; > + struct island_bitmap *a_bitmap = NULL, *b_bitmap = NULL; > + > + if (!island_marks) > + return 0; > + > + a_pos = kh_get_sha1(island_marks, a->hash); > + if (a_pos < kh_end(island_marks)) > + a_bitmap = kh_value(island_marks, a_pos); > + > + b_pos = kh_get_sha1(island_marks, b->hash); > + if (b_pos < kh_end(island_marks)) > + b_bitmap = kh_value(island_marks, b_pos); > + > + if (a_bitmap) { > + if (!b_bitmap || !island_bitmap_is_subset(a_bitmap, b_bitmap)) > + return -1; > + } > + if (b_bitmap) { > + if (!a_bitmap || !island_bitmap_is_subset(b_bitmap, a_bitmap)) > + return 1; > + } > + > + return 0; > +} > + > +static struct island_bitmap *create_or_get_island_marks(struct object *obj) > +{ > + khiter_t pos; > + int hash_ret; > + > + pos = kh_put_sha1(island_marks, obj->oid.hash, &hash_ret); > + if (hash_ret) > + kh_value(island_marks, pos) = island_bitmap_new(NULL); > + > + return kh_value(island_marks, pos); > +} > + > +static void set_island_marks(struct object *obj, struct island_bitmap *marks) > +{ > + struct island_bitmap *b; > + khiter_t pos; > + int hash_ret; > + > + pos = kh_put_sha1(island_marks, obj->oid.hash, &hash_ret); > + if (hash_ret) { > + /* > + * We don't have one yet; make a copy-on-write of the > + * parent. > + */ > + marks->refcount++; > + kh_value(island_marks, pos) = marks; > + return; > + } > + > + /* > + * We do have it. Make sure we split any copy-on-write before > + * updating. > + */ > + b = kh_value(island_marks, pos); > + if (b->refcount > 1) { > + b->refcount--; > + b = kh_value(island_marks, pos) = island_bitmap_new(b); > + } > + island_bitmap_or(b, marks); > +} > + > +static void mark_remote_island_1(struct remote_island *rl, int is_core_island) > +{ > + uint32_t i; > + > + for (i = 0; i < rl->oids.nr; ++i) { > + struct island_bitmap *marks; > + struct object *obj = parse_object(the_repository, &rl->oids.oid[i]); > + > + if (!obj) > + continue; > + > + marks = create_or_get_island_marks(obj); > + island_bitmap_set(marks, island_counter); > + > + if (is_core_island && obj->type == OBJ_COMMIT) > + obj->flags |= NEEDS_BITMAP; > + > + /* If it was a tag, also make sure we hit the underlying object. */ > + while (obj && obj->type == OBJ_TAG) { > + obj = ((struct tag *)obj)->tagged; > + if (obj) { > + parse_object(the_repository, &obj->oid); > + marks = create_or_get_island_marks(obj); > + island_bitmap_set(marks, island_counter); > + } > + } > + } > + > + if (is_core_island) > + island_counter_core = island_counter; > + > + island_counter++; > +} > + > +static int cmp_tree_depth(const void *va, const void *vb) > +{ > + struct object_entry *a = *(struct object_entry **)va; > + struct object_entry *b = *(struct object_entry **)vb; > + return a->tree_depth - b->tree_depth; > +} > + > +void resolve_tree_islands(int progress, struct packing_data *to_pack) > +{ > + struct progress *progress_state = NULL; > + struct object_entry **todo; > + int nr = 0; > + int i; > + > + if (!island_marks) > + return; > + > + /* > + * We process only trees, as commits and tags have already been handled > + * (and passed their marks on to root trees, as well. We must make sure > + * to process them in descending tree-depth order so that marks > + * propagate down the tree properly, even if a sub-tree is found in > + * multiple parent trees. > + */ > + ALLOC_ARRAY(todo, to_pack->nr_objects); > + for (i = 0; i < to_pack->nr_objects; i++) { > + if (oe_type(&to_pack->objects[i]) == OBJ_TREE) > + todo[nr++] = &to_pack->objects[i]; > + } > + QSORT(todo, nr, cmp_tree_depth); > + > + if (progress) > + progress_state = start_progress(_("Propagating island marks"), nr); > + > + for (i = 0; i < nr; i++) { > + struct object_entry *ent = todo[i]; > + struct island_bitmap *root_marks; > + struct tree *tree; > + struct tree_desc desc; > + struct name_entry entry; > + khiter_t pos; > + > + pos = kh_get_sha1(island_marks, ent->idx.oid.hash); > + if (pos >= kh_end(island_marks)) > + continue; > + > + root_marks = kh_value(island_marks, pos); > + > + tree = lookup_tree(the_repository, &ent->idx.oid); > + if (!tree || parse_tree(tree) < 0) > + die(_("bad tree object %s"), oid_to_hex(&ent->idx.oid)); > + > + init_tree_desc(&desc, tree->buffer, tree->size); > + while (tree_entry(&desc, &entry)) { > + struct object *obj; > + > + if (S_ISGITLINK(entry.mode)) > + continue; > + > + obj = lookup_object(the_repository, entry.oid->hash); > + if (!obj) > + continue; > + > + set_island_marks(obj, root_marks); > + } > + > + free_tree_buffer(tree); > + > + display_progress(progress_state, i+1); > + } > + > + stop_progress(&progress_state); > + free(todo); > +} > + > +static regex_t *island_regexes; > +static unsigned int island_regexes_alloc, island_regexes_nr; > +static const char *core_island_name; > + > +static int island_config_callback(const char *k, const char *v, void *cb) > +{ > + if (!strcmp(k, "pack.island")) { > + struct strbuf re = STRBUF_INIT; > + > + if (!v) > + return config_error_nonbool(k); > + > + ALLOC_GROW(island_regexes, island_regexes_nr + 1, island_regexes_alloc); > + > + if (*v != '^') > + strbuf_addch(&re, '^'); > + strbuf_addstr(&re, v); > + > + if (regcomp(&island_regexes[island_regexes_nr], re.buf, REG_EXTENDED)) > + die(_("failed to load island regex for '%s': %s"), k, re.buf); > + > + strbuf_release(&re); > + island_regexes_nr++; > + return 0; > + } > + > + if (!strcmp(k, "pack.islandcore")) > + return git_config_string(&core_island_name, k, v); > + > + return 0; > +} > + > +static void add_ref_to_island(const char *island_name, const struct object_id *oid) > +{ > + uint64_t sha_core; > + struct remote_island *rl = NULL; > + > + int hash_ret; > + khiter_t pos = kh_put_str(remote_islands, island_name, &hash_ret); > + > + if (hash_ret) { > + kh_key(remote_islands, pos) = xstrdup(island_name); > + kh_value(remote_islands, pos) = xcalloc(1, sizeof(struct remote_island)); > + } > + > + rl = kh_value(remote_islands, pos); > + oid_array_append(&rl->oids, oid); > + > + memcpy(&sha_core, oid->hash, sizeof(uint64_t)); > + rl->hash += sha_core; Hmm, so the first 64-bits of the oid of each ref that is part of this island is added together as a 'hash' for the island. And this is used to de-duplicate the islands? Any false positives? (does it matter - it would only affect performance, not correctness, right?) > +} > + > +static int find_island_for_ref(const char *refname, const struct object_id *oid, > + int flags, void *data) > +{ > + /* > + * We should advertise 'ARRAY_SIZE(matches) - 2' as the max, > + * so we can diagnose below a config with more capture groups > + * than we support. > + */ > + regmatch_t matches[16]; > + int i, m; > + struct strbuf island_name = STRBUF_INIT; > + > + /* walk backwards to get last-one-wins ordering */ > + for (i = island_regexes_nr - 1; i >= 0; i--) { > + if (!regexec(&island_regexes[i], refname, > + ARRAY_SIZE(matches), matches, 0)) > + break; > + } > + > + if (i < 0) > + return 0; > + > + if (matches[ARRAY_SIZE(matches) - 1].rm_so != -1) > + warning(_("island regex from config has " > + "too many capture groups (max=%d)"), > + (int)ARRAY_SIZE(matches) - 2); > + > + for (m = 1; m < ARRAY_SIZE(matches); m++) { > + regmatch_t *match = &matches[m]; > + > + if (match->rm_so == -1) > + continue; > + > + if (island_name.len) > + strbuf_addch(&island_name, '-'); > + > + strbuf_add(&island_name, refname + match->rm_so, match->rm_eo - match->rm_so); > + } > + > + add_ref_to_island(island_name.buf, oid); > + strbuf_release(&island_name); > + return 0; > +} > + > +static struct remote_island *get_core_island(void) > +{ > + if (core_island_name) { > + khiter_t pos = kh_get_str(remote_islands, core_island_name); > + if (pos < kh_end(remote_islands)) > + return kh_value(remote_islands, pos); > + } > + > + return NULL; > +} > + > +static void deduplicate_islands(void) > +{ > + struct remote_island *island, *core = NULL, **list; > + unsigned int island_count, dst, src, ref, i = 0; > + > + island_count = kh_size(remote_islands); > + ALLOC_ARRAY(list, island_count); > + > + kh_foreach_value(remote_islands, island, { > + list[i++] = island; > + }); > + > + for (ref = 0; ref + 1 < island_count; ref++) { > + for (src = ref + 1, dst = src; src < island_count; src++) { > + if (list[ref]->hash == list[src]->hash) > + continue; > + > + if (src != dst) > + list[dst] = list[src]; > + > + dst++; > + } > + island_count = dst; > + } > + > + island_bitmap_size = (island_count / 32) + 1; > + core = get_core_island(); > + > + for (i = 0; i < island_count; ++i) { > + mark_remote_island_1(list[i], core && list[i]->hash == core->hash); > + } > + > + free(list); > +} > + > +void load_delta_islands(void) > +{ > + island_marks = kh_init_sha1(); > + remote_islands = kh_init_str(); > + > + git_config(island_config_callback, NULL); > + for_each_ref(find_island_for_ref, NULL); > + deduplicate_islands(); > + > + fprintf(stderr, _("Marked %d islands, done.\n"), island_counter); > +} > + > +void propagate_island_marks(struct commit *commit) > +{ > + khiter_t pos = kh_get_sha1(island_marks, commit->object.oid.hash); > + > + if (pos < kh_end(island_marks)) { > + struct commit_list *p; > + struct island_bitmap *root_marks = kh_value(island_marks, pos); > + > + parse_commit(commit); > + set_island_marks(&get_commit_tree(commit)->object, root_marks); > + for (p = commit->parents; p; p = p->next) > + set_island_marks(&p->item->object, root_marks); > + } > +} > + > +int compute_pack_layers(struct packing_data *to_pack) > +{ > + uint32_t i; > + > + if (!core_island_name || !island_marks) > + return 1; > + > + for (i = 0; i < to_pack->nr_objects; ++i) { > + struct object_entry *entry = &to_pack->objects[i]; > + khiter_t pos = kh_get_sha1(island_marks, entry->idx.oid.hash); > + > + entry->layer = 1; > + > + if (pos < kh_end(island_marks)) { > + struct island_bitmap *bitmap = kh_value(island_marks, pos); > + > + if (island_bitmap_get(bitmap, island_counter_core)) > + entry->layer = 0; > + } > + } > + > + return 2; > +} > diff --git a/delta-islands.h b/delta-islands.h > new file mode 100644 > index 0000000000..f9725730f4 > --- /dev/null > +++ b/delta-islands.h > @@ -0,0 +1,11 @@ > +#ifndef DELTA_ISLANDS_H > +#define DELTA_ISLANDS_H > + > +int island_delta_cmp(const struct object_id *a, const struct object_id *b); > +int in_same_island(const struct object_id *, const struct object_id *); > +void resolve_tree_islands(int progress, struct packing_data *to_pack); > +void load_delta_islands(void); > +void propagate_island_marks(struct commit *commit); > +int compute_pack_layers(struct packing_data *to_pack); > + > +#endif /* DELTA_ISLANDS_H */ > diff --git a/pack-objects.h b/pack-objects.h > index edf74dabdd..8eecd67991 100644 > --- a/pack-objects.h > +++ b/pack-objects.h > @@ -100,6 +100,10 @@ struct object_entry { > unsigned type_:TYPE_BITS; > unsigned no_try_delta:1; > unsigned in_pack_type:TYPE_BITS; /* could be delta */ > + > + unsigned int tree_depth; /* should be repositioned for packing? */ > + unsigned char layer; > + > unsigned preferred_base:1; /* > * we do not pack this, but is available > * to be used as the base object to delta > Sorry, I spent so long reading this patch, I have run out of time tonight (and I am busy tomorrow) to read the rest of the series. ATB, Ramsay Jones