On Thu, Jun 7, 2018 at 4:06 PM Derrick Stolee <stolee@xxxxxxxxx> wrote:
>
> Before writing a list of objects and their offsets to a multi-pack-index
> (MIDX), we need to collect the list of objects contained in the
> packfiles. There may be multiple copies of some objects, so this list
> must be deduplicated.

Can you just do a merge sort with a slight modification to ignore
duplicates? The individual .idx files are already sorted by object id,
after all.
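Roughly like the sketch below. This is only an illustration of the idea,
not a patch: it merges two streams for brevity (the real thing would do
a k-way merge across all packs), and it uses a made-up toy_entry type
with an int "oid" instead of the real struct pack_midx_entry, so every
name in it is hypothetical:

#include <stddef.h>

/* Illustration only: a simplified stand-in for struct pack_midx_entry. */
struct toy_entry {
	int oid;	/* stands in for the real object_id */
	long mtime;	/* mtime of the containing packfile */
};

/*
 * Merge two arrays that are already sorted by oid, emitting each oid
 * once; when both sides have the same oid, keep the copy from the
 * more recently modified pack. Returns the number of entries written.
 */
size_t merge_dedup(const struct toy_entry *a, size_t na,
		   const struct toy_entry *b, size_t nb,
		   struct toy_entry *out)
{
	size_t i = 0, j = 0, n = 0;

	while (i < na || j < nb) {
		struct toy_entry e;

		if (j >= nb || (i < na && a[i].oid < b[j].oid)) {
			e = a[i++];
		} else if (i >= na || b[j].oid < a[i].oid) {
			e = b[j++];
		} else {
			/* duplicate oid: prefer the newer pack */
			e = (a[i].mtime >= b[j].mtime) ? a[i] : b[j];
			i++;
			j++;
		}

		/* drop any further copies of an oid already emitted */
		if (n && out[n - 1].oid == e.oid)
			continue;
		out[n++] = e;
	}
	return n;
}

No per-fanout qsort() needed, since the inputs are sorted already.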
>
> It is possible to artificially get into a state where there are many
> duplicate copies of objects. That can create high memory pressure if we
> are to create a list of all objects before de-duplication. To reduce
> this memory pressure without a significant performance drop,
> automatically group objects by the first byte of their object id. Use
> the IDX fanout tables to group the data, copy to a local array, then
> sort.
>
> Copy only the de-duplicated entries. Select the duplicate based on the
> most-recent modified time of a packfile containing the object.
>
> Signed-off-by: Derrick Stolee <dstolee@xxxxxxxxxxxxx>
> ---
>  midx.c | 138 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 138 insertions(+)
>
> diff --git a/midx.c b/midx.c
> index 923acda72e..b20d52713c 100644
> --- a/midx.c
> +++ b/midx.c
> @@ -4,6 +4,7 @@
>  #include "csum-file.h"
>  #include "lockfile.h"
>  #include "object-store.h"
> +#include "packfile.h"
>  #include "midx.h"
>
>  #define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */
> @@ -190,6 +191,140 @@ static void sort_packs_by_name(char **pack_names, uint32_t nr_packs, uint32_t *p
>  	}
>  }
>
> +static uint32_t get_pack_fanout(struct packed_git *p, uint32_t value)
> +{
> +	const uint32_t *level1_ofs = p->index_data;
> +
> +	if (!level1_ofs) {
> +		if (open_pack_index(p))
> +			return 0;
> +		level1_ofs = p->index_data;
> +	}
> +
> +	if (p->index_version > 1) {
> +		level1_ofs += 2;
> +	}
> +
> +	return ntohl(level1_ofs[value]);
> +}

Maybe keep this in packfile.c, refactoring the fanout code in there if
necessary, to keep .idx file format knowledge in that file instead of
spreading it out further.

> +
> +struct pack_midx_entry {
> +	struct object_id oid;
> +	uint32_t pack_int_id;
> +	time_t pack_mtime;
> +	uint64_t offset;
> +};
> +
> +static int midx_oid_compare(const void *_a, const void *_b)
> +{
> +	struct pack_midx_entry *a = (struct pack_midx_entry *)_a;
> +	struct pack_midx_entry *b = (struct pack_midx_entry *)_b;

Try not to lose "const" while typecasting.

> +	int cmp = oidcmp(&a->oid, &b->oid);
> +
> +	if (cmp)
> +		return cmp;
> +
> +	if (a->pack_mtime > b->pack_mtime)
> +		return -1;
> +	else if (a->pack_mtime < b->pack_mtime)
> +		return 1;
> +
> +	return a->pack_int_id - b->pack_int_id;
> +}
> +
> +static void fill_pack_entry(uint32_t pack_int_id,
> +			    struct packed_git *p,
> +			    uint32_t cur_object,
> +			    struct pack_midx_entry *entry)
> +{
> +	if (!nth_packed_object_oid(&entry->oid, p, cur_object))
> +		die("failed to located object %d in packfile", cur_object);

This die() message should be marked for translation with _().

> +
> +	entry->pack_int_id = pack_int_id;
> +	entry->pack_mtime = p->mtime;
> +
> +	entry->offset = nth_packed_object_offset(p, cur_object);
> +}
> +
> +/*
> + * It is possible to artificially get into a state where there are many
> + * duplicate copies of objects. That can create high memory pressure if
> + * we are to create a list of all objects before de-duplication. To reduce
> + * this memory pressure without a significant performance drop, automatically
> + * group objects by the first byte of their object id. Use the IDX fanout
> + * tables to group the data, copy to a local array, then sort.
> + *
> + * Copy only the de-duplicated entries (selected by most-recent modified time
> + * of a packfile containing the object).
> + */
> +static struct pack_midx_entry *get_sorted_entries(struct packed_git **p,
> +						  uint32_t *perm,
> +						  uint32_t nr_packs,
> +						  uint32_t *nr_objects)
> +{
> +	uint32_t cur_fanout, cur_pack, cur_object;
> +	uint32_t nr_fanout, alloc_fanout, alloc_objects, total_objects = 0;
> +	struct pack_midx_entry *entries_by_fanout = NULL;
> +	struct pack_midx_entry *deduplicated_entries = NULL;
> +
> +	for (cur_pack = 0; cur_pack < nr_packs; cur_pack++) {
> +		if (open_pack_index(p[cur_pack]))
> +			continue;

Is it a big problem if you fail to open the .idx file for a certain
pack? Should we error out and abort instead of continuing on? Later on,
in the second pack loop, when get_pack_fanout() returns zero (failure),
you don't seem to catch it and skip the pack.

> +
> +		total_objects += p[cur_pack]->num_objects;
> +	}
> +
> +	/*
> +	 * As we de-duplicate by fanout value, we expect the fanout
> +	 * slices to be evenly distributed, with some noise. Hence,
> +	 * allocate slightly more than one 256th.
> +	 */
> +	alloc_objects = alloc_fanout = total_objects > 3200 ? total_objects / 200 : 16;
> +
> +	ALLOC_ARRAY(entries_by_fanout, alloc_fanout);
> +	ALLOC_ARRAY(deduplicated_entries, alloc_objects);
> +	*nr_objects = 0;
> +
> +	for (cur_fanout = 0; cur_fanout < 256; cur_fanout++) {
> +		nr_fanout = 0;

Keep variable scope small; declare nr_fanout here instead of at the top
of the function.

> +
> +		for (cur_pack = 0; cur_pack < nr_packs; cur_pack++) {
> +			uint32_t start = 0, end;
> +
> +			if (cur_fanout)
> +				start = get_pack_fanout(p[cur_pack], cur_fanout - 1);
> +			end = get_pack_fanout(p[cur_pack], cur_fanout);
> +
> +			for (cur_object = start; cur_object < end; cur_object++) {
> +				ALLOC_GROW(entries_by_fanout, nr_fanout + 1, alloc_fanout);
> +				fill_pack_entry(perm[cur_pack], p[cur_pack], cur_object, &entries_by_fanout[nr_fanout]);
> +				nr_fanout++;
> +			}
> +		}
> +
> +		QSORT(entries_by_fanout, nr_fanout, midx_oid_compare);
> +
> +		/*
> +		 * The batch is now sorted by OID and then mtime (descending).
> +		 * Take only the first duplicate.
> +		 */
> +		for (cur_object = 0; cur_object < nr_fanout; cur_object++) {
> +			if (cur_object && !oidcmp(&entries_by_fanout[cur_object - 1].oid,
> +						  &entries_by_fanout[cur_object].oid))
> +				continue;
> +
> +			ALLOC_GROW(deduplicated_entries, *nr_objects + 1, alloc_objects);
> +			memcpy(&deduplicated_entries[*nr_objects],
> +			       &entries_by_fanout[cur_object],
> +			       sizeof(struct pack_midx_entry));
> +			(*nr_objects)++;
> +		}
> +	}
> +
> +	FREE_AND_NULL(entries_by_fanout);
> +	return deduplicated_entries;
> +}
> +
>  static size_t write_midx_pack_lookup(struct hashfile *f,
>  				     char **pack_names,
>  				     uint32_t nr_packs)
> @@ -254,6 +389,7 @@ int write_midx_file(const char *object_dir)
>  	uint64_t written = 0;
>  	uint32_t chunk_ids[MIDX_MAX_CHUNKS + 1];
>  	uint64_t chunk_offsets[MIDX_MAX_CHUNKS + 1];
> +	uint32_t nr_entries;
>
>  	midx_name = get_midx_filename(object_dir);
>  	if (safe_create_leading_directories(midx_name)) {
> @@ -312,6 +448,8 @@ int write_midx_file(const char *object_dir)
>  	ALLOC_ARRAY(pack_perm, nr_packs);
>  	sort_packs_by_name(pack_names, nr_packs, pack_perm);
>
> +	get_sorted_entries(packs, pack_perm, nr_packs, &nr_entries);

Intentionally ignoring the return value (and temporarily leaking it as
a result) deserves at least a comment to acknowledge it and save
reviewers some head-scratching. Or even better, just free it now, even
though you don't use it yet.
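Something like this, say (a sketch against this patch; the local
variable name "entries" is my invention, the rest is already here):

	struct pack_midx_entry *entries;
	...
	/*
	 * The object entries are not written to the MIDX yet; free
	 * them immediately so we do not leak until the patch that
	 * actually consumes them lands.
	 */
	entries = get_sorted_entries(packs, pack_perm, nr_packs, &nr_entries);
	FREE_AND_NULL(entries);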
> +
>  	hold_lock_file_for_update(&lk, midx_name, LOCK_DIE_ON_ERROR);
>  	f = hashfd(lk.tempfile->fd, lk.tempfile->filename.buf);
>  	FREE_AND_NULL(midx_name);
> --
> 2.18.0.rc1
>
--
Duy