It is possible to artificially get into a state where there are many
duplicate copies of objects. That can create high memory pressure if we
are to create a list of all objects before de-duplication. To reduce
this memory pressure without a significant performance drop,
automatically group objects by the first byte of their object id. Use
the IDX fanout tables to group the data, copy to a local array, then
sort.
Copy only the de-duplicated entries. Select the duplicate based on the
most-recent modified time of a packfile containing the object.
Signed-off-by: Derrick Stolee <dstolee@xxxxxxxxxxxxx>
---
midx.c | 138 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 138 insertions(+)
diff --git a/midx.c b/midx.c
index 923acda72e..b20d52713c 100644
--- a/midx.c
+++ b/midx.c
@@ -4,6 +4,7 @@
#include "csum-file.h"
#include "lockfile.h"
#include "object-store.h"
+#include "packfile.h"
#include "midx.h"
#define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */
@@ -190,6 +191,140 @@ static void sort_packs_by_name(char **pack_names, uint32_t nr_packs, uint32_t *p
}
}
+static uint32_t get_pack_fanout(struct packed_git *p, uint32_t value)
+{
+ const uint32_t *level1_ofs = p->index_data;
+
+ if (!level1_ofs) {
+ if (open_pack_index(p))
+ return 0;
+ level1_ofs = p->index_data;
+ }
+
+ if (p->index_version > 1) {
+ level1_ofs += 2;
+ }
+
+ return ntohl(level1_ofs[value]);
+}
Maybe keep this in packfile,c, refactor fanout code in there if
necessary, keep .idx file format info in that file instead of
spreading out more.
+
+struct pack_midx_entry {
+ struct object_id oid;
+ uint32_t pack_int_id;
+ time_t pack_mtime;
+ uint64_t offset;
+};
+
+static int midx_oid_compare(const void *_a, const void *_b)
+{
+ struct pack_midx_entry *a = (struct pack_midx_entry *)_a;
+ struct pack_midx_entry *b = (struct pack_midx_entry *)_b;
Try not to lose "const" while typecasting.
+ int cmp = oidcmp(&a->oid, &b->oid);
+
+ if (cmp)
+ return cmp;
+
+ if (a->pack_mtime > b->pack_mtime)
+ return -1;
+ else if (a->pack_mtime < b->pack_mtime)
+ return 1;
+
+ return a->pack_int_id - b->pack_int_id;
+}
+
+static void fill_pack_entry(uint32_t pack_int_id,
+ struct packed_git *p,
+ uint32_t cur_object,
+ struct pack_midx_entry *entry)
+{
+ if (!nth_packed_object_oid(&entry->oid, p, cur_object))
+ die("failed to located object %d in packfile", cur_object);
_()
+
+ entry->pack_int_id = pack_int_id;
+ entry->pack_mtime = p->mtime;
+
+ entry->offset = nth_packed_object_offset(p, cur_object);
+}
+
+/*
+ * It is possible to artificially get into a state where there are many
+ * duplicate copies of objects. That can create high memory pressure if
+ * we are to create a list of all objects before de-duplication. To reduce
+ * this memory pressure without a significant performance drop, automatically
+ * group objects by the first byte of their object id. Use the IDX fanout
+ * tables to group the data, copy to a local array, then sort.
+ *
+ * Copy only the de-duplicated entries (selected by most-recent modified time
+ * of a packfile containing the object).
+ */
+static struct pack_midx_entry *get_sorted_entries(struct packed_git **p,
+ uint32_t *perm,
+ uint32_t nr_packs,
+ uint32_t *nr_objects)
+{
+ uint32_t cur_fanout, cur_pack, cur_object;
+ uint32_t nr_fanout, alloc_fanout, alloc_objects, total_objects = 0;
+ struct pack_midx_entry *entries_by_fanout = NULL;
+ struct pack_midx_entry *deduplicated_entries = NULL;
+
+ for (cur_pack = 0; cur_pack < nr_packs; cur_pack++) {
+ if (open_pack_index(p[cur_pack]))
+ continue;
Is it a big problem if you fail to open .idx for a certain pack?
Should we error out and abort instead of continuing on? Later on in
the second pack loop code when get_fanout return zero (failure), you
don't seem to catch it and skip the pack.
+
+ total_objects += p[cur_pack]->num_objects;
+ }
+
+ /*
+ * As we de-duplicate by fanout value, we expect the fanout
+ * slices to be evenly distributed, with some noise. Hence,
+ * allocate slightly more than one 256th.
+ */
+ alloc_objects = alloc_fanout = total_objects > 3200 ? total_objects / 200 : 16;
+
+ ALLOC_ARRAY(entries_by_fanout, alloc_fanout);
+ ALLOC_ARRAY(deduplicated_entries, alloc_objects);
+ *nr_objects = 0;
+
+ for (cur_fanout = 0; cur_fanout < 256; cur_fanout++) {
+ nr_fanout = 0;
Keep variable scope small, declare nr_fanout here instead of at the
top of the function.
+
+ for (cur_pack = 0; cur_pack < nr_packs; cur_pack++) {
+ uint32_t start = 0, end;
+
+ if (cur_fanout)
+ start = get_pack_fanout(p[cur_pack], cur_fanout - 1);
+ end = get_pack_fanout(p[cur_pack], cur_fanout);
+
+ for (cur_object = start; cur_object < end; cur_object++) {
+ ALLOC_GROW(entries_by_fanout, nr_fanout + 1, alloc_fanout);
+ fill_pack_entry(perm[cur_pack], p[cur_pack], cur_object, &entries_by_fanout[nr_fanout]);
+ nr_fanout++;
+ }
+ }
+
+ QSORT(entries_by_fanout, nr_fanout, midx_oid_compare);
+
+ /*
+ * The batch is now sorted by OID and then mtime (descending).
+ * Take only the first duplicate.
+ */
+ for (cur_object = 0; cur_object < nr_fanout; cur_object++) {
+ if (cur_object && !oidcmp(&entries_by_fanout[cur_object - 1].oid,
+ &entries_by_fanout[cur_object].oid))
+ continue;
+
+ ALLOC_GROW(deduplicated_entries, *nr_objects + 1, alloc_objects);
+ memcpy(&deduplicated_entries[*nr_objects],
+ &entries_by_fanout[cur_object],
+ sizeof(struct pack_midx_entry));
+ (*nr_objects)++;
+ }
+ }
+
+ FREE_AND_NULL(entries_by_fanout);
+ return deduplicated_entries;
+}
+
static size_t write_midx_pack_lookup(struct hashfile *f,
char **pack_names,
uint32_t nr_packs)
@@ -254,6 +389,7 @@ int write_midx_file(const char *object_dir)
uint64_t written = 0;
uint32_t chunk_ids[MIDX_MAX_CHUNKS + 1];
uint64_t chunk_offsets[MIDX_MAX_CHUNKS + 1];
+ uint32_t nr_entries;
midx_name = get_midx_filename(object_dir);
if (safe_create_leading_directories(midx_name)) {
@@ -312,6 +448,8 @@ int write_midx_file(const char *object_dir)
ALLOC_ARRAY(pack_perm, nr_packs);
sort_packs_by_name(pack_names, nr_packs, pack_perm);
+ get_sorted_entries(packs, pack_perm, nr_packs, &nr_entries);
Intentional ignoring return value (and temporary leaking as a result)
should have a least a comment to acknowledge it and save reviewers
some head scratching. Or even better, just free it now, even if you
don't use it.
+
hold_lock_file_for_update(&lk, midx_name, LOCK_DIE_ON_ERROR);
f = hashfd(lk.tempfile->fd, lk.tempfile->filename.buf);
FREE_AND_NULL(midx_name);
--
2.18.0.rc1