Hi, Here is a third reroll of my series to introduce an on-disk format for the reverse index. Since the first series (to introduce a new API) has been merged to 'next', this series has been rebased onto 'next', too. This version is largely unchanged from the previous, with the following exceptions: - Some commit messages and documentation have been clarified to address reviewer questions (including why we don't want to store offsets in the .rev file, and that all four-byte numbers are in network order, etc.). - A few more things are done on opening the revindex file, namely checking that the version/hash ID are known. - The idiom "load in memory" has been removed. - Other minor changes. A range diff is below. Thanks in advance for your review. Taylor Blau (10): packfile: prepare for the existence of '*.rev' files pack-write.c: prepare to write 'pack-*.rev' files builtin/index-pack.c: allow stripping arbitrary extensions builtin/index-pack.c: write reverse indexes builtin/pack-objects.c: respect 'pack.writeReverseIndex' Documentation/config/pack.txt: advertise 'pack.writeReverseIndex' t: prepare for GIT_TEST_WRITE_REV_INDEX t: support GIT_TEST_WRITE_REV_INDEX pack-revindex: ensure that on-disk reverse indexes are given precedence t5325: check both on-disk and in-memory reverse index Documentation/config/pack.txt | 7 ++ Documentation/git-index-pack.txt | 18 ++- Documentation/technical/pack-format.txt | 20 ++++ builtin/index-pack.c | 69 +++++++++-- builtin/pack-objects.c | 9 ++ builtin/repack.c | 1 + ci/run-build-and-tests.sh | 1 + object-store.h | 3 + pack-revindex.c | 148 ++++++++++++++++++++++-- pack-revindex.h | 14 ++- pack-write.c | 120 ++++++++++++++++++- pack.h | 4 + packfile.c | 13 ++- packfile.h | 1 + t/README | 3 + t/t5319-multi-pack-index.sh | 5 +- t/t5325-reverse-index.sh | 142 +++++++++++++++++++++++ t/t5604-clone-reference.sh | 2 +- t/t5702-protocol-v2.sh | 12 +- t/t6500-gc.sh | 6 +- t/t9300-fast-import.sh | 5 +- tmp-objdir.c | 6 +- 22 files changed, 567 insertions(+), 42 deletions(-) create mode 100755 t/t5325-reverse-index.sh Range-diff against v2: 21: 6742c15c84 ! 1: 6f8b70ab27 packfile: prepare for the existence of '*.rev' files @@ Commit message is versioned, so we are free to pursue these in the future.) But, cold cache performance likely isn't interesting outside of one-off cases like asking for the size of an object directly. In real-world usage, Git is - often performing many operations in the revindex, + often performing many operations in the revindex (i.e., asking about + many objects rather than a single one). The trade-off is worth it, since we will avoid the vast majority of the cost of generating the revindex that the extra pointer chase will look @@ Documentation/technical/pack-format.txt: Pack file entry: <+ + + - A 4-byte magic number '0x52494458' ('RIDX'). + -+ - A 4-byte version identifier (= 1) ++ - A 4-byte version identifier (= 1). + -+ - A 4-byte hash function identifier (= 1 for SHA-1, 2 for SHA-256) ++ - A 4-byte hash function identifier (= 1 for SHA-1, 2 for SHA-256). + -+ - A table of index positions, sorted by their corresponding offsets in the -+ packfile. ++ - A table of index positions (one per packed object, num_objects in ++ total, each a 4-byte unsigned integer in network order), sorted by ++ their corresponding offsets in the packfile. + + - A trailer, containing a: + + checksum of the corresponding packfile, and + + a checksum of all of the above. ++ ++All 4-byte numbers are in network order. + == multi-pack-index (MIDX) files have the following format: @@ object-store.h: struct packed_git { multi_pack_index:1; unsigned char hash[GIT_MAX_RAWSZ]; struct revindex_entry *revindex; -+ const void *revindex_data; -+ const void *revindex_map; ++ const uint32_t *revindex_data; ++ const uint32_t *revindex_map; + size_t revindex_size; /* something like ".git/objects/pack/xxxxx.pack" */ char pack_name[FLEX_ARRAY]; /* more */ @@ pack-revindex.c: static void create_pack_revindex(struct packed_git *p) } -int load_pack_revindex(struct packed_git *p) -+static int load_pack_revindex_from_memory(struct packed_git *p) ++static int create_pack_revindex_in_memory(struct packed_git *p) { - if (!p->revindex) { - if (open_pack_index(p)) @@ pack-revindex.c: static void create_pack_revindex(struct packed_git *p) + return xstrfmt("%.*s.rev", (int)len, p->pack_name); +} + -+#define RIDX_MIN_SIZE (12 + (2 * the_hash_algo->rawsz)) ++#define RIDX_HEADER_SIZE (12) ++#define RIDX_MIN_SIZE (RIDX_HEADER_SIZE + (2 * the_hash_algo->rawsz)) ++ ++struct revindex_header { ++ uint32_t signature; ++ uint32_t version; ++ uint32_t hash_id; ++}; + +static int load_revindex_from_disk(char *revindex_name, + uint32_t num_objects, -+ const void **data, size_t *len) ++ const uint32_t **data_p, size_t *len_p) +{ + int fd, ret = 0; + struct stat st; ++ void *data = NULL; + size_t revindex_size; ++ struct revindex_header *hdr; + + fd = git_open(revindex_name); + @@ pack-revindex.c: static void create_pack_revindex(struct packed_git *p) + goto cleanup; + } + -+ *len = revindex_size; -+ *data = xmmap(NULL, revindex_size, PROT_READ, MAP_PRIVATE, fd, 0); ++ data = xmmap(NULL, revindex_size, PROT_READ, MAP_PRIVATE, fd, 0); ++ hdr = data; ++ ++ if (ntohl(hdr->signature) != RIDX_SIGNATURE) { ++ ret = error(_("reverse-index file %s has unknown signature"), revindex_name); ++ goto cleanup; ++ } ++ if (ntohl(hdr->version) != 1) { ++ ret = error(_("reverse-index file %s has unsupported version %"PRIu32), ++ revindex_name, ntohl(hdr->version)); ++ goto cleanup; ++ } ++ if (!(ntohl(hdr->hash_id) == 1 || ntohl(hdr->hash_id) == 2)) { ++ ret = error(_("reverse-index file %s has unsupported hash id %"PRIu32), ++ revindex_name, ntohl(hdr->hash_id)); ++ goto cleanup; ++ } + +cleanup: ++ if (ret) { ++ if (data) ++ munmap(data, revindex_size); ++ } else { ++ *len_p = revindex_size; ++ *data_p = (const uint32_t *)data; ++ } ++ + close(fd); + return ret; +} @@ pack-revindex.c: static void create_pack_revindex(struct packed_git *p) + if (ret) + goto cleanup; + -+ p->revindex_data = (char *)p->revindex_map + 12; ++ p->revindex_data = (const uint32_t *)((const char *)p->revindex_map + RIDX_HEADER_SIZE); + +cleanup: + free(revindex_name); @@ pack-revindex.c: static void create_pack_revindex(struct packed_git *p) + + if (!load_pack_revindex_from_disk(p)) + return 0; -+ else if (!load_pack_revindex_from_memory(p)) ++ else if (!create_pack_revindex_in_memory(p)) + return 0; + return -1; +} @@ pack-revindex.c: int offset_to_pack_pos(struct packed_git *p, off_t ofs, uint32_ + if (p->revindex) + return p->revindex[pos].nr; + else -+ return get_be32((char *)p->revindex_data + (pos * sizeof(uint32_t))); ++ return get_be32(p->revindex_data + pos); } off_t pack_pos_to_offset(struct packed_git *p, uint32_t pos) @@ pack-revindex.c: int offset_to_pack_pos(struct packed_git *p, off_t ofs, uint32_ } ## pack-revindex.h ## -@@ pack-revindex.h: struct packed_git; +@@ + * can be found + */ + ++#define RIDX_SIGNATURE 0x52494458 /* "RIDX" */ ++#define RIDX_VERSION 1 ++ + struct packed_git; + /* * load_pack_revindex populates the revindex's internal data-structures for the * given pack, returning zero on success and a negative value otherwise. + * -+ * If a '.rev' file is present, it is checked for consistency, mmap'd, and -+ * pointers are assigned into it (instead of using the in-memory variant). ++ * If a '.rev' file is present it is mmap'd, and pointers are assigned into it ++ * (instead of using the in-memory variant). */ int load_pack_revindex(struct packed_git *p); @@ packfile.h: uint32_t get_pack_fanout(struct packed_git *p, uint32_t value); ## tmp-objdir.c ## @@ tmp-objdir.c: static int pack_copy_priority(const char *name) + return 1; + if (ends_with(name, ".pack")) return 2; - if (ends_with(name, ".idx")) +- if (ends_with(name, ".idx")) ++ if (ends_with(name, ".rev")) return 3; - return 4; -+ if (ends_with(name, ".rev")) ++ if (ends_with(name, ".idx")) + return 4; + return 5; } 22: 8648c87fa7 ! 2: 5efc870742 pack-write.c: prepare to write 'pack-*.rev' files @@ Commit message from within 'write_rev_file()', which is called from 'finish_tmp_packfile()'. + Similar to the process by which the reverse index is computed in memory, + these new paths also have to sort a list of objects by their offsets + within a packfile. These new paths use a qsort() (as opposed to a radix + sort), since our specialized radix sort requires a full revindex_entry + struct per object, which is more memory than we need to allocate. + + The qsort is obviously slower, but the theoretical slowdown would + require a repository with a large amount of objects, likely implying + that the time spent in, say, pack-objects during a repack would dominate + the overall runtime. + Signed-off-by: Taylor Blau <me@xxxxxxxxxxxx> ## pack-write.c ## @@ pack-write.c: const char *write_idx_file(const char *index_name, struct pack_idx + return 0; +} + -+#define RIDX_SIGNATURE 0x52494458 /* "RIDX" */ -+#define RIDX_VERSION 1 -+ +static void write_rev_header(struct hashfile *f) +{ + uint32_t oid_version; @@ pack-write.c: void finish_tmp_packfile(struct strbuf *name_buffer, + free((void *)idx_tmp_name); } + ## pack.h ## @@ pack.h: struct pack_idx_option { @@ pack.h: struct pack_idx_option { uint32_t version; uint32_t off32_limit; -@@ pack.h: off_t write_pack_header(struct hashfile *f, uint32_t); - void fixup_pack_header_footer(int, unsigned char *, const char *, uint32_t, unsigned char *, off_t); - char *index_pack_lockfile(int fd); +@@ pack.h: struct ref; + + void write_promisor_file(const char *promisor_name, struct ref **sought, int nr_sought); +const char *write_rev_file(const char *rev_name, struct pack_idx_entry **objects, uint32_t nr_objects, const unsigned char *hash, unsigned flags); + -: ---------- > 3: 8a3e70454b builtin/index-pack.c: allow stripping arbitrary extensions 23: 5b18ada611 ! 4: a8ee59fccf builtin/index-pack.c: write reverse indexes @@ Documentation/git-index-pack.txt: git-index-pack - Build pack index file for an OPTIONS @@ Documentation/git-index-pack.txt: OPTIONS - file is constructed from the name of packed archive - file by replacing .pack with .idx (and the program fails if the name of packed archive does not end -- with .pack). -+ with .pack). Incompatible with `--rev-index`. -+ + with .pack). + +--[no-]rev-index:: + When this flag is provided, generate a reverse index + (a `.rev` file) corresponding to the given pack. If + `--verify` is given, ensure that the existing + reverse index is correct. Takes precedence over + `pack.writeReverseIndex`. - ++ --stdin:: When this flag is provided, the pack is read from stdin + instead and a copy is then written to <pack-file>. If ## builtin/index-pack.c ## @@ @@ builtin/index-pack.c struct object_entry { struct pack_idx_entry idx; -@@ builtin/index-pack.c: static void fix_unresolved_deltas(struct hashfile *f) - free(sorted_by_pos); - } - --static const char *derive_filename(const char *pack_name, const char *suffix, -- struct strbuf *buf) -+static const char *derive_filename(const char *pack_name, const char *strip, -+ const char *suffix, struct strbuf *buf) - { - size_t len; -- if (!strip_suffix(pack_name, ".pack", &len)) -- die(_("packfile name '%s' does not end with '.pack'"), -- pack_name); -+ if (!strip_suffix(pack_name, strip, &len)) -+ die(_("packfile name '%s' does not end with '%s'"), -+ pack_name, strip); - strbuf_add(buf, pack_name, len); - strbuf_addch(buf, '.'); - strbuf_addstr(buf, suffix); -@@ builtin/index-pack.c: static void write_special_file(const char *suffix, const char *msg, - int msg_len = strlen(msg); - - if (pack_name) -- filename = derive_filename(pack_name, suffix, &name_buf); -+ filename = derive_filename(pack_name, ".pack", suffix, &name_buf); - else - filename = odb_pack_name(&name_buf, hash, suffix); - @@ builtin/index-pack.c: static void write_special_file(const char *suffix, const char *msg, static void final(const char *final_pack_name, const char *curr_pack_name, @@ builtin/index-pack.c: int cmd_index_pack(int argc, const char **argv, const char usage(index_pack_usage); continue; @@ builtin/index-pack.c: int cmd_index_pack(int argc, const char **argv, const char *prefix) - if (from_stdin && hash_algo) - die(_("--object-format cannot be used with --stdin")); if (!index_name && pack_name) -- index_name = derive_filename(pack_name, "idx", &index_name_buf); -+ index_name = derive_filename(pack_name, ".pack", "idx", &index_name_buf); -+ + index_name = derive_filename(pack_name, "pack", "idx", &index_name_buf); + + opts.flags &= ~(WRITE_REV | WRITE_REV_VERIFY); + if (rev_index) { + opts.flags |= verify ? WRITE_REV_VERIFY : WRITE_REV; + if (index_name) + rev_index_name = derive_filename(index_name, -+ ".idx", "rev", ++ "idx", "rev", + &rev_index_name_buf); + } - ++ if (verify) { if (!index_name) + die(_("--verify with no packfile name given")); @@ builtin/index-pack.c: int cmd_index_pack(int argc, const char **argv, const char *prefix) for (i = 0; i < nr_objects; i++) idx_objects[i] = &objects[i].idx; 24: 68bde3ea97 = 5: 5bebe05a16 builtin/pack-objects.c: respect 'pack.writeReverseIndex' 25: 38a253d0ce = 6: 7e29f2d3a0 Documentation/config/pack.txt: advertise 'pack.writeReverseIndex' 26: 12cdf2d67a = 7: 7cf16485cc t: prepare for GIT_TEST_WRITE_REV_INDEX 27: 6b647d9775 ! 8: 02550a251d t: support GIT_TEST_WRITE_REV_INDEX @@ builtin/pack-objects.c: int cmd_pack_objects(int argc, const char **argv, const ## ci/run-build-and-tests.sh ## @@ ci/run-build-and-tests.sh: linux-gcc) - export GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS=1 export GIT_TEST_MULTI_PACK_INDEX=1 export GIT_TEST_ADD_I_USE_BUILTIN=1 + export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=master + export GIT_TEST_WRITE_REV_INDEX=1 make test ;; @@ pack-revindex.h * can be found */ ++ + #define RIDX_SIGNATURE 0x52494458 /* "RIDX" */ + #define RIDX_VERSION 1 + +#define GIT_TEST_WRITE_REV_INDEX "GIT_TEST_WRITE_REV_INDEX" + struct packed_git; 28: 48926ae182 ! 9: a66d2f9f7c pack-revindex: ensure that on-disk reverse indexes are given precedence @@ pack-revindex.c off_t offset; @@ pack-revindex.c: static void create_pack_revindex(struct packed_git *p) - static int load_pack_revindex_from_memory(struct packed_git *p) + static int create_pack_revindex_in_memory(struct packed_git *p) { + if (git_env_bool(GIT_TEST_REV_INDEX_DIE_IN_MEMORY, 0)) + die("dying as requested by '%s'", @@ pack-revindex.c: static void create_pack_revindex(struct packed_git *p) ## pack-revindex.h ## @@ - */ + #define RIDX_VERSION 1 #define GIT_TEST_WRITE_REV_INDEX "GIT_TEST_WRITE_REV_INDEX" +#define GIT_TEST_REV_INDEX_DIE_IN_MEMORY "GIT_TEST_REV_INDEX_DIE_IN_MEMORY" -: ---------- > 10: 38c8afabf2 t5325: check both on-disk and in-memory reverse index -- 2.30.0.138.g6d7191ea01