On Wed, Sep 12, 2018 at 6:18 PM Ben Peart <benpeart@xxxxxxxxxxxxx> wrote: > > The End of Index Entry (EOIE) is used to locate the end of the variable > length index entries and the beginning of the extensions. Code can take > advantage of this to quickly locate the index extensions without having > to parse through all of the index entries. > > Because it must be able to be loaded before the variable length cache > entries and other index extensions, this extension must be written last. > The signature for this extension is { 'E', 'O', 'I', 'E' }. > > The extension consists of: > > - 32-bit offset to the end of the index entries > > - 160-bit SHA-1 over the extension types and their sizes (but not > their contents). E.g. if we have "TREE" extension that is N-bytes > long, "REUC" extension that is M-bytes long, followed by "EOIE", > then the hash would be: > > SHA-1("TREE" + <binary representation of N> + > "REUC" + <binary representation of M>) > > Signed-off-by: Ben Peart <Ben.Peart@xxxxxxxxxxxxx> > --- > Documentation/technical/index-format.txt | 23 ++++ > read-cache.c | 154 +++++++++++++++++++++-- > t/README | 5 + > t/t1700-split-index.sh | 1 + > 4 files changed, 175 insertions(+), 8 deletions(-) > > diff --git a/Documentation/technical/index-format.txt b/Documentation/technical/index-format.txt > index db3572626b..6bc2d90f7f 100644 > --- a/Documentation/technical/index-format.txt > +++ b/Documentation/technical/index-format.txt > @@ -314,3 +314,26 @@ The remaining data of each directory block is grouped by type: > > - An ewah bitmap, the n-th bit indicates whether the n-th index entry > is not CE_FSMONITOR_VALID. > + > +== End of Index Entry > + > + The End of Index Entry (EOIE) is used to locate the end of the variable > + length index entries and the begining of the extensions. Code can take > + advantage of this to quickly locate the index extensions without having > + to parse through all of the index entries. > + > + Because it must be able to be loaded before the variable length cache > + entries and other index extensions, this extension must be written last. > + The signature for this extension is { 'E', 'O', 'I', 'E' }. > + > + The extension consists of: > + > + - 32-bit offset to the end of the index entries > + > + - 160-bit SHA-1 over the extension types and their sizes (but not > + their contents). E.g. if we have "TREE" extension that is N-bytes > + long, "REUC" extension that is M-bytes long, followed by "EOIE", > + then the hash would be: > + > + SHA-1("TREE" + <binary representation of N> + > + "REUC" + <binary representation of M>) > diff --git a/read-cache.c b/read-cache.c > index 7b1354d759..858935f123 100644 > --- a/read-cache.c > +++ b/read-cache.c > @@ -43,6 +43,7 @@ > #define CACHE_EXT_LINK 0x6c696e6b /* "link" */ > #define CACHE_EXT_UNTRACKED 0x554E5452 /* "UNTR" */ > #define CACHE_EXT_FSMONITOR 0x46534D4E /* "FSMN" */ > +#define CACHE_EXT_ENDOFINDEXENTRIES 0x454F4945 /* "EOIE" */ > > /* changes that can be kept in $GIT_DIR/index (basically all extensions) */ > #define EXTMASK (RESOLVE_UNDO_CHANGED | CACHE_TREE_CHANGED | \ > @@ -1693,6 +1694,9 @@ static int read_index_extension(struct index_state *istate, > case CACHE_EXT_FSMONITOR: > read_fsmonitor_extension(istate, data, sz); > break; > + case CACHE_EXT_ENDOFINDEXENTRIES: > + /* already handled in do_read_index() */ > + break; Perhaps catch this extension when it's not written at the end (e.g. by some other git implementation) and warn. > default: > if (*ext < 'A' || 'Z' < *ext) > return error("index uses %.4s extension, which we do not understand", > @@ -1889,6 +1893,11 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries) > return ondisk_size + entries * per_entry; > } > > +#ifndef NO_PTHREADS > +static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size); > +#endif Keep functions unconditionally built as much as possible. I don't see why this read_eoie_extension() must be built only on multithread platforms. > +static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset); > + > /* remember to discard_cache() before reading a different cache! */ > int do_read_index(struct index_state *istate, const char *path, int must_exist) > { > @@ -2198,11 +2207,15 @@ static int ce_write(git_hash_ctx *context, int fd, void *data, unsigned int len) > return 0; > } > > -static int write_index_ext_header(git_hash_ctx *context, int fd, > - unsigned int ext, unsigned int sz) > +static int write_index_ext_header(git_hash_ctx *context, git_hash_ctx *eoie_context, > + int fd, unsigned int ext, unsigned int sz) > { > ext = htonl(ext); > sz = htonl(sz); > + if (eoie_context) { > + the_hash_algo->update_fn(eoie_context, &ext, 4); > + the_hash_algo->update_fn(eoie_context, &sz, 4); > + } > return ((ce_write(context, fd, &ext, 4) < 0) || > (ce_write(context, fd, &sz, 4) < 0)) ? -1 : 0; > } > @@ -2445,7 +2458,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile, > { > uint64_t start = getnanotime(); > int newfd = tempfile->fd; > - git_hash_ctx c; > + git_hash_ctx c, eoie_c; > struct cache_header hdr; > int i, err = 0, removed, extended, hdr_version; > struct cache_entry **cache = istate->cache; > @@ -2454,6 +2467,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile, > struct ondisk_cache_entry_extended ondisk; > struct strbuf previous_name_buf = STRBUF_INIT, *previous_name; > int drop_cache_tree = istate->drop_cache_tree; > + unsigned long offset; > > for (i = removed = extended = 0; i < entries; i++) { > if (cache[i]->ce_flags & CE_REMOVE) > @@ -2520,11 +2534,13 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile, > return err; > > /* Write extension data here */ > + offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len; > + the_hash_algo->init_fn(&eoie_c); Don't write (or even calculate to write it) unless it's needed. Which means only do this when parallel reading is enabled and the index size large enough, or when a test variable is set so you can force writing this extension. I briefly wondered if we should continue writing the extension if it's already written. This way you can manually enable it with "git update-index". But I don't think it's worth the complexity. > if (!strip_extensions && istate->split_index) { > struct strbuf sb = STRBUF_INIT; > > err = write_link_extension(&sb, istate) < 0 || > - write_index_ext_header(&c, newfd, CACHE_EXT_LINK, > + write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_LINK, > sb.len) < 0 || > ce_write(&c, newfd, sb.buf, sb.len) < 0; > strbuf_release(&sb); > @@ -2535,7 +2551,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile, > struct strbuf sb = STRBUF_INIT; > > cache_tree_write(&sb, istate->cache_tree); > - err = write_index_ext_header(&c, newfd, CACHE_EXT_TREE, sb.len) < 0 > + err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_TREE, sb.len) < 0 > || ce_write(&c, newfd, sb.buf, sb.len) < 0; > strbuf_release(&sb); > if (err) > @@ -2545,7 +2561,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile, > struct strbuf sb = STRBUF_INIT; > > resolve_undo_write(&sb, istate->resolve_undo); > - err = write_index_ext_header(&c, newfd, CACHE_EXT_RESOLVE_UNDO, > + err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_RESOLVE_UNDO, > sb.len) < 0 > || ce_write(&c, newfd, sb.buf, sb.len) < 0; > strbuf_release(&sb); > @@ -2556,7 +2572,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile, > struct strbuf sb = STRBUF_INIT; > > write_untracked_extension(&sb, istate->untracked); > - err = write_index_ext_header(&c, newfd, CACHE_EXT_UNTRACKED, > + err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_UNTRACKED, > sb.len) < 0 || > ce_write(&c, newfd, sb.buf, sb.len) < 0; > strbuf_release(&sb); > @@ -2567,7 +2583,23 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile, > struct strbuf sb = STRBUF_INIT; > > write_fsmonitor_extension(&sb, istate); > - err = write_index_ext_header(&c, newfd, CACHE_EXT_FSMONITOR, sb.len) < 0 > + err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_FSMONITOR, sb.len) < 0 > + || ce_write(&c, newfd, sb.buf, sb.len) < 0; > + strbuf_release(&sb); > + if (err) > + return -1; > + } > + > + /* > + * CACHE_EXT_ENDOFINDEXENTRIES must be written as the last entry before the SHA1 > + * so that it can be found and processed before all the index entries are > + * read. > + */ > + if (!strip_extensions && offset && !git_env_bool("GIT_TEST_DISABLE_EOIE", 0)) { > + struct strbuf sb = STRBUF_INIT; > + > + write_eoie_extension(&sb, &eoie_c, offset); > + err = write_index_ext_header(&c, NULL, newfd, CACHE_EXT_ENDOFINDEXENTRIES, sb.len) < 0 > || ce_write(&c, newfd, sb.buf, sb.len) < 0; > strbuf_release(&sb); > if (err) > @@ -2978,3 +3010,109 @@ int should_validate_cache_entries(void) > > return validate_index_cache_entries; > } > + > +#define EOIE_SIZE (4 + GIT_SHA1_RAWSZ) /* <4-byte offset> + <20-byte hash> */ > +#define EOIE_SIZE_WITH_HEADER (4 + 4 + EOIE_SIZE) /* <4-byte signature> + <4-byte length> + EOIE_SIZE */ > + > +#ifndef NO_PTHREADS > +static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size) > +{ > + /* > + * The end of index entries (EOIE) extension is guaranteed to be last > + * so that it can be found by scanning backwards from the EOF. > + * > + * "EOIE" > + * <4-byte length> > + * <4-byte offset> > + * <20-byte hash> > + */ > + const char *mmap = mmap_; > + const char *index, *eoie; > + uint32_t extsize; > + unsigned long offset, src_offset; > + unsigned char hash[GIT_MAX_RAWSZ]; > + git_hash_ctx c; > + > + /* ensure we have an index big enough to contain an EOIE extension */ > + if (mmap_size < sizeof(struct cache_header) + EOIE_SIZE_WITH_HEADER + the_hash_algo->rawsz) > + return 0; All these "return 0" indicates an error in EOIE extension. You probably want to print some warning (much easier to track down why parallel reading does not happen). > + > + /* validate the extension signature */ > + index = eoie = mmap + mmap_size - EOIE_SIZE_WITH_HEADER - the_hash_algo->rawsz; > + if (CACHE_EXT(index) != CACHE_EXT_ENDOFINDEXENTRIES) > + return 0; > + index += sizeof(uint32_t); > + > + /* validate the extension size */ > + extsize = get_be32(index); > + if (extsize != EOIE_SIZE) > + return 0; > + index += sizeof(uint32_t); > + > + /* > + * Validate the offset we're going to look for the first extension > + * signature is after the index header and before the eoie extension. > + */ > + offset = get_be32(index); > + if (mmap + offset < mmap + sizeof(struct cache_header)) > + return 0; > + if (mmap + offset >= eoie) > + return 0; > + index += sizeof(uint32_t); > + > + /* > + * The hash is computed over extension types and their sizes (but not > + * their contents). E.g. if we have "TREE" extension that is N-bytes > + * long, "REUC" extension that is M-bytes long, followed by "EOIE", > + * then the hash would be: > + * > + * SHA-1("TREE" + <binary representation of N> + > + * "REUC" + <binary representation of M>) > + */ > + src_offset = offset; > + the_hash_algo->init_fn(&c); > + while (src_offset < mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER) { > + /* After an array of active_nr index entries, > + * there can be arbitrary number of extended > + * sections, each of which is prefixed with > + * extension name (4-byte) and section length > + * in 4-byte network byte order. > + */ > + uint32_t extsize; > + memcpy(&extsize, (char *)mmap + src_offset + 4, 4); > + extsize = ntohl(extsize); > + > + /* verify the extension size isn't so large it will wrap around */ > + if (src_offset + 8 + extsize < src_offset) > + return 0; > + > + the_hash_algo->update_fn(&c, mmap + src_offset, 8); > + > + src_offset += 8; > + src_offset += extsize; > + } > + the_hash_algo->final_fn(hash, &c); > + if (hashcmp(hash, (const unsigned char *)index)) > + return 0; > + > + /* Validate that the extension offsets returned us back to the eoie extension. */ > + if (src_offset != mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER) > + return 0; > + > + return offset; > +} > +#endif > + > +static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset) We normally just put function implementations before it's used to avoid static forward declaration. Any special reason why it's not done here? > +{ > + uint32_t buffer; > + unsigned char hash[GIT_MAX_RAWSZ]; > + > + /* offset */ > + put_be32(&buffer, offset); > + strbuf_add(sb, &buffer, sizeof(uint32_t)); > + > + /* hash */ > + the_hash_algo->final_fn(hash, eoie_context); > + strbuf_add(sb, hash, the_hash_algo->rawsz); > +} > diff --git a/t/README b/t/README > index 9028b47d92..d8754dd23a 100644 > --- a/t/README > +++ b/t/README > @@ -319,6 +319,11 @@ GIT_TEST_OE_DELTA_SIZE=<n> exercises the uncomon pack-objects code > path where deltas larger than this limit require extra memory > allocation for bookkeeping. > > +GIT_TEST_DISABLE_EOIE=<boolean> disables writing the EOIE extension. > +This is used to allow tests 1, 4-9 in t1700-split-index.sh to succeed I have a feeling that you won't have problems if you don't write eoie extension by default in the first place. Then this could be switched to GIT_TEST_ENABLE_EOIE instead. We may still have problem if both eoie and split index are forced on when running through the test suite, but that should be an easy fix. > +as they currently hard code SHA values for the index which are no longer > +valid due to the addition of the EOIE extension. > + > Naming Tests > ------------ > > diff --git a/t/t1700-split-index.sh b/t/t1700-split-index.sh > index 39133bcbc8..f613dd72e3 100755 > --- a/t/t1700-split-index.sh > +++ b/t/t1700-split-index.sh > @@ -7,6 +7,7 @@ test_description='split index mode tests' > # We need total control of index splitting here > sane_unset GIT_TEST_SPLIT_INDEX > sane_unset GIT_FSMONITOR_TEST > +export GIT_TEST_DISABLE_EOIE=true > > test_expect_success 'enable split index' ' > git config splitIndex.maxPercentChange 100 && > -- > 2.18.0.windows.1 > -- Duy