On Thu, 21 Feb 2008, Linus Torvalds wrote: > > I say "early" just because I haven't really gone through it more than > once, and testing that it passes all the tests. Ok, here's an improved version. This is a bit larger, but having now thought the problems through, I think this is *much* better. The thing is, we used to not be able to handle the case of removing a index entry and then re-inserting it, and we even had a rather ugly special case for that reason, where replace_index_entry() basically turned itself into a no-op if the new and the old entries were the same. But the merging code really wants to remove a set of entries and then replace them with a new one, and that whole logic needed the hash lists to be handled *properly*. So what this patch does is to not just have the UNHASHED bit, but a HASHED bit too, and when you insert an entry into the name hash, that involves: - clear the UNHASHED bit, because now it's valid again for lookup (which is really all that UNHASHED meant) - if we're being lazy, we're done here (but we still want to clear the UNHASHED bit regardless of lazy mode, since we can become unlazy later, and so we need the UNHASHED bit to always be set correctly, even if we never actually insert the entry into the hash list) - if it was already hashed, we just leave it on the list - otherwise mark it HASHED and insert it into the list this all means that unhashing and rehashing a name all just works automatically. Obviously, you cannot change the name of an entry (that would be a serious bug), but nothing can validly do that anyway (you'd have to allocate a new struct cache_entry anyway since the name length could change), so that's not a new limitation. The code actually gets simpler in many ways, although the lazy hashing does mean that there are a few odd cases (ie something can be marked unhashed even though it was never on the hash in the first place, and isn't actually marked hashed!). I haven't actually added any code to verify that the name hash state matches the index state, but I feel pretty good about this patch anyway: it passes my "it makes sense" filters in ways that the previous one didn't. Linus --- builtin-read-tree.c | 3 ++- cache.h | 35 ++++++++++++++++++++++++++++++++++- read-cache.c | 30 ++++++++++-------------------- unpack-trees.c | 2 +- 4 files changed, 47 insertions(+), 23 deletions(-) diff --git a/builtin-read-tree.c b/builtin-read-tree.c index 5785401..7bdc312 100644 --- a/builtin-read-tree.c +++ b/builtin-read-tree.c @@ -41,11 +41,12 @@ static int read_cache_unmerged(void) for (i = 0; i < active_nr; i++) { struct cache_entry *ce = active_cache[i]; if (ce_stage(ce)) { + remove_index_entry(ce); if (last && !strcmp(ce->name, last->name)) continue; cache_tree_invalidate_path(active_cache_tree, ce->name); last = ce; - ce->ce_flags |= CE_REMOVE; + continue; } *dst++ = ce; } diff --git a/cache.h b/cache.h index e1000bc..0ae33f4 100644 --- a/cache.h +++ b/cache.h @@ -133,7 +133,40 @@ struct cache_entry { #define CE_UPDATE (0x10000) #define CE_REMOVE (0x20000) #define CE_UPTODATE (0x40000) -#define CE_UNHASHED (0x80000) + +#define CE_HASHED (0x100000) +#define CE_UNHASHED (0x200000) + +/* + * Copy the sha1 and stat state of a cache entry from one to + * another. But we never change the name, or the hash state! + */ +#define CE_STATE_MASK (CE_HASHED | CE_UNHASHED) +static inline void copy_cache_entry(struct cache_entry *dst, struct cache_entry *src) +{ + struct cache_entry *next = dst->next; + unsigned int state = dst->ce_flags & CE_STATE_MASK; + + memcpy(dst, src, offsetof(struct cache_entry, name)); + + /* Restore the hash state */ + dst->next = next; + dst->ce_flags = (dst->ce_flags & ~CE_STATE_MASK) | state; +} + +/* + * We don't actually *remove* it, we can just mark it invalid so that + * we won't find it in lookups. + * + * Not only would we have to search the lists (simple enough), but + * we'd also have to rehash other hash buckets in case this makes the + * hash bucket empty (common). So it's much better to just mark + * it. + */ +static inline void remove_index_entry(struct cache_entry *ce) +{ + ce->ce_flags |= CE_UNHASHED; +} static inline unsigned create_ce_flags(size_t len, unsigned stage) { diff --git a/read-cache.c b/read-cache.c index e45f4b3..fee0c80 100644 --- a/read-cache.c +++ b/read-cache.c @@ -37,8 +37,13 @@ static unsigned int hash_name(const char *name, int namelen) static void hash_index_entry(struct index_state *istate, struct cache_entry *ce) { void **pos; - unsigned int hash = hash_name(ce->name, ce_namelen(ce)); + unsigned int hash; + if (ce->ce_flags & CE_HASHED) + return; + ce->ce_flags |= CE_HASHED; + ce->next = NULL; + hash = hash_name(ce->name, ce_namelen(ce)); pos = insert_hash(hash, ce, &istate->name_hash); if (pos) { ce->next = *pos; @@ -59,33 +64,18 @@ static void lazy_init_name_hash(struct index_state *istate) static void set_index_entry(struct index_state *istate, int nr, struct cache_entry *ce) { + ce->ce_flags &= ~CE_UNHASHED; istate->cache[nr] = ce; if (istate->name_hash_initialized) hash_index_entry(istate, ce); } -/* - * We don't actually *remove* it, we can just mark it invalid so that - * we won't find it in lookups. - * - * Not only would we have to search the lists (simple enough), but - * we'd also have to rehash other hash buckets in case this makes the - * hash bucket empty (common). So it's much better to just mark - * it. - */ -static void remove_hash_entry(struct index_state *istate, struct cache_entry *ce) -{ - ce->ce_flags |= CE_UNHASHED; -} - static void replace_index_entry(struct index_state *istate, int nr, struct cache_entry *ce) { struct cache_entry *old = istate->cache[nr]; - if (ce != old) { - remove_hash_entry(istate, old); - set_index_entry(istate, nr, ce); - } + remove_index_entry(old); + set_index_entry(istate, nr, ce); istate->cache_changed = 1; } @@ -413,7 +403,7 @@ int remove_index_entry_at(struct index_state *istate, int pos) { struct cache_entry *ce = istate->cache[pos]; - remove_hash_entry(istate, ce); + remove_index_entry(ce); istate->cache_changed = 1; istate->cache_nr--; if (pos >= istate->cache_nr) diff --git a/unpack-trees.c b/unpack-trees.c index ec558f9..07c4c28 100644 --- a/unpack-trees.c +++ b/unpack-trees.c @@ -590,7 +590,7 @@ static int merged_entry(struct cache_entry *merge, struct cache_entry *old, * a match. */ if (same(old, merge)) { - memcpy(merge, old, offsetof(struct cache_entry, name)); + copy_cache_entry(merge, old); } else { verify_uptodate(old, o); invalidate_ce_path(old); - To unsubscribe from this list: send the line "unsubscribe git" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html