[PATCH v3 12/13] name-hash: allow to lookup a name with precalculated base hash

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



index_name_exists_base() differs from index_name_exists() in how the
hash is calculated. It uses the base hash as the seed and skips the
baselen part.

The intention is to reduce hashing cost during directory traversal,
where the hash of the directory is calculated once, and used as base
hash for all entries inside.

This poses a constraint in hash function. The function has not changed
since 2008. With luck it'll never change again. If resuming hashing at
any characters are not feasible with a new (better) hash function,
maybe we could stop at directory boundary.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@xxxxxxxxx>
---
 Makefile          |  1 +
 cache.h           |  2 --
 dir.c             |  1 +
 name-hash.c       | 48 ++++++++++++++++++++++--------------------------
 name-hash.h (new) | 45 +++++++++++++++++++++++++++++++++++++++++++++
 read-cache.c      |  1 +
 unpack-trees.c    |  1 +
 7 files changed, 71 insertions(+), 28 deletions(-)
 create mode 100644 name-hash.h

diff --git a/Makefile b/Makefile
index 26d3332..8d753af 100644
--- a/Makefile
+++ b/Makefile
@@ -690,6 +690,7 @@ LIB_H += mailmap.h
 LIB_H += merge-blobs.h
 LIB_H += merge-recursive.h
 LIB_H += mergesort.h
+LIB_H += name-hash.h
 LIB_H += notes-cache.h
 LIB_H += notes-merge.h
 LIB_H += notes.h
diff --git a/cache.h b/cache.h
index e493563..cf33c67 100644
--- a/cache.h
+++ b/cache.h
@@ -313,7 +313,6 @@ static inline void remove_name_hash(struct cache_entry *ce)
 #define refresh_cache(flags) refresh_index(&the_index, (flags), NULL, NULL, NULL)
 #define ce_match_stat(ce, st, options) ie_match_stat(&the_index, (ce), (st), (options))
 #define ce_modified(ce, st, options) ie_modified(&the_index, (ce), (st), (options))
-#define cache_name_exists(name, namelen, igncase) index_name_exists(&the_index, (name), (namelen), (igncase))
 #define cache_name_is_other(name, namelen) index_name_is_other(&the_index, (name), (namelen))
 #define resolve_undo_clear() resolve_undo_clear_index(&the_index)
 #define unmerge_cache_entry_at(at) unmerge_index_entry_at(&the_index, at)
@@ -442,7 +441,6 @@ extern int write_index(struct index_state *, int newfd);
 extern int discard_index(struct index_state *);
 extern int unmerged_index(const struct index_state *);
 extern int verify_path(const char *path);
-extern struct cache_entry *index_name_exists(struct index_state *istate, const char *name, int namelen, int igncase);
 extern int index_name_pos(const struct index_state *, const char *name, int namelen);
 #define ADD_CACHE_OK_TO_ADD 1		/* Ok to add */
 #define ADD_CACHE_OK_TO_REPLACE 2	/* Ok to replace file/directory */
diff --git a/dir.c b/dir.c
index 6809dd2..5fda5af 100644
--- a/dir.c
+++ b/dir.c
@@ -11,6 +11,7 @@
 #include "dir.h"
 #include "refs.h"
 #include "wildmatch.h"
+#include "name-hash.h"
 
 #ifdef MEASURE_EXCLUDE
 static uint32_t tv_treat_leading_path;
diff --git a/name-hash.c b/name-hash.c
index 1287a19..572d232 100644
--- a/name-hash.c
+++ b/name-hash.c
@@ -7,30 +7,7 @@
  */
 #define NO_THE_INDEX_COMPATIBILITY_MACROS
 #include "cache.h"
-
-/*
- * This removes bit 5 if bit 6 is set.
- *
- * That will make US-ASCII characters hash to their upper-case
- * equivalent. We could easily do this one whole word at a time,
- * but that's for future worries.
- */
-static inline unsigned char icase_hash(unsigned char c)
-{
-	return c & ~((c & 0x40) >> 1);
-}
-
-static unsigned int hash_name(const char *name, int namelen)
-{
-	unsigned int hash = 0x123;
-
-	while (namelen--) {
-		unsigned char c = *name++;
-		c = icase_hash(c);
-		hash = hash*101 + c;
-	}
-	return hash;
-}
+#include "name-hash.h"
 
 static void hash_index_entry_directories(struct index_state *istate, struct cache_entry *ce)
 {
@@ -152,9 +129,11 @@ static int same_name(const struct cache_entry *ce, const char *name, int namelen
 	return slow_same_name(name, namelen, ce->name, namelen < len ? namelen : len);
 }
 
-struct cache_entry *index_name_exists(struct index_state *istate, const char *name, int namelen, int icase)
+static inline struct cache_entry *find_entry_by_hash(struct index_state *istate,
+						     const char *name, int namelen,
+						     unsigned int hash,
+						     int icase)
 {
-	unsigned int hash = hash_name(name, namelen);
 	struct cache_entry *ce;
 
 	lazy_init_name_hash(istate);
@@ -189,3 +168,20 @@ struct cache_entry *index_name_exists(struct index_state *istate, const char *na
 	}
 	return NULL;
 }
+
+struct cache_entry *index_name_exists(struct index_state *istate,
+				      const char *name, int namelen,
+				      int icase)
+{
+	unsigned int hash = hash_name(name, namelen);
+	return find_entry_by_hash(istate, name, namelen, hash, icase);
+}
+
+struct cache_entry *index_name_exists_base(struct index_state *istate,
+					   unsigned int basehash, int baselen,
+					   const char *name, int namelen,
+					   int icase)
+{
+	unsigned int hash = hash_name_from(basehash, name + baselen, namelen - baselen);
+	return find_entry_by_hash(istate, name, namelen, hash, icase);
+}
diff --git a/name-hash.h b/name-hash.h
new file mode 100644
index 0000000..997de30
--- /dev/null
+++ b/name-hash.h
@@ -0,0 +1,45 @@
+#ifndef NAME_HASH_H
+#define NAME_HASH_H
+
+extern struct cache_entry *index_name_exists(struct index_state *istate,
+					     const char *name, int namelen,
+					     int igncase);
+extern struct cache_entry *index_name_exists_base(struct index_state *istate,
+						  unsigned int basehash, int baselen,
+						  const char *name, int namelen,
+						  int igncase);
+
+static inline struct cache_entry *cache_name_exists(const char *name, int namelen, int igncase)
+{
+	return index_name_exists(&the_index, name, namelen, igncase);
+}
+
+/*
+ * This removes bit 5 if bit 6 is set.
+ *
+ * That will make US-ASCII characters hash to their upper-case
+ * equivalent. We could easily do this one whole word at a time,
+ * but that's for future worries.
+ */
+static inline unsigned char icase_hash(unsigned char c)
+{
+	return c & ~((c & 0x40) >> 1);
+}
+
+static inline unsigned int hash_name_from(unsigned int hash,
+					  const char *name, int namelen)
+{
+	while (namelen--) {
+		unsigned char c = *name++;
+		c = icase_hash(c);
+		hash = hash*101 + c;
+	}
+	return hash;
+}
+
+static inline unsigned int hash_name(const char *name, int namelen)
+{
+	return hash_name_from(0x123, name, namelen);
+}
+
+#endif
diff --git a/read-cache.c b/read-cache.c
index 827ae55..8bd3ec8 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -14,6 +14,7 @@
 #include "resolve-undo.h"
 #include "strbuf.h"
 #include "varint.h"
+#include "name-hash.h"
 
 static struct cache_entry *refresh_cache_entry(struct cache_entry *ce, int really);
 
diff --git a/unpack-trees.c b/unpack-trees.c
index 09e53df..60fa5d4 100644
--- a/unpack-trees.c
+++ b/unpack-trees.c
@@ -8,6 +8,7 @@
 #include "progress.h"
 #include "refs.h"
 #include "attr.h"
+#include "name-hash.h"
 
 /*
  * Error messages expected by scripts out of plumbing commands such as
-- 
1.8.1.2.536.gf441e6d

--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]