Re: [PATCH 3/3] abbrev: auto size the default abbreviation

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Fri, Sep 30, 2016 at 05:19:37PM -0700, Junio C Hamano wrote:

> Introduce a mechanism, where we estimate the number of objects in
> the repository upon the first request to abbreviate an object name
> with the default setting and come up with a sane default for the
> repository.  Based on the expectation that we would see collision in
> a repository with 2^(2N) objects when using object names shortened
> to first N bits, use sufficient number of hexdigits to cover the
> number of objects in the repository.  Each hexdigit (4-bits) we add
> to the shortened name allows us to have four times (2-bits) as many
> objects in the repository.
> 
> ---
>  cache.h       |  1 +
>  environment.c |  2 +-
>  sha1_name.c   | 28 +++++++++++++++++++++++++++-
>  3 files changed, 29 insertions(+), 2 deletions(-)

For reference, here's a working version that just uses a separate
counting function (no commit message, because I would just steal the one
from Linus ;) ).

---
 cache.h       |  6 ++++++
 environment.c |  2 +-
 sha1_file.c   | 27 +++++++++++++++++++++++++++
 sha1_name.c   | 20 ++++++++++++++++++++
 4 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/cache.h b/cache.h
index 5a651b8..f22ace5 100644
--- a/cache.h
+++ b/cache.h
@@ -1455,6 +1455,12 @@ extern void prepare_packed_git(void);
 extern void reprepare_packed_git(void);
 extern void install_packed_git(struct packed_git *pack);
 
+/*
+ * Give a rough count of objects in the repository. This sacrifices accuracy
+ * for speed.
+ */
+unsigned long approximate_object_count(void);
+
 extern struct packed_git *find_sha1_pack(const unsigned char *sha1,
 					 struct packed_git *packs);
 
diff --git a/environment.c b/environment.c
index 44fb107..6f9d290 100644
--- a/environment.c
+++ b/environment.c
@@ -16,7 +16,7 @@ int trust_executable_bit = 1;
 int trust_ctime = 1;
 int check_stat = 1;
 int has_symlinks = 1;
-int minimum_abbrev = 4, default_abbrev = FALLBACK_DEFAULT_ABBREV;
+int minimum_abbrev = 4, default_abbrev = -1;
 int ignore_case;
 int assume_unchanged;
 int prefer_symlink_refs;
diff --git a/sha1_file.c b/sha1_file.c
index b9c1fa3..4882440 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -1381,6 +1381,32 @@ static void prepare_packed_git_one(char *objdir, int local)
 	strbuf_release(&path);
 }
 
+static int approximate_object_count_valid;
+
+/*
+ * Give a fast, rough count of the number of objects in the repository. This
+ * ignores loose objects completely. If you have a lot of them, then either
+ * you should repack because your performance will be awful, or they are
+ * all unreachable objects about to be pruned, in which case they're not really
+ * interesting as a measure of repo size in the first place.
+ */
+unsigned long approximate_object_count(void)
+{
+	static unsigned long count;
+	if (!approximate_object_count_valid) {
+		struct packed_git *p;
+
+		prepare_packed_git();
+		count = 0;
+		for (p = packed_git; p; p = p->next) {
+			if (open_pack_index(p))
+				continue;
+			count += p->num_objects;
+		}
+	}
+	return count;
+}
+
 static void *get_next_packed_git(const void *p)
 {
 	return ((const struct packed_git *)p)->next;
@@ -1455,6 +1481,7 @@ void prepare_packed_git(void)
 
 void reprepare_packed_git(void)
 {
+	approximate_object_count_valid = 0;
 	prepare_packed_git_run_once = 0;
 	prepare_packed_git();
 }
diff --git a/sha1_name.c b/sha1_name.c
index 3b647fd..ecc4b54 100644
--- a/sha1_name.c
+++ b/sha1_name.c
@@ -455,10 +455,30 @@ int for_each_abbrev(const char *prefix, each_abbrev_fn fn, void *cb_data)
 	return ret;
 }
 
+/*
+ * Return the slot of the most-significant bit set in "val". There are various
+ * ways to do this quickly with fls() or __builtin_clzl(), but speed is
+ * probably not a big deal here.
+ */
+unsigned msb(unsigned long val)
+{
+	unsigned r = 0;
+	while (val >>= 1)
+		r++;
+	return r;
+}
+
 int find_unique_abbrev_r(char *hex, const unsigned char *sha1, int len)
 {
 	int status, exists;
 
+	if (len < 0) {
+		unsigned long count = approximate_object_count();
+		len = (msb(count) + 1) / 2;
+		if (len < 0)
+			len = FALLBACK_DEFAULT_ABBREV;
+	}
+
 	sha1_to_hex_r(hex, sha1);
 	if (len == 40 || !len)
 		return 40;
-- 
2.10.0.618.g82cc264




[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]