On Fri, Sep 30, 2016 at 05:19:37PM -0700, Junio C Hamano wrote:

> Introduce a mechanism, where we estimate the number of objects in
> the repository upon the first request to abbreviate an object name
> with the default setting and come up with a sane default for the
> repository. Based on the expectation that we would see collision in
> a repository with 2^(2N) objects when using object names shortened
> to first N bits, use sufficient number of hexdigits to cover the
> number of objects in the repository. Each hexdigit (4-bits) we add
> to the shortened name allows us to have four times (2-bits) as many
> objects in the repository.
>
> ---
>  cache.h       |  1 +
>  environment.c |  2 +-
>  sha1_name.c   | 28 +++++++++++++++++++++++++++-
>  3 files changed, 29 insertions(+), 2 deletions(-)

For reference, here's a working version that just uses a separate
counting function (no commit message, because I would just steal the one
from Linus ;) ).

---
 cache.h       |  6 ++++++
 environment.c |  2 +-
 sha1_file.c   | 27 +++++++++++++++++++++++++++
 sha1_name.c   | 20 ++++++++++++++++++++
 4 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/cache.h b/cache.h
index 5a651b8..f22ace5 100644
--- a/cache.h
+++ b/cache.h
@@ -1455,6 +1455,12 @@ extern void prepare_packed_git(void);
 extern void reprepare_packed_git(void);
 extern void install_packed_git(struct packed_git *pack);
 
+/*
+ * Give a rough count of objects in the repository. This sacrifices accuracy
+ * for speed.
+ */
+unsigned long approximate_object_count(void);
+
 extern struct packed_git *find_sha1_pack(const unsigned char *sha1,
 					 struct packed_git *packs);
 
diff --git a/environment.c b/environment.c
index 44fb107..6f9d290 100644
--- a/environment.c
+++ b/environment.c
@@ -16,7 +16,7 @@ int trust_executable_bit = 1;
 int trust_ctime = 1;
 int check_stat = 1;
 int has_symlinks = 1;
-int minimum_abbrev = 4, default_abbrev = FALLBACK_DEFAULT_ABBREV;
+int minimum_abbrev = 4, default_abbrev = -1;
 int ignore_case;
 int assume_unchanged;
 int prefer_symlink_refs;
diff --git a/sha1_file.c b/sha1_file.c
index b9c1fa3..4882440 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -1381,6 +1381,32 @@ static void prepare_packed_git_one(char *objdir, int local)
 	strbuf_release(&path);
 }
 
+static int approximate_object_count_valid;
+
+/*
+ * Give a fast, rough count of the number of objects in the repository. This
+ * ignores loose objects completely. If you have a lot of them, then either
+ * you should repack because your performance will be awful, or they are
+ * all unreachable objects about to be pruned, in which case they're not really
+ * interesting as a measure of repo size in the first place.
+ */
+unsigned long approximate_object_count(void)
+{
+	static unsigned long count;
+	if (!approximate_object_count_valid) {
+		struct packed_git *p;
+
+		prepare_packed_git();
+		count = 0;
+		for (p = packed_git; p; p = p->next) {
+			if (open_pack_index(p))
+				continue;
+			count += p->num_objects;
+		}
+	}
+	return count;
+}
+
 static void *get_next_packed_git(const void *p)
 {
 	return ((const struct packed_git *)p)->next;
@@ -1455,6 +1481,7 @@ void prepare_packed_git(void)
 
 void reprepare_packed_git(void)
 {
+	approximate_object_count_valid = 0;
 	prepare_packed_git_run_once = 0;
 	prepare_packed_git();
 }
diff --git a/sha1_name.c b/sha1_name.c
index 3b647fd..ecc4b54 100644
--- a/sha1_name.c
+++ b/sha1_name.c
@@ -455,10 +455,30 @@ int for_each_abbrev(const char *prefix, each_abbrev_fn fn, void *cb_data)
 	return ret;
 }
 
+/*
+ * Return the slot of the most-significant bit set in "val". There are various
+ * ways to do this quickly with fls() or __builtin_clzl(), but speed is
+ * probably not a big deal here.
+ */
+unsigned msb(unsigned long val)
+{
+	unsigned r = 0;
+	while (val >>= 1)
+		r++;
+	return r;
+}
+
 int find_unique_abbrev_r(char *hex, const unsigned char *sha1, int len)
 {
 	int status, exists;
 
+	if (len < 0) {
+		unsigned long count = approximate_object_count();
+		len = (msb(count) + 1) / 2;
+		if (len < 0)
+			len = FALLBACK_DEFAULT_ABBREV;
+	}
+
 	sha1_to_hex_r(hex, sha1);
 	if (len == 40 || !len)
 		return 40;
-- 
2.10.0.618.g82cc264