On 29.07.2014 16:43, Patrick Reynolds wrote: > Remotes are stored as an array, so looking one up or adding one without > duplication is an O(n) operation. Reading an entire config file full of > remotes is O(n^2) in the number of remotes. For a repository with tens of > thousands of remotes, the running time can hit multiple minutes. > > Hash tables are way faster. So we add a hashmap from remote name to > struct remote and use it for all lookups. The time to add a new remote to > a repo that already has 50,000 remotes drops from ~2 minutes to < 1 > second. > > We retain the old array of remotes so iterators proceed in config-file > order. > > Signed-off-by: Patrick Reynolds <patrick.reynolds@xxxxxxxxxx> > --- > mark function with no arguments as taking void > > remote.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++----------------- > remote.h | 3 +++ > 2 files changed, 49 insertions(+), 17 deletions(-) > > diff --git a/remote.c b/remote.c > index a0701f6..52533e4 100644 > --- a/remote.c > +++ b/remote.c > @@ -42,6 +42,7 @@ struct rewrites { > static struct remote **remotes; > static int remotes_alloc; > static int remotes_nr; > +static struct hashmap remotes_hash; > > static struct branch **branches; > static int branches_alloc; > @@ -136,26 +137,51 @@ static void add_url_alias(struct remote *remote, const char *url) > add_pushurl_alias(remote, url); > } > > +struct remotes_hash_key { > + const char *str; > + int len; > +}; > + > +static int remotes_hash_cmp(const struct remote *a, const struct remote *b, const struct remotes_hash_key *key) > +{ > + if (key) > + return strncmp(a->name, key->str, key->len) || a->name[key->len]; > + else > + return strcmp(a->name, b->name); > +} > + > +static inline void init_remotes_hash(void) > +{ > + if (!remotes_hash.cmpfn) > + hashmap_init(&remotes_hash, (hashmap_cmp_fn)remotes_hash_cmp, 0); > +} > + > static struct remote *make_remote(const char *name, int len) > { > - struct remote *ret; > - int i; > + struct remote *ret, *replaced; > + struct remotes_hash_key lookup; > + struct hashmap_entry lookup_entry; > > - for (i = 0; i < remotes_nr; i++) { > - if (len ? (!strncmp(name, remotes[i]->name, len) && > - !remotes[i]->name[len]) : > - !strcmp(name, remotes[i]->name)) > - return remotes[i]; > - } > + if (!len) > + len = strlen(name); > + > + init_remotes_hash(); > + lookup.str = name; > + lookup.len = len; > + hashmap_entry_init(&lookup_entry, memhash(name, len)); > + > + if ((ret = hashmap_get(&remotes_hash, &lookup_entry, &lookup)) != NULL) > + return ret; > > ret = xcalloc(1, sizeof(struct remote)); > ret->prune = -1; /* unspecified */ > ALLOC_GROW(remotes, remotes_nr + 1, remotes_alloc); > remotes[remotes_nr++] = ret; > - if (len) > - ret->name = xstrndup(name, len); > - else > - ret->name = xstrdup(name); > + ret->name = xstrndup(name, len); > + > + hashmap_entry_init(ret, lookup_entry.hash); > + replaced = hashmap_put(&remotes_hash, ret); > + assert(replaced == NULL); /* no previous entry overwritten */ > return ret; > } > > @@ -722,13 +748,16 @@ struct remote *pushremote_get(const char *name) > > int remote_is_configured(const char *name) > { > - int i; > + struct remotes_hash_key lookup; > + struct hashmap_entry lookup_entry; > read_config(); > > - for (i = 0; i < remotes_nr; i++) > - if (!strcmp(name, remotes[i]->name)) > - return 1; > - return 0; > + init_remotes_hash(); > + lookup.str = name; > + lookup.len = strlen(name); > + hashmap_entry_init(&lookup_entry, memhash(name, lookup.len)); > + > + return hashmap_get(&remotes_hash, &lookup_entry, &lookup) != NULL; > } > > int for_each_remote(each_remote_fn fn, void *priv) > diff --git a/remote.h b/remote.h > index 917d383..81cb5ff 100644 > --- a/remote.h > +++ b/remote.h > @@ -1,6 +1,7 @@ > #ifndef REMOTE_H > #define REMOTE_H > > +#include "hashmap.h" > #include "parse-options.h" > > enum { > @@ -10,6 +11,8 @@ enum { > }; > > struct remote { > + struct hashmap_entry ent; /* must be first */ > + I stumbled about this comment "/* must be first */" when reading the changelog. Why does it need to be first? Is it a common reason I'm just not aware of, or would it make sense to put the reason into the comment as well? Thanks, Stefan > const char *name; > int origin; > > -- To unsubscribe from this list: send the line "unsubscribe git" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html