Hi Gábor, On Sat, 24 Aug 2019, SZEDER Gábor wrote: > On Tue, Jul 16, 2019 at 07:58:42AM -0700, Slavica Djukic via GitGitGadget wrote: > > In the `git add -i` command, we show unique prefixes of the commands and > > files, to give an indication what prefix would select them. > > > > Naturally, the C implementation looks a lot different than the Perl > > implementation: in Perl, a trie is much easier implemented, while we > > already have a pretty neat hashmap implementation in C that we use for > > the purpose of storing (not necessarily unique) prefixes. > > > > The idea: for each item that we add, we generate prefixes starting with > > the first letter, then the first two letters, then three, etc, until we > > find a prefix that is unique (or until the prefix length would be > > longer than we want). If we encounter a previously-unique prefix on the > > way, we adjust that item's prefix to make it unique again (or we mark it > > as having no unique prefix if we failed to find one). These partial > > prefixes are stored in a hash map (for quick lookup times). > > > > To make sure that this function works as expected, we add a test using a > > special-purpose test helper that was added for that purpose. > > > > Note: We expect the list of prefix items to be passed in as a list of > > pointers rather than as regular list to avoid having to copy information > > (the actual items will most likely contain more information than just > > the name and the length of the unique prefix, but passing in `struct > > prefix_item *` would not allow for that). 
> > > diff --git a/prefix-map.c b/prefix-map.c > > new file mode 100644 > > index 0000000000..747ddb4ebc > > --- /dev/null > > +++ b/prefix-map.c > > @@ -0,0 +1,109 @@ > > +#include "cache.h" > > +#include "prefix-map.h" > > + > > +static int map_cmp(const void *unused_cmp_data, > > + const void *entry, > > + const void *entry_or_key, > > + const void *unused_keydata) > > +{ > > + const struct prefix_map_entry *a = entry; > > + const struct prefix_map_entry *b = entry_or_key; > > + > > + return a->prefix_length != b->prefix_length || > > + strncmp(a->name, b->name, a->prefix_length); > > +} > > + > > +static void add_prefix_entry(struct hashmap *map, const char *name, > > + size_t prefix_length, struct prefix_item *item) > > +{ > > + struct prefix_map_entry *result = xmalloc(sizeof(*result)); > > + result->name = name; > > + result->prefix_length = prefix_length; > > + result->item = item; > > + hashmap_entry_init(result, memhash(name, prefix_length)); > > + hashmap_add(map, result); > > +} > > + > > +static void init_prefix_map(struct prefix_map *prefix_map, > > + int min_prefix_length, int max_prefix_length) > > +{ > > + hashmap_init(&prefix_map->map, map_cmp, NULL, 0); > > + prefix_map->min_length = min_prefix_length; > > + prefix_map->max_length = max_prefix_length; > > +} > > + > > +static void add_prefix_item(struct prefix_map *prefix_map, > > + struct prefix_item *item) > > +{ > > + struct prefix_map_entry e = { { NULL } }, *e2; > > + int j; > > + > > + e.item = item; > > + e.name = item->name; > > + > > + for (j = prefix_map->min_length; > > + j <= prefix_map->max_length && e.name[j]; j++) { > > + /* Avoid breaking UTF-8 multi-byte sequences */ > > + if (!isascii(e.name[j])) > > + break; > > + > > + e.prefix_length = j; > > + hashmap_entry_init(&e, memhash(e.name, j)); > > + e2 = hashmap_get(&prefix_map->map, &e, NULL); > > + if (!e2) { > > + /* prefix is unique at this stage */ > > + item->prefix_length = j; > > + add_prefix_entry(&prefix_map->map, e.name, 
j, item); > > + break; > > + } > > + > > + if (!e2->item) > > + continue; /* non-unique prefix */ > > + > > + if (j != e2->item->prefix_length || memcmp(e.name, e2->name, j)) > > + BUG("unexpected prefix length: %d != %d (%s != %s)", > > + j, (int)e2->item->prefix_length, e.name, e2->name); > > + > > + /* skip common prefix */ > > + for (; j < prefix_map->max_length && e.name[j]; j++) { > > + if (e.item->name[j] != e2->item->name[j]) > > + break; > > + add_prefix_entry(&prefix_map->map, e.name, j + 1, > > + NULL); > > + } > > + > > + /* e2 no longer refers to a unique prefix */ > > + if (j < prefix_map->max_length && e2->name[j]) { > > + /* found a new unique prefix for e2's item */ > > + e2->item->prefix_length = j + 1; > > + add_prefix_entry(&prefix_map->map, e2->name, j + 1, > > + e2->item); > > + } > > + else > > + e2->item->prefix_length = 0; > > + e2->item = NULL; > > + > > + if (j < prefix_map->max_length && e.name[j]) { > > + /* found a unique prefix for the item */ > > + e.item->prefix_length = j + 1; > > + add_prefix_entry(&prefix_map->map, e.name, j + 1, > > + e.item); > > + } else > > + /* item has no (short enough) unique prefix */ > > + e.item->prefix_length = 0; > > + > > + break; > > + } > > +} > > + > > +void find_unique_prefixes(struct prefix_item **list, size_t nr, > > + int min_length, int max_length) > > +{ > > + int i; > > + struct prefix_map prefix_map; > > + > > + init_prefix_map(&prefix_map, min_length, max_length); > > + for (i = 0; i < nr; i++) > > + add_prefix_item(&prefix_map, list[i]); > > + hashmap_free(&prefix_map.map, 1); > > +} > > Between the commit message, the in-code comment, the names of the new > files, and implementation I was left somewhat confused about what this > is about and how it works. TBH, I didn't even try to understand how > all the above works, in particular the add_prefix_item() function. Let me try to explain it here, and maybe you can help me by suggesting an improved commit message and/or code comments? 
The problem is this: given a set of items with labels (e.g. file names), find, for each item, the unique prefix that identifies it. Example: given the files `hello.txt`, `heaven.txt` and `hell.txt`, the items' unique prefixes would be `hello`, `hea` and `hell.`, respectively. In `git add -i`, we actually only want to allow alphanumerical prefixes, and we also want at least one, and at most three characters, so only the second item would have an admissible unique prefix: `hea`. > However, I think it would be much-much simpler to first sort (a copy > of?) the array of prefix item pointers based on their 'name' field, > and then look for a unique prefix in each neighboring pair. Perhaps > it would even be faster, because it doesn't have to allocate a bunch > of hashmap items, though I don't think that it matters much in > practice (i.e. I expect the number of items to be fairly small; > presumably nobody will run interactive add after a mass refactoring > modifying thousands of files). The time complexity of the sorted list would be O(n*log(n)), while the hashmap-based complexity would be an amortized O(n). And yes, you would not _want_ to run interactive add after a mass refactoring. But it happens. It happens to me more times than I care to admit. And you know what? I really appreciate that even the Perl version is relatively snappy in those circumstances. > > diff --git a/prefix-map.h b/prefix-map.h > > new file mode 100644 > > index 0000000000..ce3b8a4a32 > > --- /dev/null > > +++ b/prefix-map.h > > @@ -0,0 +1,40 @@ > > +#ifndef PREFIX_MAP_H > > +#define PREFIX_MAP_H > > > > +#include "hashmap.h" > > + > > +struct prefix_item { > > + const char *name; > > + size_t prefix_length; > > +}; > > This struct is part of find_unique_prefixes()'s signature, good. 
> > > +struct prefix_map_entry { > > + struct hashmap_entry e; > > + const char *name; > > + size_t prefix_length; > > + /* if item is NULL, the prefix is not unique */ > > + struct prefix_item *item; > > +}; > > + > > +struct prefix_map { > > + struct hashmap map; > > + int min_length, max_length; > > +}; > > However, neither of these two structs nor the hashmap appear in the > function's signature, but are all implementation details. Therefore, > they should not be defined and included here in the header but in the > .c source file. (But as mentioned above, I think this could be > implemented much simpler without these data structures.) Right you are! > Furthermore, this is not a map. > A map, in general, is a container of key-value pairs that allows > efficient insertion, removal and lookup. This so-called prefix_map > does none of that, so it should not be called a map. What would you call it instead? (I went with "map" because the underlying data structure is a "hash map", I know, not the best argument, but I failed to find a better name...) I also have to admit that I thought that I could fix the design where `git-add--interactive.perl` creates this trie, but then still performs a linear search when searching by prefix. That seems not to be possible, though, as the unique prefixes are limited to certain character ranges, while the lookup-by-prefix is not limited in that way. > > +/* > > + * Find unique prefixes in a given list of strings. > > ... and stores the length of the unique prefixes in the > 'prefix_length' field of the elements of the given array. Good idea. I changed it to also explain what is meant by "unique prefix": * Given a list of names, find unique prefixes (i.e. the first <n> characters * that uniquely identify the names) and store the lengths of the unique * prefixes in the 'prefix_length' field of the elements of the given array.
> > + * > > + * Typically, the `struct prefix_item` information will be but a field in the > > s/but //, perhaps? Sure. I am relatively certain that it is correct grammar, but it is probably a good idea to remove it. > > + * actual item struct; For this reason, the `list` parameter is specified as a > > + * list of pointers to the items. > > + * > > + * The `min_length`/`max_length` parameters define what length the unique > > + * prefixes should have. > > + * > > + * If no unique prefix could be found for a given item, its `prefix_length` > > + * will be set to 0. > > + */ > > +void find_unique_prefixes(struct prefix_item **list, size_t nr, > > The first argument is not a list but an array. Indeed. > > + int min_length, int max_length); > > size_t, perhaps? These are closely related to > 'prefix_item.prefix_length', which is already (rightfully) size_t. True. > > +#endif > > diff --git a/t/helper/test-prefix-map.c b/t/helper/test-prefix-map.c > > new file mode 100644 > > index 0000000000..3f1c90eaf0 > > --- /dev/null > > +++ b/t/helper/test-prefix-map.c > > @@ -0,0 +1,58 @@ > > +#include "test-tool.h" > > +#include "cache.h" > > +#include "prefix-map.h" > > + > > +static size_t test_count, failed_count; > > + > > +static void check(int succeeded, const char *file, size_t line_no, > > + const char *fmt, ...) 
> > +{ > > + va_list ap; > > + > > + test_count++; > > + if (succeeded) > > + return; > > + > > + va_start(ap, fmt); > > + fprintf(stderr, "%s:%d: ", file, (int)line_no); > > + vfprintf(stderr, fmt, ap); > > + fputc('\n', stderr); > > + va_end(ap); > > + > > + failed_count++; > > +} > > + > > +#define EXPECT_SIZE_T_EQUALS(expect, actual, hint) \ > > + check(expect == actual, __FILE__, __LINE__, \ > > + "size_t's do not match: %" \ > > + PRIdMAX " != %" PRIdMAX " (%s) (%s)", \ > > + (intmax_t)expect, (intmax_t)actual, #actual, hint) > > + > > +int cmd__prefix_map(int argc, const char **argv) > > +{ > > +#define NR 5 > > + struct prefix_item items[NR] = { > > You don't have to tell the compiler how many elements this array will > contain, it will figure that out on its own. > > > + { "unique" }, > > + { "hell" }, > > + { "hello" }, > > + { "wok" }, > > + { "world" }, > > + }; > > + struct prefix_item *list[NR] = { > > Likewise. That is correct. What the compiler _cannot_ figure out, on its own, however, is that `items` and `list` _need_ to contain the same number of items. Hence the need for `NR`. > > + items, items + 1, items + 2, items + 3, items + 4 > > + }; > > + > > + find_unique_prefixes(list, NR, 1, 3); > > This could be find_unique_prefixes(list, ARRAY_SIZE(list), 1, 3), and > then there is no need for that NR macro anymore. > > > + > > +#define EXPECT_PREFIX_LENGTH_EQUALS(expect, index) \ > > + EXPECT_SIZE_T_EQUALS(expect, list[index]->prefix_length, \ > > + list[index]->name) > > + > > + EXPECT_PREFIX_LENGTH_EQUALS(1, 0); > > + EXPECT_PREFIX_LENGTH_EQUALS(0, 1); > > + EXPECT_PREFIX_LENGTH_EQUALS(0, 2); > > + EXPECT_PREFIX_LENGTH_EQUALS(3, 3); > > + EXPECT_PREFIX_LENGTH_EQUALS(3, 4); > > + > > + return !!failed_count; > > +} Thank you for your review! Dscho