Re: [PATCH v3 07/11] Add a function to determine unique prefixes for a list of strings

Johannes Schindelin <Johannes.Schindelin@xxxxxx> · Tue, 27 Aug 2019 14:14:06 +0200 (CEST)

Hi Gábor,

On Sat, 24 Aug 2019, SZEDER Gábor wrote:

> On Tue, Jul 16, 2019 at 07:58:42AM -0700, Slavica Djukic via GitGitGadget wrote:
> > In the `git add -i` command, we show unique prefixes of the commands and
> > files, to give an indication what prefix would select them.
> >
> > Naturally, the C implementation looks a lot different than the Perl
> > implementation: in Perl, a trie is much easier implemented, while we
> > already have a pretty neat hashmap implementation in C that we use for
> > the purpose of storing (not necessarily unique) prefixes.
> >
> > The idea: for each item that we add, we generate prefixes starting with
> > the first letter, then the first two letters, then three, etc, until we
> > find a prefix that is unique (or until the prefix length would be
> > longer than we want). If we encounter a previously-unique prefix on the
> > way, we adjust that item's prefix to make it unique again (or we mark it
> > as having no unique prefix if we failed to find one). These partial
> > prefixes are stored in a hash map (for quick lookup times).
> >
> > To make sure that this function works as expected, we add a test using a
> > special-purpose test helper that was added for that purpose.
> >
> > Note: We expect the list of prefix items to be passed in as a list of
> > pointers rather than as regular list to avoid having to copy information
> > (the actual items will most likely contain more information than just
> > the name and the length of the unique prefix, but passing in `struct
> > prefix_item *` would not allow for that).
>
> > diff --git a/prefix-map.c b/prefix-map.c
> > new file mode 100644
> > index 0000000000..747ddb4ebc
> > --- /dev/null
> > +++ b/prefix-map.c
> > @@ -0,0 +1,109 @@
> > +#include "cache.h"
> > +#include "prefix-map.h"
> > +
> > +static int map_cmp(const void *unused_cmp_data,
> > +		   const void *entry,
> > +		   const void *entry_or_key,
> > +		   const void *unused_keydata)
> > +{
> > +	const struct prefix_map_entry *a = entry;
> > +	const struct prefix_map_entry *b = entry_or_key;
> > +
> > +	return a->prefix_length != b->prefix_length ||
> > +		strncmp(a->name, b->name, a->prefix_length);
> > +}
> > +
> > +static void add_prefix_entry(struct hashmap *map, const char *name,
> > +			     size_t prefix_length, struct prefix_item *item)
> > +{
> > +	struct prefix_map_entry *result = xmalloc(sizeof(*result));
> > +	result->name = name;
> > +	result->prefix_length = prefix_length;
> > +	result->item = item;
> > +	hashmap_entry_init(result, memhash(name, prefix_length));
> > +	hashmap_add(map, result);
> > +}
> > +
> > +static void init_prefix_map(struct prefix_map *prefix_map,
> > +			    int min_prefix_length, int max_prefix_length)
> > +{
> > +	hashmap_init(&prefix_map->map, map_cmp, NULL, 0);
> > +	prefix_map->min_length = min_prefix_length;
> > +	prefix_map->max_length = max_prefix_length;
> > +}
> > +
> > +static void add_prefix_item(struct prefix_map *prefix_map,
> > +			    struct prefix_item *item)
> > +{
> > +	struct prefix_map_entry e = { { NULL } }, *e2;
> > +	int j;
> > +
> > +	e.item = item;
> > +	e.name = item->name;
> > +
> > +	for (j = prefix_map->min_length;
> > +	     j <= prefix_map->max_length && e.name[j]; j++) {
> > +		/* Avoid breaking UTF-8 multi-byte sequences */
> > +		if (!isascii(e.name[j]))
> > +			break;
> > +
> > +		e.prefix_length = j;
> > +		hashmap_entry_init(&e, memhash(e.name, j));
> > +		e2 = hashmap_get(&prefix_map->map, &e, NULL);
> > +		if (!e2) {
> > +			/* prefix is unique at this stage */
> > +			item->prefix_length = j;
> > +			add_prefix_entry(&prefix_map->map, e.name, j, item);
> > +			break;
> > +		}
> > +
> > +		if (!e2->item)
> > +			continue; /* non-unique prefix */
> > +
> > +		if (j != e2->item->prefix_length || memcmp(e.name, e2->name, j))
> > +			BUG("unexpected prefix length: %d != %d (%s != %s)",
> > +			    j, (int)e2->item->prefix_length, e.name, e2->name);
> > +
> > +		/* skip common prefix */
> > +		for (; j < prefix_map->max_length && e.name[j]; j++) {
> > +			if (e.item->name[j] != e2->item->name[j])
> > +				break;
> > +			add_prefix_entry(&prefix_map->map, e.name, j + 1,
> > +					 NULL);
> > +		}
> > +
> > +		/* e2 no longer refers to a unique prefix */
> > +		if (j < prefix_map->max_length && e2->name[j]) {
> > +			/* found a new unique prefix for e2's item */
> > +			e2->item->prefix_length = j + 1;
> > +			add_prefix_entry(&prefix_map->map, e2->name, j + 1,
> > +					 e2->item);
> > +		}
> > +		else
> > +			e2->item->prefix_length = 0;
> > +		e2->item = NULL;
> > +
> > +		if (j < prefix_map->max_length && e.name[j]) {
> > +			/* found a unique prefix for the item */
> > +			e.item->prefix_length = j + 1;
> > +			add_prefix_entry(&prefix_map->map, e.name, j + 1,
> > +					 e.item);
> > +		} else
> > +			/* item has no (short enough) unique prefix */
> > +			e.item->prefix_length = 0;
> > +
> > +		break;
> > +	}
> > +}
> > +
> > +void find_unique_prefixes(struct prefix_item **list, size_t nr,
> > +			  int min_length, int max_length)
> > +{
> > +	int i;
> > +	struct prefix_map prefix_map;
> > +
> > +	init_prefix_map(&prefix_map, min_length, max_length);
> > +	for (i = 0; i < nr; i++)
> > +		add_prefix_item(&prefix_map, list[i]);
> > +	hashmap_free(&prefix_map.map, 1);
> > +}
>
> Between the commit message, the in-code comment, the names of the new
> files, and implementation I was left somewhat confused about what this
> is about and how it works.  TBH, I didn't even try to understand how
> all the above works, in particular the add_prefix_item() function.

Let me try to explain it here, and maybe you can help me by suggesting
an improved commit message and/or code comments?

The problem is this: given a set of items with labels (e.g. file names),
find, for each item, the unique prefix that identifies it. Example:
given the files `hello.txt`, `heaven.txt` and `hell.txt`, the items'
unique prefixes would be `hello`, `hea` and `hell.`, respectively.

In `git add -i`, we actually only want to allow alphanumerical prefixes,
and we also want at least one, and at most three characters, so only the
second item would have an admissible unique prefix: `hea`.

> However, I think it would be much-much simpler to first sort (a copy
> of?) the array of prefix item pointers based on their 'name' field,
> and then look for a unique prefix in each neighboring pair.  Perhaps
> it would even be faster, because it doesn't have to allocate a bunch
> of hashmap items, though I don't think that it matters much in
> practice (i.e. I expect the number of items to be fairly small;
> presumably nobody will run interactive add after a mass refactoring
> modifying thousands of files).

The time complexity of the sorted-list approach would be O(n*log(n)),
while that of the hashmap-based approach would be an amortized O(n).

And yes, you would not _want_ to run interactive add after a mass
refactoring. But it happens. It happens to me more times than I care to
admit. And you know what? I really appreciate that even the Perl version
is relatively snappy in those circumstances.

> > diff --git a/prefix-map.h b/prefix-map.h
> > new file mode 100644
> > index 0000000000..ce3b8a4a32
> > --- /dev/null
> > +++ b/prefix-map.h
> > @@ -0,0 +1,40 @@
> > +#ifndef PREFIX_MAP_H
> > +#define PREFIX_MAP_H
> >
> > +#include "hashmap.h"
> > +
> > +struct prefix_item {
> > +	const char *name;
> > +	size_t prefix_length;
> > +};
>
> This struct is part of find_unique_prefixes()'s signature, good.
>
> > +struct prefix_map_entry {
> > +	struct hashmap_entry e;
> > +	const char *name;
> > +	size_t prefix_length;
> > +	/* if item is NULL, the prefix is not unique */
> > +	struct prefix_item *item;
> > +};
> > +
> > +struct prefix_map {
> > +	struct hashmap map;
> > +	int min_length, max_length;
> > +};
>
> However, neither of these two structs nor the hashmap appear in the
> function's signature, but are all implementation details.  Therefore,
> they should not be defined and included here in the header but in the
> .c source file.  (But as mentioned above, I think this could be
> implemented much simpler without these data structures.)

Right you are!

> Furthermore, this is not a map.
> A map, in general, is a container of key-value pairs that allows
> efficient insertion, removal and lookup.  This so-called prefix_map
> does none of that, so it should not be called a map.

What would you call it instead?

(I went with "map" because the underlying data structure is a "hash
map". I know, that is not the best argument, but I failed to find a
better name...)

I also have to admit that I thought that I could fix the design where
`git-add--interactive.perl` creates this trie, but then still performs a
linear search when searching by prefix. That seems not to be possible,
though, as the unique prefixes are limited to certain character ranges,
while the lookup-by-prefix is not limited in that way.

> > +/*
> > + * Find unique prefixes in a given list of strings.
>
> ... and stores the length of the unique prefixes in the
> 'prefix_length' field of the elements of the given array.

Good idea. I changed it to also explain what is meant by "unique
prefix":

 * Given a list of names, find unique prefixes (i.e. the first <n> characters
 * that uniquely identify the names) and store the lengths of the unique
 * prefixes in the 'prefix_length' field of the elements of the given array.

> > + *
> > + * Typically, the `struct prefix_item` information will be but a field in the
>
> s/but //, perhaps?

Sure. I am relatively certain that it is correct grammar, but it is
probably a good idea to remove it.

> > + * actual item struct; For this reason, the `list` parameter is specified as a
> > + * list of pointers to the items.
> > + *
> > + * The `min_length`/`max_length` parameters define what length the unique
> > + * prefixes should have.
> > + *
> > + * If no unique prefix could be found for a given item, its `prefix_length`
> > + * will be set to 0.
> > + */
> > +void find_unique_prefixes(struct prefix_item **list, size_t nr,
>
> The first argument is not a list but an array.

Indeed.

> > +			  int min_length, int max_length);
>
> size_t, perhaps?  These are closely related to
> 'prefix_item.prefix_length', which is already (rightfully) size_t.

True.

> > +#endif
> > diff --git a/t/helper/test-prefix-map.c b/t/helper/test-prefix-map.c
> > new file mode 100644
> > index 0000000000..3f1c90eaf0
> > --- /dev/null
> > +++ b/t/helper/test-prefix-map.c
> > @@ -0,0 +1,58 @@
> > +#include "test-tool.h"
> > +#include "cache.h"
> > +#include "prefix-map.h"
> > +
> > +static size_t test_count, failed_count;
> > +
> > +static void check(int succeeded, const char *file, size_t line_no,
> > +		  const char *fmt, ...)
> > +{
> > +	va_list ap;
> > +
> > +	test_count++;
> > +	if (succeeded)
> > +		return;
> > +
> > +	va_start(ap, fmt);
> > +	fprintf(stderr, "%s:%d: ", file, (int)line_no);
> > +	vfprintf(stderr, fmt, ap);
> > +	fputc('\n', stderr);
> > +	va_end(ap);
> > +
> > +	failed_count++;
> > +}
> > +
> > +#define EXPECT_SIZE_T_EQUALS(expect, actual, hint) \
> > +	check(expect == actual, __FILE__, __LINE__, \
> > +	      "size_t's do not match: %" \
> > +	      PRIdMAX " != %" PRIdMAX " (%s) (%s)", \
> > +	      (intmax_t)expect, (intmax_t)actual, #actual, hint)
> > +
> > +int cmd__prefix_map(int argc, const char **argv)
> > +{
> > +#define NR 5
> > +	struct prefix_item items[NR] = {
>
> You don't have to tell the compiler how many elements this array will
> contain, it will figure that out on its own.
>
> > +		{ "unique" },
> > +		{ "hell" },
> > +		{ "hello" },
> > +		{ "wok" },
> > +		{ "world" },
> > +	};
> > +	struct prefix_item *list[NR] = {
>
> Likewise.

That is correct.

What the compiler _cannot_ figure out, on its own, however, is that
`items` and `list` _need_ to contain the same number of items.

Hence the need for `NR`.

> > +		items, items + 1, items + 2, items + 3, items + 4
> > +	};
> > +
> > +	find_unique_prefixes(list, NR, 1, 3);
>
> This could be find_unique_prefixes(list, ARRAY_SIZE(list), 1, 3), and
> then there is no need for that NR macro anymore.
>
> > +
> > +#define EXPECT_PREFIX_LENGTH_EQUALS(expect, index) \
> > +	EXPECT_SIZE_T_EQUALS(expect, list[index]->prefix_length, \
> > +			     list[index]->name)
> > +
> > +	EXPECT_PREFIX_LENGTH_EQUALS(1, 0);
> > +	EXPECT_PREFIX_LENGTH_EQUALS(0, 1);
> > +	EXPECT_PREFIX_LENGTH_EQUALS(0, 2);
> > +	EXPECT_PREFIX_LENGTH_EQUALS(3, 3);
> > +	EXPECT_PREFIX_LENGTH_EQUALS(3, 4);
> > +
> > +	return !!failed_count;
> > +}

Thank you for your review!
Dscho