David Kastrup <dak@xxxxxxx> writes:

> Another option avoiding the realloc would be not to use linked lists
> at all but just collect all entries sequentially in "packed" form,
> and sort them into order afterwards.  That's an O(n lg n) operation
> with a large n.  Even if one has a sorting algorithm with good
> memory locality, I doubt that the locality would compensate for the
> lg n factor, even when taking into account that we save ourselves
> the copying.  And that does not even take into account the
> possibility of having to cull some buckets, another O(n) operation
> (which amounts to copying everything once, too).

As an illustration, I have implemented a proof-of-concept.  Below is a
patch (not formatted for submission) against the current master.  The
whole git-pack-objects run takes about a 20% performance hit, so the
diffing code itself is likely hit considerably worse.

For the proof-of-concept I did neither the hash bucket collision
culling nor the realloc to the possibly reduced size, so a fairer
comparison would involve removing the culling code from master (or
implementing the culling in the proof-of-concept, but I am too lazy
for that).

I hope this addresses the not-in-place complaint.  It is conceivable
that under tight memory conditions this approach might actually work
out better.  However, the glibc qsort implementation (at least it did
so at one time) falls back to a mergesort for large inputs and
allocates a temporary buffer of the same size as the original data.
If that is still the case, the memory impact would actually be worse
here.  One would need to implement a custom _real_ in-place sorter to
get around this.

All in all, I don't think this approach is worth the trouble: the
sorting overhead factor of lg n is just too much.
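For anyone who does not want to wade through the diff right away, here
is a condensed sketch of the idea, separate from the actual patch
below; the names (packed_entry, fill_bucket_bounds, bound[]) are mine,
only the ordering and layout match what the patch does:

/*
 * Toy condensation, not the patch itself.  Assume the entries have
 * already been written back-to-back into entries[] and sorted so that
 * everything hashing to bucket b (b = val & hmask) comes before
 * everything hashing to bucket b+1.  Then bucket b is simply the
 * range [bound[b], bound[b+1]) and neither a per-entry next pointer
 * nor a hash_count[] array is needed.  bound[] needs hsize + 1 slots.
 */
#include <stddef.h>

struct packed_entry {		/* parallels struct index_entry below */
	const unsigned char *ptr;
	unsigned int val;
};

static void fill_bucket_bounds(struct packed_entry *entries, size_t n,
			       struct packed_entry **bound,
			       unsigned int hsize, unsigned int hmask)
{
	unsigned int b = 0;
	size_t i;

	for (i = 0; i < n; i++)
		while (b <= (entries[i].val & hmask))
			bound[b++] = &entries[i];
	while (b <= hsize)		/* remaining (empty) buckets */
		bound[b++] = &entries[n];
}

Lookup then scans the contiguous range from bound[b] to bound[b+1]
instead of chasing next pointers, which is what the last hunk of the
patch changes.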
diff --git a/diff-delta.c b/diff-delta.c
index 0dde2f2..f6e2416 100644
--- a/diff-delta.c
+++ b/diff-delta.c
@@ -115,7 +115,6 @@ static const unsigned int U[256] = {
 struct index_entry {
 	const unsigned char *ptr;
 	unsigned int val;
-	struct index_entry *next;
 };
 
 struct delta_index {
@@ -126,12 +125,24 @@ struct delta_index {
 	struct index_entry *hash[FLEX_ARRAY];
 };
 
+static unsigned int cmp_hmask;
+
+static int cmp_index_entry(const void *va, const void *vb)
+{
+	const struct index_entry *a = va, *b = vb;
+	if ((a->val & cmp_hmask) != (b->val & cmp_hmask))
+		return (a->val & cmp_hmask) < (b->val & cmp_hmask) ? -1 : 1;
+	if (a->val != b->val)
+		return a->val < b->val ? -1 : 1;
+	return a->ptr < b->ptr ? -1 : 1;
+}
+
 struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 {
-	unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
+	unsigned int i, hsize, hmask, entries, prev_val, ihash;
 	const unsigned char *data, *buffer = buf;
 	struct delta_index *index;
-	struct index_entry *entry, **hash;
+	struct index_entry *entry, **hash, *entry_base;
 	void *mem;
 	unsigned long memsize;
 
@@ -149,7 +160,7 @@ struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 
 	/* allocate lookup index */
 	memsize = sizeof(*index) +
-		  sizeof(*hash) * hsize +
+		  sizeof(*hash) * (hsize + 1) +
 		  sizeof(*entry) * entries;
 	mem = malloc(memsize);
 	if (!mem)
@@ -157,21 +168,14 @@ struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 	index = mem;
 	mem = index + 1;
 	hash = mem;
-	mem = hash + hsize;
-	entry = mem;
+	mem = hash + hsize + 1;
+	entry_base = mem;
+	entry = entry_base;
 
 	index->memsize = memsize;
 	index->src_buf = buf;
 	index->src_size = bufsize;
 	index->hash_mask = hmask;
-	memset(hash, 0, hsize * sizeof(*hash));
-
-	/* allocate an array to count hash entries */
-	hash_count = calloc(hsize, sizeof(*hash_count));
-	if (!hash_count) {
-		free(index);
-		return NULL;
-	}
 
 	/* then populate the index */
 	prev_val = ~0;
@@ -184,29 +188,33 @@ struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 		if (val == prev_val) {
 			/* keep the lowest of consecutive identical blocks */
 			entry[-1].ptr = data + RABIN_WINDOW;
+			--entries;
 		} else {
 			prev_val = val;
 			i = val & hmask;
 			entry->ptr = data + RABIN_WINDOW;
 			entry->val = val;
-			entry->next = hash[i];
-			hash[i] = entry++;
-			hash_count[i]++;
+			entry++;
 		}
 	}
 
+	cmp_hmask = hmask;
+	qsort(entry_base, entries, sizeof(struct index_entry), cmp_index_entry);
+	ihash = 0;
+
+	for (i=0; i<entries; i++) {
+		entry = &entry_base[i];
+		if ((entry->val & hmask) >= ihash) {
+			/* cull here */
+			do {
+				hash[ihash++] = entry;
+			} while ((entry->val & hmask) >= ihash);
+		}
+	}
+	while (ihash <= hsize) {
+		hash[ihash++] = &entry_base[entries];
+	}
-	/*
-	 * Determine a limit on the number of entries in the same hash
-	 * bucket.  This guards us against pathological data sets causing
-	 * really bad hash distribution with most entries in the same hash
-	 * bucket that would bring us to O(m*n) computing costs (m and n
-	 * corresponding to reference and target buffer sizes).
-	 *
-	 * Make sure none of the hash buckets has more entries than
-	 * we're willing to test.  Otherwise we cull the entry list
-	 * uniformly to still preserve a good repartition across
-	 * the reference buffer.
-	 */
+#if 0
 	for (i = 0; i < hsize; i++) {
 		if (hash_count[i] < HASH_LIMIT)
 			continue;
@@ -221,6 +229,7 @@ struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 		} while(entry);
 	}
 	free(hash_count);
+#endif
 
 	return index;
 }
@@ -302,7 +311,7 @@ create_delta(const struct delta_index *index,
 			val ^= U[data[-RABIN_WINDOW]];
 			val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
 			i = val & index->hash_mask;
-			for (entry = index->hash[i]; entry; entry = entry->next) {
+			for (entry = index->hash[i]; entry < index->hash[i+1]; entry++) {
 				const unsigned char *ref = entry->ptr;
				const unsigned char *src = data;
 				unsigned int ref_size = ref_top - ref;
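As an aside on the "custom _real_ in-place sorter" mentioned above: it
would not need to be anything fancier than a heapsort over the packed
entry array.  A rough, untested sketch (my function names, not git
code; the ordering mirrors cmp_index_entry from the patch, with hmask
passed as a parameter instead of the cmp_hmask global):

#include <stddef.h>

struct index_entry {		/* mirrors the struct in the patch */
	const unsigned char *ptr;
	unsigned int val;
};

/* same ordering as cmp_index_entry: bucket, then val, then ptr */
static int entry_less(const struct index_entry *a,
		      const struct index_entry *b, unsigned int hmask)
{
	if ((a->val & hmask) != (b->val & hmask))
		return (a->val & hmask) < (b->val & hmask);
	if (a->val != b->val)
		return a->val < b->val;
	return a->ptr < b->ptr;
}

static void sift_down(struct index_entry *e, size_t root, size_t n,
		      unsigned int hmask)
{
	while (2 * root + 1 < n) {
		size_t child = 2 * root + 1;
		struct index_entry tmp;

		if (child + 1 < n && entry_less(&e[child], &e[child + 1], hmask))
			child++;	/* pick the larger child */
		if (!entry_less(&e[root], &e[child], hmask))
			return;		/* heap property already holds */
		tmp = e[root], e[root] = e[child], e[child] = tmp;
		root = child;
	}
}

/* in-place O(n lg n) sort, no scratch buffer */
static void sort_entries(struct index_entry *e, size_t n, unsigned int hmask)
{
	size_t i;

	for (i = n / 2; i > 0; i--)		/* build a max-heap */
		sift_down(e, i - 1, n, hmask);
	for (i = n; i > 1; i--) {		/* pop the max to the end */
		struct index_entry tmp = e[0];
		e[0] = e[i - 1], e[i - 1] = tmp;
		sift_down(e, 0, i - 1, hmask);
	}
}

That is O(n lg n) worst case with O(1) extra space, so it avoids the
mergesort scratch buffer, but heapsort's memory locality is typically
worse than glibc's merge sort, so the constant factor would presumably
suffer even more.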
-- 
David Kastrup, Kriemhildstr. 15, 44793 Bochum