[PATCH v2] combine-diff: coalesce lost lines optimally

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This replaces the greedy implementation to coalesce lost lines by using
dynamic programming to find the Longest Common Subsequence.

The O(n²) time complexity is obviously bigger than previous
implementation but it can produce shorter diff results (and most likely
easier to read).

List of lost lines is now doubly-linked because we reverse-read it when
reading the direction matrix.

Signed-off-by: Antoine Pelisse <apelisse@xxxxxxxxx>
---
Hey,
This version includes some tests and is based on
ap/combine-diff-ignore-whitespace.
As you can see the last test is broken because the solution is not
optimal for more than two parents. It would probably require to extend
the dynamic programming to a k-dimension matrix (for k parents) but the
result would end-up being O(n^k) (when removing n consecutives lines
from p parents). I'm not sure there is any better solution known yet to
the k-LCS problem.
Implementing the dynamic solution with the k-dimension matrix would
probably require to re-hash the strings (I guess it's already done by
xdiff), as the number of string comparisons would increase.

Anyway I'm not exactly sure where to go from here :)

Cheers,
Antoine

 combine-diff.c           |  255 ++++++++++++++++++++++++++++++++++------------
 t/t4038-diff-combined.sh |  129 +++++++++++++++++++++++
 2 files changed, 320 insertions(+), 64 deletions(-)

diff --git a/combine-diff.c b/combine-diff.c
index 6288485..77d7872 100644
--- a/combine-diff.c
+++ b/combine-diff.c
@@ -74,16 +74,24 @@ static struct combine_diff_path *intersect_paths(struct combine_diff_path *curr,

 /* Lines lost from parent */
 struct lline {
-	struct lline *next;
+	struct lline *next, *prev;
 	int len;
 	unsigned long parent_map;
 	char line[FLEX_ARRAY];
 };

+/* Lines lost from current parent (before coalescing) */
+struct plost {
+	struct lline *lost_head, *lost_tail;
+	int len;
+};
+
 /* Lines surviving in the merge result */
 struct sline {
-	struct lline *lost_head, **lost_tail;
-	struct lline *next_lost;
+	/* Accumulated and coalesced lost lines */
+	struct lline *lost;
+	int lenlost;
+	struct plost plost;
 	char *bol;
 	int len;
 	/* bit 0 up to (N-1) are on if the parent has this line (i.e.
@@ -95,34 +103,6 @@ struct sline {
 	unsigned long *p_lno;
 };

-static char *grab_blob(const unsigned char *sha1, unsigned int mode,
-		       unsigned long *size, struct userdiff_driver *textconv,
-		       const char *path)
-{
-	char *blob;
-	enum object_type type;
-
-	if (S_ISGITLINK(mode)) {
-		blob = xmalloc(100);
-		*size = snprintf(blob, 100,
-				 "Subproject commit %s\n", sha1_to_hex(sha1));
-	} else if (is_null_sha1(sha1)) {
-		/* deleted blob */
-		*size = 0;
-		return xcalloc(1, 1);
-	} else if (textconv) {
-		struct diff_filespec *df = alloc_filespec(path);
-		fill_filespec(df, sha1, 1, mode);
-		*size = fill_textconv(textconv, df, &blob);
-		free_filespec(df);
-	} else {
-		blob = read_sha1_file(sha1, &type, size);
-		if (type != OBJ_BLOB)
-			die("object '%s' is not a blob!", sha1_to_hex(sha1));
-	}
-	return blob;
-}
-
 static int match_string_spaces(const char *line1, int len1,
 			       const char *line2, int len2,
 			       long flags)
@@ -163,36 +143,180 @@ static int match_string_spaces(const char *line1, int len1,
 	return 0;
 }

-static void append_lost(struct sline *sline, int n, const char *line, int len, long flags)
+enum coalesce_direction { MATCH, BASE, NEW };
+
+/* Coalesce new lines into base by finding LCS */
+static struct lline *coalesce_lines(struct lline *base, int *lenbase,
+				    struct lline *new, int lennew,
+				    unsigned long parent, long flags)
 {
-	struct lline *lline;
-	unsigned long this_mask = (1UL<<n);
-	if (line[len-1] == '\n')
-		len--;
+	int **lcs;
+	enum coalesce_direction **directions;
+	struct lline *baseend, *newend = NULL;
+	int i, j, origbaselen = *lenbase;

-	/* Check to see if we can squash things */
-	if (sline->lost_head) {
-		lline = sline->next_lost;
-		while (lline) {
-			if (match_string_spaces(lline->line, lline->len,
-						line, len, flags)) {
-				lline->parent_map |= this_mask;
-				sline->next_lost = lline->next;
-				return;
+	if (new == NULL)
+		return base;
+
+	if (base == NULL) {
+		*lenbase = lennew;
+		return new;
+	}
+
+	/*
+	 * Coalesce new lines into base by finding the LCS
+	 * - Create the table to run dynamic programing
+	 * - Compute the LCS
+	 * - Then reverse read the direction structure:
+	 *   - If we have MATCH, assign parent to base flag, and consume
+	 *   both baseend and newend
+	 *   - Else if we have BASE, consume baseend
+	 *   - Else if we have NEW, insert newend lline into base and
+	 *   consume newend
+	 */
+	lcs = xcalloc(origbaselen + 1, sizeof(int*));
+	directions = xcalloc(origbaselen + 1, sizeof(enum coalesce_direction*));
+	for (i = 0; i < origbaselen + 1; i++) {
+		lcs[i] = xcalloc(lennew + 1, sizeof(int));
+		directions[i] = xcalloc(lennew + 1, sizeof(enum coalesce_direction));
+		directions[i][0] = BASE;
+	}
+	for (j = 1; j < lennew + 1; j++)
+		directions[0][j] = NEW;
+
+	for (i = 1, baseend = base; i < origbaselen + 1; i++) {
+		for (j = 1, newend = new; j < lennew + 1; j++) {
+			if (match_string_spaces(baseend->line, baseend->len,
+						newend->line, newend->len, flags)) {
+				lcs[i][j] = lcs[i - 1][j - 1] + 1;
+				directions[i][j] = MATCH;
+			} else if (lcs[i][j - 1] >= lcs[i - 1][j]) {
+				lcs[i][j] = lcs[i][j - 1];
+				directions[i][j] = NEW;
+			} else {
+				lcs[i][j] = lcs[i - 1][j];
+				directions[i][j] = BASE;
+			}
+			if (newend->next)
+				newend = newend->next;
+		}
+		if (baseend->next)
+			baseend = baseend->next;
+	}
+
+	for (i = 0; i < origbaselen + 1; i++)
+		free(lcs[i]);
+	free(lcs);
+
+	/* At this point, baseend and newend point to the end of each lists */
+	i--;
+	j--;
+	while (i != 0 || j != 0) {
+		if (directions[i][j] == MATCH) {
+			baseend->parent_map |= 1<<parent;
+			baseend = baseend->prev;
+			newend = newend->prev;
+			i--;
+			j--;
+		} else if (directions[i][j] == NEW) {
+			struct lline *lline;
+
+			lline = newend;
+			/* Remove lline from new list and update newend */
+			if (lline->prev)
+				lline->prev->next = lline->next;
+			else
+				new = lline->next;
+			if (lline->next)
+				lline->next->prev = lline->prev;
+
+			newend = lline->prev;
+			j--;
+
+			/* Add lline to base list */
+			if (baseend) {
+				lline->next = baseend->next;
+				lline->prev = baseend;
+				if (lline->prev)
+					lline->prev->next = lline;
 			}
-			lline = lline->next;
+			else {
+				lline->next = base;
+				base = lline;
+			}
+			(*lenbase)++;
+
+			if (lline->next)
+				lline->next->prev = lline;
+
+		} else {
+			baseend = baseend->prev;
+			i--;
 		}
 	}

+	newend = new;
+	while (newend) {
+		struct lline *lline = newend;
+		newend = newend->next;
+		free(lline);
+	}
+
+	for (i = 0; i < origbaselen + 1; i++)
+		free(directions[i]);
+	free(directions);
+
+	return base;
+}
+
+static char *grab_blob(const unsigned char *sha1, unsigned int mode,
+		       unsigned long *size, struct userdiff_driver *textconv,
+		       const char *path)
+{
+	char *blob;
+	enum object_type type;
+
+	if (S_ISGITLINK(mode)) {
+		blob = xmalloc(100);
+		*size = snprintf(blob, 100,
+				 "Subproject commit %s\n", sha1_to_hex(sha1));
+	} else if (is_null_sha1(sha1)) {
+		/* deleted blob */
+		*size = 0;
+		return xcalloc(1, 1);
+	} else if (textconv) {
+		struct diff_filespec *df = alloc_filespec(path);
+		fill_filespec(df, sha1, 1, mode);
+		*size = fill_textconv(textconv, df, &blob);
+		free_filespec(df);
+	} else {
+		blob = read_sha1_file(sha1, &type, size);
+		if (type != OBJ_BLOB)
+			die("object '%s' is not a blob!", sha1_to_hex(sha1));
+	}
+	return blob;
+}
+
+static void append_lost(struct sline *sline, int n, const char *line, int len)
+{
+	struct lline *lline;
+	unsigned long this_mask = (1UL<<n);
+	if (line[len-1] == '\n')
+		len--;
+
 	lline = xmalloc(sizeof(*lline) + len + 1);
 	lline->len = len;
 	lline->next = NULL;
+	lline->prev = sline->plost.lost_tail;
+	if (lline->prev)
+		lline->prev->next = lline;
+	else
+		sline->plost.lost_head = lline;
+	sline->plost.lost_tail = lline;
+	sline->plost.len++;
 	lline->parent_map = this_mask;
 	memcpy(lline->line, line, len);
 	lline->line[len] = 0;
-	*sline->lost_tail = lline;
-	sline->lost_tail = &lline->next;
-	sline->next_lost = NULL;
 }

 struct combine_diff_state {
@@ -203,7 +327,6 @@ struct combine_diff_state {
 	int n;
 	struct sline *sline;
 	struct sline *lost_bucket;
-	long flags;
 };

 static void consume_line(void *state_, char *line, unsigned long len)
@@ -236,14 +359,13 @@ static void consume_line(void *state_, char *line, unsigned long len)
 				xcalloc(state->num_parent,
 					sizeof(unsigned long));
 		state->sline[state->nb-1].p_lno[state->n] = state->ob;
-		state->lost_bucket->next_lost = state->lost_bucket->lost_head;
 		return;
 	}
 	if (!state->lost_bucket)
 		return; /* not in any hunk yet */
 	switch (line[0]) {
 	case '-':
-		append_lost(state->lost_bucket, state->n, line+1, len-1, state->flags);
+		append_lost(state->lost_bucket, state->n, line+1, len-1);
 		break;
 	case '+':
 		state->sline[state->lno-1].flag |= state->nmask;
@@ -276,7 +398,6 @@ static void combine_diff(const unsigned char *parent, unsigned int mode,
 	xpp.flags = flags;
 	memset(&xecfg, 0, sizeof(xecfg));
 	memset(&state, 0, sizeof(state));
-	state.flags = flags;
 	state.nmask = nmask;
 	state.sline = sline;
 	state.lno = 1;
@@ -298,8 +419,18 @@ static void combine_diff(const unsigned char *parent, unsigned int mode,
 		struct lline *ll;
 		sline[lno].p_lno[n] = p_lno;

+		/* Coalesce new lines */
+		if (sline[lno].plost.lost_head) {
+			struct sline *sl = &sline[lno];
+			sl->lost = coalesce_lines(sl->lost, &sl->lenlost,
+						  sl->plost.lost_head,
+						  sl->plost.len, n, flags);
+			sl->plost.lost_head = sl->plost.lost_tail = NULL;
+			sl->plost.len = 0;
+		}
+
 		/* How many lines would this sline advance the p_lno? */
-		ll = sline[lno].lost_head;
+		ll = sline[lno].lost;
 		while (ll) {
 			if (ll->parent_map & nmask)
 				p_lno++; /* '-' means parent had it */
@@ -319,7 +450,7 @@ static int interesting(struct sline *sline, unsigned long all_mask)
 	/* If some parents lost lines here, or if we have added to
 	 * some parent, it is interesting.
 	 */
-	return ((sline->flag & all_mask) || sline->lost_head);
+	return ((sline->flag & all_mask) || sline->lost);
 }

 static unsigned long adjust_hunk_tail(struct sline *sline,
@@ -502,7 +633,7 @@ static int make_hunks(struct sline *sline, unsigned long cnt,
 		has_interesting = 0;
 		for (j = i; j < hunk_end && !has_interesting; j++) {
 			unsigned long this_diff = sline[j].flag & all_mask;
-			struct lline *ll = sline[j].lost_head;
+			struct lline *ll = sline[j].lost;
 			if (this_diff) {
 				/* This has some changes.  Is it the
 				 * same as others?
@@ -656,7 +787,7 @@ static void dump_sline(struct sline *sline, const char *line_prefix,
 			int j;
 			unsigned long p_mask;
 			struct sline *sl = &sline[lno++];
-			ll = (sl->flag & no_pre_delete) ? NULL : sl->lost_head;
+			ll = (sl->flag & no_pre_delete) ? NULL : sl->lost;
 			while (ll) {
 				printf("%s%s", line_prefix, c_old);
 				for (j = 0; j < num_parent; j++) {
@@ -707,7 +838,7 @@ static void reuse_combine_diff(struct sline *sline, unsigned long cnt,
 	jmask = (1UL<<j);

 	for (lno = 0; lno <= cnt; lno++) {
-		struct lline *ll = sline->lost_head;
+		struct lline *ll = sline->lost;
 		sline->p_lno[i] = sline->p_lno[j];
 		while (ll) {
 			if (ll->parent_map & jmask)
@@ -966,10 +1097,6 @@ static void show_patch_diff(struct combine_diff_path *elem, int num_parent,

 	sline = xcalloc(cnt+2, sizeof(*sline));
 	sline[0].bol = result;
-	for (lno = 0; lno <= cnt + 1; lno++) {
-		sline[lno].lost_tail = &sline[lno].lost_head;
-		sline[lno].flag = 0;
-	}
 	for (lno = 0, cp = result; cp < result + result_size; cp++) {
 		if (*cp == '\n') {
 			sline[lno].len = cp - sline[lno].bol;
@@ -1019,8 +1146,8 @@ static void show_patch_diff(struct combine_diff_path *elem, int num_parent,
 	free(result);

 	for (lno = 0; lno < cnt; lno++) {
-		if (sline[lno].lost_head) {
-			struct lline *ll = sline[lno].lost_head;
+		if (sline[lno].lost) {
+			struct lline *ll = sline[lno].lost;
 			while (ll) {
 				struct lline *tmp = ll;
 				ll = ll->next;
diff --git a/t/t4038-diff-combined.sh b/t/t4038-diff-combined.sh
index b7e16a7..1261dbb 100755
--- a/t/t4038-diff-combined.sh
+++ b/t/t4038-diff-combined.sh
@@ -224,4 +224,133 @@ test_expect_success 'check combined output (ignore all spaces)' '
 	compare_diff_patch expected actual
 '

+test_expect_success 'combine diff coalesce simple' '
+	>test &&
+	git add test &&
+	git commit -m initial &&
+	test_seq 4 >test &&
+	git commit -a -m empty1 &&
+	git branch side1 &&
+	git checkout HEAD^ &&
+	test_seq 5 >test &&
+	git commit -a -m empty2 &&
+	test_must_fail git merge side1 &&
+	>test &&
+	git commit -a -m merge &&
+	git show >actual.tmp &&
+	sed -e "1,/^@@@/d" < actual.tmp >actual &&
+	tr -d Q <<-\EOF >expected &&
+	--1
+	--2
+	--3
+	--4
+	- 5
+	EOF
+	compare_diff_patch expected actual
+'
+
+test_expect_success 'combine diff coalesce tricky' '
+	>test &&
+	git add test &&
+	git commit -m initial --allow-empty &&
+	cat <<-\EOF >test &&
+	3
+	1
+	2
+	3
+	4
+	EOF
+	git commit -a -m empty1 &&
+	git branch -f side1 &&
+	git checkout HEAD^ &&
+	cat <<-\EOF >test &&
+	1
+	3
+	5
+	4
+	EOF
+	git commit -a -m empty2 &&
+	git branch -f side2 &&
+	test_must_fail git merge side1 &&
+	>test &&
+	git commit -a -m merge &&
+	git show >actual.tmp &&
+	sed -e "1,/^@@@/d" < actual.tmp >actual &&
+	tr -d Q <<-\EOF >expected &&
+	 -3
+	--1
+	 -2
+	--3
+	- 5
+	--4
+	EOF
+	compare_diff_patch expected actual &&
+	git checkout -f side1 &&
+	test_must_fail git merge side2 &&
+	>test &&
+	git commit -a -m merge &&
+	git show >actual.tmp &&
+	sed -e "1,/^@@@/d" < actual.tmp >actual &&
+	tr -d Q <<-\EOF >expected &&
+	- 3
+	--1
+	- 2
+	--3
+	 -5
+	--4
+	EOF
+	compare_diff_patch expected actual
+'
+
+test_expect_failure 'combine diff coalesce three parents' '
+	>test &&
+	git add test &&
+	git commit -m initial --allow-empty &&
+	cat <<-\EOF >test &&
+	3
+	1
+	2
+	3
+	4
+	EOF
+	git commit -a -m empty1 &&
+	git checkout -B side1 &&
+	git checkout HEAD^ &&
+	cat <<-\EOF >test &&
+	1
+	3
+	7
+	5
+	4
+	EOF
+	git commit -a -m empty2 &&
+	git branch -f side2 &&
+	git checkout HEAD^ &&
+	cat <<-\EOF >test &&
+	3
+	1
+	6
+	5
+	4
+	EOF
+	git commit -a -m empty3 &&
+	>test &&
+	git add test &&
+	TREE=$(git write-tree) &&
+	COMMIT=$(git commit-tree -p HEAD -p side1 -p side2 -m merge $TREE) &&
+	git show $COMMIT >actual.tmp &&
+	sed -e "1,/^@@@/d" < actual.tmp >actual &&
+	tr -d Q <<-\EOF >expected &&
+	-- 3
+	---1
+	-  6
+	 - 2
+	 --3
+	  -7
+	- -5
+	---4
+	EOF
+	compare_diff_patch expected actual
+'
+
 test_done
--
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]