[PATCH 5/7] diff: unified diff with colored words, step 1, unified diff only

Nguyễn Thái Ngọc Duy <pclouds@xxxxxxxxx> · Thu, 31 Dec 2015 19:37:35 +0700

The goal is to produce a unified diff, but with changed words colored
differently. A new diff-words mode is added that keeps track of both
the lines and the words of each chunk. The marks are then post-processed
and each line is output in unified format. The actual word coloring
comes in the next patch.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@xxxxxxxxx>
---
 diff.c | 256 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 diff.h |   3 +-
 2 files changed, 255 insertions(+), 4 deletions(-)

diff --git a/diff.c b/diff.c
index 8a9e42f..3b7317e 100644
--- a/diff.c
+++ b/diff.c
@@ -440,6 +440,23 @@ static void check_blank_at_eof(mmfile_t *mf1, mmfile_t *mf2,
 	ecbdata->blank_at_eof_in_postimage = (at - l2) + 1;
 }
 
+#define TAG_BEGIN_WORD(tag) \
+	((tag) == TAG_BEGIN_OLD_WORD || \
+	 (tag) == TAG_BEGIN_NEW_WORD)
+
+enum pointer_tag { /* order is important because it's used in sorting */
+	TAG_END_WORD, /* end of an old or new word */
+	TAG_END_LINE, /* end of a line in the buffered text */
+	TAG_BEGIN_LINE, /* start of a line in the buffered text */
+	TAG_BEGIN_OLD_WORD, /* start of a word reported with the "old" style */
+	TAG_BEGIN_NEW_WORD /* start of a word reported with the "new" style */
+};
+
+struct tagged_pointer {
+	const char *str; /* position the mark refers to */
+	enum pointer_tag tag; /* what begins or ends at str */
+};
+
 static void emit_line_0(struct diff_options *o, const char *set, const char *reset,
 			int first, const char *line, int len)
 {
@@ -757,6 +774,10 @@ struct diff_words_buffer {
 		const char *begin, *end;
 	} *orig;
 	int orig_nr, orig_alloc;
+	unsigned long *line;
+	int line_nr, line_alloc;
+	struct tagged_pointer *mark;
+	int mark_nr, mark_alloc;
 };
 
 struct diff_words_style_elem {
@@ -772,6 +793,8 @@ struct diff_words_style {
 	const char *newline;
 };
 
+static struct diff_words_style diff_words_unified_style;
+
 static struct diff_words_style diff_words_styles[] = {
 	{ DIFF_WORDS_PORCELAIN, {"+", "\n"}, {"-", "\n"}, {" ", "\n"}, "~\n" },
 	{ DIFF_WORDS_PLAIN, {"{+", "+}"}, {"[-", "-]"}, {"", ""}, "\n" },
@@ -803,6 +826,13 @@ static void diff_words_append(struct diff_words_data *diff_words,
 	line++;
 	len--;
 	memcpy(buffer->text.ptr + buffer->text.size, line, len);
+	if (diff_words->type == DIFF_WORDS_UNIFIED) {
+		unsigned long *l;
+		ALLOC_GROW(buffer->line, (buffer->line_nr + 1) * 2, buffer->line_alloc);
+		l = buffer->line + (buffer->line_nr++) * 2;
+		l[0] = buffer->text.size;
+		l[1] = l[0] + len;
+	}
 	buffer->text.size += len;
 	buffer->text.ptr[buffer->text.size] = '\0';
 }
@@ -816,6 +846,40 @@ static int fn_out_diff_words_write_helper(struct diff_words_data *dw,
 	FILE *fp = dw->opt->file;
 	int print = 0;
 
+	if (dw->type == DIFF_WORDS_UNIFIED) {
+		struct diff_words_style *st = &diff_words_unified_style;
+		struct diff_words_buffer *b;
+		enum pointer_tag tag;
+		struct tagged_pointer *tp;
+
+		if (st_el == &st->ctx)
+			return 0;
+		else if (st_el == &st->old)
+			tag = TAG_BEGIN_OLD_WORD;
+		else if (st_el == &st->new)
+			tag = TAG_BEGIN_NEW_WORD;
+		else
+			return -1;
+
+		if (buf >= dw->minus.text.ptr &&
+		    buf < dw->minus.text.ptr + dw->minus.text.size)
+			b = &dw->minus;
+		else if (buf >= dw->plus.text.ptr &&
+			 buf < dw->plus.text.ptr + dw->plus.text.size)
+			b = &dw->plus;
+		else
+			return -1;
+
+		ALLOC_GROW(b->mark, b->mark_nr + 2, b->mark_alloc);
+		tp = b->mark + b->mark_nr;
+		tp[0].str = buf;
+		tp[0].tag = tag;
+		tp[1].str = buf + count;
+		tp[1].tag = TAG_END_WORD;
+		b->mark_nr += 2;
+		return 0;
+	}
+
 	while (count) {
 		char *p = memchr(buf, '\n', count);
 		if (print)
@@ -1014,6 +1078,23 @@ static void diff_words_fill(struct diff_words_buffer *buffer, mmfile_t *out,
 	}
 }
 
+/* Append (never overwrite) a BEGIN/END_LINE mark pair per entry of line[] */
+static void diff_words_add_line(struct diff_words_buffer *buffer)
+{
+	int i, need = buffer->mark_nr + buffer->line_nr * 2;
+
+	ALLOC_GROW(buffer->mark, need, buffer->mark_alloc);
+	for (i = 0; i < buffer->line_nr; i++) {
+		struct tagged_pointer *tp = buffer->mark + buffer->mark_nr;
+		tp[0].str = buffer->text.ptr + buffer->line[i * 2];
+		tp[0].tag = TAG_BEGIN_LINE;
+		tp[1].str = buffer->text.ptr + buffer->line[i * 2 + 1];
+		tp[1].tag = TAG_END_LINE;
+		buffer->mark_nr += 2;
+	}
+	buffer->line_nr = 0;
+}
+
 /* this executes the word diff on the accumulated buffers */
 static void diff_words_show(struct diff_words_data *diff_words)
 {
@@ -1025,6 +1106,16 @@ static void diff_words_show(struct diff_words_data *diff_words)
 	struct diff_options *opt = diff_words->opt;
 	const char *line_prefix;
 
+	if (diff_words->type == DIFF_WORDS_UNIFIED) {
+		/*
+		 * line marks are collected in line[] array as offsets
+		 * because the "text" buffer can be reallocated. Now
+		 * it's safe to convert line[] to mark[].
+		 */
+		diff_words_add_line(&diff_words->minus);
+		diff_words_add_line(&diff_words->plus);
+	}
+
 	assert(opt);
 	line_prefix = diff_line_prefix(opt);
 
@@ -1077,12 +1168,152 @@ static void diff_words_show(struct diff_words_data *diff_words)
 	diff_words->minus.text.size = diff_words->plus.text.size = 0;
 }
 
+static int tagptrcmp(const void *a_, const void *b_) /* <0, 0, >0 for qsort */
+{
+	const struct tagged_pointer *a = a_, *b = b_;
+	int by_tag = (a->tag > b->tag) - (a->tag < b->tag); /* tie-break */
+	return a->str == b->str ? by_tag : a->str < b->str ? -1 : 1;
+}
+
+/* Normalize b->mark[]: join adjacent words, sort, split words at line ends */
+static void diff_words_buffer_finalize_marks(struct diff_words_buffer *b)
+{
+	struct tagged_pointer *dst, *src, *next, *end, *cur_word;
+	int i, dst_nr, dst_alloc;
+
+	/* Join consecutive same-type words */
+	end = b->mark + b->mark_nr;
+	for (src = b->mark; src < end; src++) {
+		next = src + 2;
+		while (next + 2 <= end &&
+		       TAG_BEGIN_WORD(src[0].tag) &&
+		       TAG_BEGIN_WORD(next[0].tag) &&
+		       src[0].tag == next[0].tag &&
+		       src[1].tag == TAG_END_WORD &&
+		       next[1].tag == TAG_END_WORD &&
+		       src[1].str == next[0].str) {
+			src[1] = next[1];
+			/* the ranges overlap, which memcpy() does not allow */
+			memmove(next, next + 2, sizeof(*next) * (end - next - 2));
+			end -= 2;
+		}
+	}
+	b->mark_nr = end - b->mark;
+
+	/*
+	 * Simplify one-word chunks. Note that at this point we have
+	 * all line marks (in correct order), then all word marks.
+	 */
+	if (b->mark_nr >= 4 &&
+	    b->mark[0].tag == TAG_BEGIN_LINE &&
+	    TAG_BEGIN_WORD(b->mark[b->mark_nr - 2].tag) &&
+	    b->mark[0].str == b->mark[b->mark_nr - 2].str &&
+	    b->mark[b->mark_nr - 1].tag == TAG_END_WORD &&
+	    b->mark[b->mark_nr - 3].tag == TAG_END_LINE &&
+	    b->mark[b->mark_nr - 1].str == b->mark[b->mark_nr - 3].str) {
+		b->mark_nr -= 2;
+		return;
+	}
+
+	/* Move words into lines */
+	qsort(b->mark, b->mark_nr, sizeof(*b->mark), tagptrcmp);
+
+	/* Split words that span across lines */
+	cur_word = dst = NULL;
+	dst_nr = dst_alloc = 0;
+	ALLOC_GROW(dst, b->mark_nr, dst_alloc);
+	for (i = 0; i < b->mark_nr; i++) {
+		struct tagged_pointer *m = b->mark + i;
+
+		switch (m->tag) {
+		case TAG_BEGIN_OLD_WORD:
+		case TAG_BEGIN_NEW_WORD:
+			ALLOC_GROW(dst, dst_nr + 1, dst_alloc);
+			dst[dst_nr++] = *m;
+			cur_word = m;
+			break;
+
+		case TAG_END_WORD:
+			ALLOC_GROW(dst, dst_nr + 1, dst_alloc);
+			dst[dst_nr++] = *m;
+			cur_word = NULL;
+			break;
+
+		case TAG_BEGIN_LINE:
+			ALLOC_GROW(dst, dst_nr + 2, dst_alloc);
+			dst[dst_nr++] = *m;
+			if (cur_word) {
+				dst[dst_nr].tag = cur_word->tag;
+				dst[dst_nr].str = m->str;
+				dst_nr++;
+			}
+			break;
+
+		case TAG_END_LINE:
+			ALLOC_GROW(dst, dst_nr + 2, dst_alloc);
+			if (cur_word) {
+				dst[dst_nr].tag = TAG_END_WORD;
+				dst[dst_nr].str = m->str;
+				dst_nr++;
+			}
+			dst[dst_nr++] = *m;
+			break;
+		}
+	}
+
+	free(b->mark);
+	b->mark = dst;
+	b->mark_nr = dst_nr;
+	b->mark_alloc = dst_alloc;
+}
+
+/*
+ * Emit the lines collected for one side ('-' or '+') of the current chunk
+ * in unified format. Any other sign, or a side without marks, is a no-op.
+ */
+static void diff_words_flush_unified(struct emit_callback *ecb,
+				     enum color_diff color,
+				     unsigned ws_error_highlight,
+				     char sign)
+{
+	const char *reset = diff_get_color(ecb->color_diff, DIFF_RESET);
+	struct diff_words_data *dw = ecb->diff_words;
+	struct diff_words_buffer *b;
+	struct tagged_pointer *begin_line, *end, *end_line;
+
+	if (sign == '-')
+		b = &dw->minus;
+	else if (sign == '+')
+		b = &dw->plus;
+	else
+		return;
+	if (!b->mark_nr)
+		return;
+
+	diff_words_buffer_finalize_marks(b);
+	end = b->mark + b->mark_nr;
+	for (begin_line = b->mark; begin_line < end; begin_line = end_line + 1) {
+		assert(begin_line->tag == TAG_BEGIN_LINE);
+		end_line = begin_line;
+		while (end_line < end && end_line->tag != TAG_END_LINE)
+			end_line++;
+		/* check the bound first: "end" itself must not be dereferenced */
+		assert(end_line < end && end_line->tag == TAG_END_LINE);
+		emit_line_checked(reset, ecb, begin_line->str,
+				  end_line->str - begin_line->str,
+				  color, ws_error_highlight, sign);
+	}
+	b->mark_nr = 0; /* all marks of this side have been consumed */
+}
+
 /* In "color-words" mode, show word-diff of words accumulated in the buffer */
 static void diff_words_flush(struct emit_callback *ecbdata)
 {
 	if (ecbdata->diff_words->minus.text.size ||
 	    ecbdata->diff_words->plus.text.size)
 		diff_words_show(ecbdata->diff_words);
+	diff_words_flush_unified(ecbdata, DIFF_FILE_OLD, WSEH_OLD, '-');
+	diff_words_flush_unified(ecbdata, DIFF_FILE_NEW, WSEH_NEW, '+');
 }
 
 static void diff_filespec_load_driver(struct diff_filespec *one)
@@ -1133,6 +1364,10 @@ static void init_diff_words_data(struct emit_callback *ecbdata,
 			die ("Invalid regular expression: %s",
 			     o->word_regex);
 	}
+	if (o->word_diff == DIFF_WORDS_UNIFIED) {
+		ecbdata->diff_words->style = &diff_words_unified_style;
+		return;
+	}
 	for (i = 0; i < ARRAY_SIZE(diff_words_styles); i++) {
 		if (o->word_diff == diff_words_styles[i].type) {
 			ecbdata->diff_words->style =
@@ -1155,8 +1390,12 @@ static void free_diff_words_data(struct emit_callback *ecbdata)
 		free (ecbdata->diff_words->opt);
 		free (ecbdata->diff_words->minus.text.ptr);
 		free (ecbdata->diff_words->minus.orig);
+		free (ecbdata->diff_words->minus.line);
+		free (ecbdata->diff_words->minus.mark);
 		free (ecbdata->diff_words->plus.text.ptr);
 		free (ecbdata->diff_words->plus.orig);
+		free (ecbdata->diff_words->plus.line);
+		free (ecbdata->diff_words->plus.mark);
 		if (ecbdata->diff_words->word_regex) {
 			regfree(ecbdata->diff_words->word_regex);
 			free(ecbdata->diff_words->word_regex);
@@ -1274,7 +1513,8 @@ static void fn_out_consume(void *priv, char *line, unsigned long len)
 		if (line[0] == '-' || line[0] == '+') {
 			diff_words_append(ecbdata->diff_words, line, len);
 			return;
-		} else if (starts_with(line, "\\ ")) {
+		} else if (ecbdata->diff_words->type != DIFF_WORDS_UNIFIED &&
+			   starts_with(line, "\\ ")) {
 			/*
 			 * Eat the "no newline at eof" marker as if we
 			 * saw a "+" or "-" line with nothing on it,
@@ -1288,7 +1528,8 @@ static void fn_out_consume(void *priv, char *line, unsigned long len)
 		if (ecbdata->diff_words->type == DIFF_WORDS_PORCELAIN) {
 			emit_line(ecbdata->opt, context, reset, line, len);
 			fputs("~\n", ecbdata->opt->file);
-		} else {
+			return;
+		} else if (ecbdata->diff_words->type != DIFF_WORDS_UNIFIED) {
 			/*
 			 * Skip the prefix character, if any.  With
 			 * diff_suppress_blank_empty, there may be
@@ -1299,8 +1540,8 @@ static void fn_out_consume(void *priv, char *line, unsigned long len)
 			      len--;
 			}
 			emit_line(ecbdata->opt, context, reset, line, len);
+			return;
 		}
-		return;
 	}
 
 	switch (line[0]) {
@@ -3859,6 +4100,15 @@ int diff_opt_parse(struct diff_options *options, const char **av, int ac)
 		options->word_diff = DIFF_WORDS_COLOR;
 		options->word_regex = arg;
 	}
+	else if (!strcmp(arg, "--highlight-words")) {
+		options->use_color = 1;
+		options->word_diff = DIFF_WORDS_UNIFIED;
+	}
+	else if (skip_prefix(arg, "--highlight-words=", &arg)) {
+		options->use_color = 1;
+		options->word_diff = DIFF_WORDS_UNIFIED;
+		options->word_regex = arg;
+	}
 	else if (!strcmp(arg, "--word-diff")) {
 		if (options->word_diff == DIFF_WORDS_NONE)
 			options->word_diff = DIFF_WORDS_PLAIN;
diff --git a/diff.h b/diff.h
index f7208ad..85c469b 100644
--- a/diff.h
+++ b/diff.h
@@ -107,7 +107,8 @@ enum diff_words_type {
 	DIFF_WORDS_NONE = 0,
 	DIFF_WORDS_PORCELAIN,
 	DIFF_WORDS_PLAIN,
-	DIFF_WORDS_COLOR
+	DIFF_WORDS_COLOR,
+	DIFF_WORDS_UNIFIED
 };
 
 struct diff_options {
-- 
2.3.0.rc1.137.g477eb31

--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html