Re: [PATCH] git-blame: Make the output human readable

linux@xxxxxxxxxxx · 6 Mar 2006 14:33:26 -0500

Well, getting 15 characters in UTF-8 is easy (just stop before the 16th
byte for which ((b & 0xc0) != 0x80)), but what about combining characters?

You've got accents and stuff to worry about.  And the annoying fact that
Unicode defined accents as suffixes, so you have to go past the 15th
column to include all of the 

And then there's that fact that many characters are traditionally
represented as double-wide forms, even on character terminals.

See http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c for details
an an example implementation of wcwidth().

Using that, it would be something like (compiles but untested):

/*
 * Return the number of bytes from the nul-terminated utf8 string
 * that can be printed in at most max columns using a monospaced
 * font.  *actual returns the number of columns actually occupied,
 * which may be less than max.
 *
 * Output is truncated before any control characters or illegal
 * UTF-8 sequences.
 */
unsigned
fit_columns(char const *utf8, unsigned max, unsigned *actual)
{
	char const * const origin = utf8;
	unsigned width = 0;
	unsigned pos = 0;
	unsigned c;

	for (;;) {
		unsigned w;
		unsigned c = *utf8++;

		/* Part 1: Parse the next Unicode code point */
		if (c < 0x20) {
			break;	/* Control character - stop */
		} else if (c < 0x7F) {
			w = 1;	/* Standard ASCII */
		} else if (c < 0xC2 || c > 0xF4) {
			break;	/* DEL or illegal Unicode */
		} else {
			/* Multi-byte UTF-8 sequence */
			unsigned n;
			unsigned char byte = *utf8++;

			if (c < 0xE0) {
				/* 2-byte sequence: U+0080..U+07FF */
				n = 1;
				c &= 0x1F;
			} else if (c < 0xF0) {
				/* 3-byte sequence: U+0800..U+FFFF */
				if (c == 0xE0 && byte < 0xA0)
					break;	/* < /U+0800 */
				n = 2;
				c &= 0x0F;
			} else {
				/* 4-byte sequence: U+10000..U+10FFFF */
				if (byte < 0x90 ? c == 0xF0 : c == 0xF4)
					break; /* < 10000 or > 10FFFF */
				n = 3;
				c &= 0x07;
			}

			for (; n--; byte = *utf8++) {
				if (byte & 0xc0 != 0x80)
					goto done;	/* Double break */
				c = (c << 6) | (byte & 0x3f);
			}
			/* Now find the width of it */
			w = wcwidth(c);
			if (w == -1)
				break;
		}

		/* Part 2: Figure out if it will fit */
		if (width + w > max)
			break;	/* Would exceed space - stop */
		/* Part 3: It fits; update our statistics */
		width += w;
		pos = (unsigned)(utf8 - origin);
	}

done:
	if (actual)
		*actual = width;
	return pos;
}
-
: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html