Re: NLS: utf8 conversions

Alan Stern <stern@xxxxxxxxxxxxxxxxxxx> · Mon, 27 Apr 2009 11:51:21 -0400 (EDT)

On Mon, 27 Apr 2009, Clemens Ladisch wrote:

> The single-character utf8_* routines in nls_base.c are just special
> cases of the NLS API for the UTF-8 encoding; the string-oriented
> routines, as far as I can see, are actually only used to do conversions
> between UTF-8 and UTF-16, not wchar_t, so they probably should be
> renamed.
> 
> As for the NLS API itself: If we want to be able to handle code points
> larger than U+FFFF, the obvious answer is to make wchar_t a 32-bit type.
> This should not be too large a problem because the FS NLS API is
> designed so that wchar_t is only used for temporary values, i.e.,
> characters are converted from some on-disk encoding to wchar_t, then
> from wchar_t to some I/O encoding (usually UTF-8); and the conversions
> are done one code point at a time.
> 
> The file systems that use some form of UTF-16 (VFAT, NTFS, CIFS, UDF,
> etc.) use the NLS API in a different way: they treat the individual
> UTF-16 values as wchar_t values and do only the conversion from wchar_t
> to the I/O encoding.  Here we'd need to introduce an additional
> conversion step between UTF-16 and wchar_t, i.e., treat UTF-16 like any
> other multibyte encoding.

Your comments agree pretty well with what I had concluded.  However a 
lot of the source files have lengthy tables of wchar_t values; changing 
them to 32 bits would waste a lot of space.

As a sort of compromise, I came up with this patch (not tested yet,
although it compiles okay).  How does it look to you?

Alan Stern



Index: usb-2.6/include/linux/nls.h
===================================================================

--- usb-2.6.orig/include/linux/nls.h
+++ usb-2.6/include/linux/nls.h
@@ -3,8 +3,23 @@
 
 #include <linux/init.h>
 
-/* unicode character */
-typedef __u16 wchar_t;
+/* Unicode has changed over the years.  Unicode code points no longer
+ * fit into 16 bits; as of Unicode 5 valid code points range from 0
+ * to 0x10ffff (17 planes, where each plane holds 65536 code points).
+ *
+ * The original decision to represent Unicode characters as 16-bit
+ * wchar_t values is now outdated.  But plane 0 still includes the
+ * most commonly used characters, so we will retain it.  The newer
+ * 32-bit unicode_t type can be used when it is necessary to
+ * represent the full Unicode character set.
+ */
+
+/* Plane-0 Unicode character */
+typedef u16 wchar_t;
+#define MAX_WCHAR_T	0xffff
+
+/* Arbitrary Unicode character */
+typedef u32 unicode_t;
 
 struct nls_table {
 	const char *charset;
@@ -21,6 +36,13 @@ struct nls_table {
 /* this value hold the maximum octet of charset */
 #define NLS_MAX_CHARSET_SIZE 6 /* for UTF-8 */
 
+/* Byte order for UTF-16 strings */
+enum utf16_endian {
+	UTF16_HOST_ENDIAN,
+	UTF16_LITTLE_ENDIAN,
+	UTF16_BIG_ENDIAN
+};
+
 /* nls.c */
 extern int register_nls(struct nls_table *);
 extern int unregister_nls(struct nls_table *);
@@ -28,10 +50,10 @@ extern struct nls_table *load_nls(char *
 extern void unload_nls(struct nls_table *);
 extern struct nls_table *load_nls_default(void);
 
-extern int utf8_mbtowc(wchar_t *, const __u8 *, int);
-extern int utf8_mbstowcs(wchar_t *, const __u8 *, int);
-extern int utf8_wctomb(__u8 *, wchar_t, int);
-extern int utf8_wcstombs(__u8 *, const wchar_t *, int);
+extern int utf8_to_utf32(unicode_t *, const u8 *, int);
+extern int utf32_to_utf8(u8 *, unicode_t, int);
+extern int utf8s_to_utf16s(wchar_t *, const u8 *, int);
+extern int utf16s_to_utf8s(u8 *, const wchar_t *, int, int, enum utf16_endian);
 
 static inline unsigned char nls_tolower(struct nls_table *t, unsigned char c)
 {
Index: usb-2.6/fs/nls/nls_base.c
===================================================================
--- usb-2.6.orig/fs/nls/nls_base.c
+++ usb-2.6/fs/nls/nls_base.c
@@ -15,6 +15,7 @@
 #include <linux/errno.h>
 #include <linux/kmod.h>
 #include <linux/spinlock.h>
+#include <asm/byteorder.h>
 
 static struct nls_table default_table;
 static struct nls_table *tables = &default_table;
@@ -43,10 +44,18 @@ static const struct utf8_table utf8_tabl
     {0,						       /* end of table    */}
 };
 
+#define UNICODE_MAX	0x0010ffff
+#define PLANE_SIZE	0x00010000
+
+#define SURROGATE_MASK	0xfffff800
+#define SURROGATE_PAIR	0x0000d800
+#define SURROGATE_LOW	0x00000400
+#define SURROGATE_BITS	0x000003ff
+
 int
-utf8_mbtowc(wchar_t *p, const __u8 *s, int n)
+utf8_to_utf32(unicode_t *p, const u8 *s, int n)
 {
-	long l;
+	unsigned long l;
 	int c0, c, nc;
 	const struct utf8_table *t;
   
@@ -57,9 +66,10 @@ utf8_mbtowc(wchar_t *p, const __u8 *s, i
 		nc++;
 		if ((c0 & t->cmask) == t->cval) {
 			l &= t->lmask;
-			if (l < t->lval)
+			if (l < t->lval || l > UNICODE_MAX ||
+					(l & SURROGATE_MASK) == SURROGATE_PAIR)
 				return -1;
-			*p = l;
+			*p = (unicode_t) l;
 			return nc;
 		}
 		if (n <= nc)
@@ -72,76 +82,122 @@ utf8_mbtowc(wchar_t *p, const __u8 *s, i
 	}
 	return -1;
 }
+EXPORT_SYMBOL(utf8_to_utf32);
 
 int
-utf8_mbstowcs(wchar_t *pwcs, const __u8 *s, int n)
+utf32_to_utf8(u8 *s, unicode_t u, int maxlen)
 {
-	__u16 *op;
-	const __u8 *ip;
-	int size;
-
-	op = pwcs;
-	ip = s;
-	while (*ip && n > 0) {
-		if (*ip & 0x80) {
-			size = utf8_mbtowc(op, ip, n);
-			if (size == -1) {
-				/* Ignore character and move on */
-				ip++;
-				n--;
-			} else {
-				op++;
-				ip += size;
-				n -= size;
-			}
-		} else {
-			*op++ = *ip++;
-			n--;
-		}
-	}
-	return (op - pwcs);
-}
-
-int
-utf8_wctomb(__u8 *s, wchar_t wc, int maxlen)
-{
-	long l;
+	unsigned long l;
 	int c, nc;
 	const struct utf8_table *t;
-  
+
 	if (!s)
 		return 0;
-  
-	l = wc;
+
+	l = u;
+	if (l > UNICODE_MAX || (l & SURROGATE_MASK) == SURROGATE_PAIR)
+		return -1;
+
 	nc = 0;
 	for (t = utf8_table; t->cmask && maxlen; t++, maxlen--) {
 		nc++;
 		if (l <= t->lmask) {
 			c = t->shift;
-			*s = t->cval | (l >> c);
+			*s = (u8) (t->cval | (l >> c));
 			while (c > 0) {
 				c -= 6;
 				s++;
-				*s = 0x80 | ((l >> c) & 0x3F);
+				*s = (u8) (0x80 | ((l >> c) & 0x3F));
 			}
 			return nc;
 		}
 	}
 	return -1;
 }
+EXPORT_SYMBOL(utf32_to_utf8);
 
 int
-utf8_wcstombs(__u8 *s, const wchar_t *pwcs, int maxlen)
+utf8s_to_utf16s(wchar_t *pwcs, const u8 *s, int n)
 {
-	const __u16 *ip;
-	__u8 *op;
+	u16 *op;
 	int size;
+	unicode_t u;
+
+	op = pwcs;
+	while (*s && n > 0) {
+		if (*s & 0x80) {
+			size = utf8_to_utf32(&u, s, n);
+			if (size < 0) {
+				/* Ignore character and move on */
+				size = 1;
+			} else if (u >= PLANE_SIZE) {
+				u -= PLANE_SIZE;
+				*op++ = (wchar_t) (SURROGATE_PAIR |
+						((u >> 10) & SURROGATE_BITS));
+				*op++ = (wchar_t) (SURROGATE_PAIR |
+						SURROGATE_LOW |
+						(u & SURROGATE_BITS));
+			} else {
+				*op++ = (wchar_t) u;
+			}
+			s += size;
+			n -= size;
+		} else {
+			*op++ = *s++;
+			n--;
+		}
+	}
+	return op - pwcs;
+}
+EXPORT_SYMBOL(utf8s_to_utf16s);
+
+static inline unsigned long get_utf16(unsigned c, enum utf16_endian endian)
+{
+	switch (endian) {
+	default:
+		return c;
+	case UTF16_LITTLE_ENDIAN:
+		return __le16_to_cpu(c);
+	case UTF16_BIG_ENDIAN:
+		return __be16_to_cpu(c);
+	}
+}
+
+int
+utf16s_to_utf8s(u8 *s, const wchar_t *pwcs, int maxlen, int inlen,
+		enum utf16_endian endian)
+{
+	u8 *op;
+	int size;
+	unsigned long u, v;
 
 	op = s;
-	ip = pwcs;
-	while (*ip && maxlen > 0) {
-		if (*ip > 0x7f) {
-			size = utf8_wctomb(op, *ip, maxlen);
+	while (inlen > 0 && maxlen > 0) {
+		u = get_utf16(*pwcs, endian);
+		if (!u)
+			break;
+		pwcs++;
+		inlen--;
+		if (u > 0x7f) {
+			if ((u & SURROGATE_MASK) == SURROGATE_PAIR) {
+				if (u & SURROGATE_LOW) {
+					/* Ignore character and move on */
+					continue;
+				}
+				if (inlen <= 0)
+					break;
+				v = get_utf16(*pwcs, endian);
+				if ((v & SURROGATE_MASK) != SURROGATE_PAIR ||
+						!(v & SURROGATE_LOW)) {
+					/* Ignore character and move on */
+					continue;
+				}
+				u = PLANE_SIZE + ((u & SURROGATE_BITS) << 10)
+						+ (v & SURROGATE_BITS);
+				pwcs++;
+				inlen--;
+			}
+			size = utf32_to_utf8(op, u, maxlen);
 			if (size == -1) {
 				/* Ignore character and move on */
 			} else {
@@ -149,13 +205,13 @@ utf8_wcstombs(__u8 *s, const wchar_t *pw
 				maxlen -= size;
 			}
 		} else {
-			*op++ = (__u8) *ip;
+			*op++ = (u8) u;
 			maxlen--;
 		}
-		ip++;
 	}
-	return (op - s);
+	return op - s;
 }
+EXPORT_SYMBOL(utf16s_to_utf8s);
 
 int register_nls(struct nls_table * nls)
 {
@@ -467,9 +523,5 @@ EXPORT_SYMBOL(unregister_nls);
 EXPORT_SYMBOL(unload_nls);
 EXPORT_SYMBOL(load_nls);
 EXPORT_SYMBOL(load_nls_default);
-EXPORT_SYMBOL(utf8_mbtowc);
-EXPORT_SYMBOL(utf8_mbstowcs);
-EXPORT_SYMBOL(utf8_wctomb);
-EXPORT_SYMBOL(utf8_wcstombs);
 
 MODULE_LICENSE("Dual BSD/GPL");
Index: usb-2.6/fs/nls/nls_utf8.c
===================================================================
--- usb-2.6.orig/fs/nls/nls_utf8.c
+++ usb-2.6/fs/nls/nls_utf8.c
@@ -15,7 +15,11 @@ static int uni2char(wchar_t uni, unsigne
 {
 	int n;
 
-	if ( (n = utf8_wctomb(out, uni, boundlen)) == -1) {
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	n = utf32_to_utf8(out, uni, boundlen);
+	if (n < 0) {
 		*out = '?';
 		return -EINVAL;
 	}
@@ -25,11 +29,14 @@ static int uni2char(wchar_t uni, unsigne
 static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
 {
 	int n;
+	unicode_t u;
 
-	if ( (n = utf8_mbtowc(uni, rawstring, boundlen)) == -1) {
+	n = utf8_to_utf32(&u, rawstring, boundlen);
+	if (n < 0 || u > MAX_WCHAR_T) {
 		*uni = 0x003f;	/* ? */
-		n = -EINVAL;
+		return -EINVAL;
 	}
+	*uni = (wchar_t) u;
 	return n;
 }
 
Index: usb-2.6/fs/befs/linuxvfs.c
===================================================================
--- usb-2.6.orig/fs/befs/linuxvfs.c
+++ usb-2.6/fs/befs/linuxvfs.c
@@ -513,7 +513,7 @@ befs_utf2nls(struct super_block *sb, con
 {
 	struct nls_table *nls = BEFS_SB(sb)->nls;
 	int i, o;
-	wchar_t uni;
+	unicode_t uni;
 	int unilen, utflen;
 	char *result;
 	/* The utf8->nls conversion won't make the final nls string bigger
@@ -539,16 +539,16 @@ befs_utf2nls(struct super_block *sb, con
 	for (i = o = 0; i < in_len; i += utflen, o += unilen) {
 
 		/* convert from UTF-8 to Unicode */
-		utflen = utf8_mbtowc(&uni, &in[i], in_len - i);
-		if (utflen < 0) {
+		utflen = utf8_to_utf32(&uni, &in[i], in_len - i);
+		if (utflen < 0)
 			goto conv_err;
-		}
 
 		/* convert from Unicode to nls */
+		if (uni > MAX_WCHAR_T)
+			goto conv_err;
 		unilen = nls->uni2char(uni, &result[o], in_len - o);
-		if (unilen < 0) {
+		if (unilen < 0)
 			goto conv_err;
-		}
 	}
 	result[o] = '\0';
 	*out_len = o;
@@ -619,15 +619,13 @@ befs_nls2utf(struct super_block *sb, con
 
 		/* convert from nls to unicode */
 		unilen = nls->char2uni(&in[i], in_len - i, &uni);
-		if (unilen < 0) {
+		if (unilen < 0)
 			goto conv_err;
-		}
 
 		/* convert from unicode to UTF-8 */
-		utflen = utf8_wctomb(&result[o], uni, 3);
-		if (utflen <= 0) {
+		utflen = utf32_to_utf8(&result[o], uni, 3);
+		if (utflen <= 0)
 			goto conv_err;
-		}
 	}
 
 	result[o] = '\0';
Index: usb-2.6/fs/fat/dir.c
===================================================================
--- usb-2.6.orig/fs/fat/dir.c
+++ usb-2.6/fs/fat/dir.c
@@ -22,6 +22,19 @@
 #include <asm/uaccess.h>
 #include "fat.h"
 
+/*
+ * Maximum buffer size of short name.
+ * [(MSDOS_NAME + '.') * max one char + nul]
+ * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul]
+ */
+#define FAT_MAX_SHORT_SIZE	((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1)
+/*
+ * Maximum buffer size of unicode chars from slots.
+ * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)]
+ */
+#define FAT_MAX_UNI_CHARS	((MSDOS_SLOTS - 1) * 13 + 1)
+#define FAT_MAX_UNI_SIZE	(FAT_MAX_UNI_CHARS * sizeof(wchar_t))
+
 static inline loff_t fat_make_i_pos(struct super_block *sb,
 				    struct buffer_head *bh,
 				    struct msdos_dir_entry *de)
@@ -171,7 +184,8 @@ static inline int fat_uni_to_x8(struct m
 				unsigned char *buf, int size)
 {
 	if (sbi->options.utf8)
-		return utf8_wcstombs(buf, uni, size);
+		return utf16s_to_utf8s(buf, uni, size, FAT_MAX_UNI_CHARS,
+				UTF16_HOST_ENDIAN);
 	else
 		return uni16_to_x8(buf, uni, size, sbi->options.unicode_xlate,
 				   sbi->nls_io);
@@ -325,19 +339,6 @@ parse_long:
 }
 
 /*
- * Maximum buffer size of short name.
- * [(MSDOS_NAME + '.') * max one char + nul]
- * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul]
- */
-#define FAT_MAX_SHORT_SIZE	((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1)
-/*
- * Maximum buffer size of unicode chars from slots.
- * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)]
- */
-#define FAT_MAX_UNI_CHARS	((MSDOS_SLOTS - 1) * 13 + 1)
-#define FAT_MAX_UNI_SIZE	(FAT_MAX_UNI_CHARS * sizeof(wchar_t))
-
-/*
  * Return values: negative -> error, 0 -> not found, positive -> found,
  * value is the total amount of slots, including the shortname entry.
  */
Index: usb-2.6/fs/fat/namei_vfat.c
===================================================================
--- usb-2.6.orig/fs/fat/namei_vfat.c
+++ usb-2.6/fs/fat/namei_vfat.c
@@ -502,11 +502,11 @@ xlate_to_uni(const unsigned char *name, 
 	if (utf8) {
 		int name_len = strlen(name);
 
-		*outlen = utf8_mbstowcs((wchar_t *)outname, name, PATH_MAX);
+		*outlen = utf8s_to_utf16s((wchar_t *)outname, name, PATH_MAX);
 
 		/*
 		 * We stripped '.'s before and set len appropriately,
-		 * but utf8_mbstowcs doesn't care about len
+		 * but utf8s_to_utf16s doesn't care about len
 		 */
 		*outlen -= (name_len - len);
 
Index: usb-2.6/fs/isofs/joliet.c
===================================================================
--- usb-2.6.orig/fs/isofs/joliet.c
+++ usb-2.6/fs/isofs/joliet.c
@@ -37,37 +37,6 @@ uni16_to_x8(unsigned char *ascii, __be16
 	return (op - ascii);
 }
 
-/* Convert big endian wide character string to utf8 */
-static int
-wcsntombs_be(__u8 *s, const __u8 *pwcs, int inlen, int maxlen)
-{
-	const __u8 *ip;
-	__u8 *op;
-	int size;
-	__u16 c;
-
-	op = s;
-	ip = pwcs;
-	while ((*ip || ip[1]) && (maxlen > 0) && (inlen > 0)) {
-		c = (*ip << 8) | ip[1];
-		if (c > 0x7f) {
-			size = utf8_wctomb(op, c, maxlen);
-			if (size == -1) {
-				/* Ignore character and move on */
-				maxlen--;
-			} else {
-				op += size;
-				maxlen -= size;
-			}
-		} else {
-			*op++ = (__u8) c;
-		}
-		ip += 2;
-		inlen--;
-	}
-	return (op - s);
-}
-
 int
 get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, struct inode * inode)
 {
@@ -79,8 +48,9 @@ get_joliet_filename(struct iso_directory
 	nls = ISOFS_SB(inode->i_sb)->s_nls_iocharset;
 
 	if (utf8) {
-		len = wcsntombs_be(outname, de->name,
-				de->name_len[0] >> 1, PAGE_SIZE);
+		len = utf16s_to_utf8s(outname, (const wchar_t *) de->name,
+				PAGE_SIZE, de->name_len[0] >> 1,
+				UTF16_BIG_ENDIAN);
 	} else {
 		len = uni16_to_x8(outname, (__be16 *) de->name,
 				de->name_len[0] >> 1, nls);
Index: usb-2.6/fs/ncpfs/ncplib_kernel.c
===================================================================
--- usb-2.6.orig/fs/ncpfs/ncplib_kernel.c
+++ usb-2.6/fs/ncpfs/ncplib_kernel.c
@@ -1113,11 +1113,13 @@ ncp__io2vol(struct ncp_server *server, u
 
 		if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) {
 			int k;
+			unicode_t u;
 
-			k = utf8_mbtowc(&ec, iname, iname_end - iname);
-			if (k < 0)
+			k = utf8_to_utf32(&u, iname, iname_end - iname);
+			if (k < 0 || u > MAX_WCHAR_T)
 				return -EINVAL;
 			iname += k;
+			ec = u;
 		} else {
 			if (*iname == NCP_ESC) {
 				int k;
@@ -1214,7 +1216,7 @@ ncp__vol2io(struct ncp_server *server, u
 		if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) {
 			int k;
 
-			k = utf8_wctomb(iname, ec, iname_end - iname);
+			k = utf32_to_utf8(iname, ec, iname_end - iname);
 			if (k < 0) {
 				err = -ENAMETOOLONG;
 				goto quit;
Index: usb-2.6/drivers/usb/core/message.c
===================================================================
--- usb-2.6.orig/drivers/usb/core/message.c
+++ usb-2.6/drivers/usb/core/message.c
@@ -780,14 +780,13 @@ int usb_string(struct usb_device *dev, i
 {
 	unsigned char *tbuf;
 	int err;
-	unsigned int u;
 
 	if (dev->state == USB_STATE_SUSPENDED)
 		return -EHOSTUNREACH;
 	if (size <= 0 || !buf || !index)
 		return -EINVAL;
 	buf[0] = 0;
-	tbuf = kmalloc(256 + 2, GFP_NOIO);
+	tbuf = kmalloc(256, GFP_NOIO);
 	if (!tbuf)
 		return -ENOMEM;
 
@@ -814,12 +813,9 @@ int usb_string(struct usb_device *dev, i
 	if (err < 0)
 		goto errout;
 
-	for (u = 2; u < err; u += 2)
-		le16_to_cpus((u16 *)&tbuf[u]);
-	tbuf[u] = 0;
-	tbuf[u + 1] = 0;
 	size--;		/* leave room for trailing NULL char in output buffer */
-	err = utf8_wcstombs(buf, (u16 *)&tbuf[2], size);
+	err = utf16s_to_utf8s(buf, (wchar_t *) &tbuf[2], size, (err - 2) / 2,
+			UTF16_LITTLE_ENDIAN);
 	buf[err] = 0;
 
 	if (tbuf[1] != USB_DT_STRING)

--
To unsubscribe from this list: send the line "unsubscribe linux-usb" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html