Changes since RFC v2:
  - Integrate with NLS

Changes since RFC v1:
  - Change error return code from EIO to EINVAL. (Olaf Weber)
  - Fix issues with strncmp/strcmp. (Olaf Weber)
  - Remove stack buffer in normalization/casefold. (Olaf Weber)
  - Include length parameter for second string on comparison functions.
  - Change length type to size_t.

Signed-off-by: Gabriel Krisman Bertazi <krisman@xxxxxxxxxxxxxxx>
---
 fs/nls/Makefile         |   2 +-
 fs/nls/nls_utf8n-core.c | 338 ++++++++++++++++++++++++++++++++++++++++
 fs/nls/nls_utf8n-norm.c |   9 +
 fs/nls/utf8n.h          |   1 +
 4 files changed, 349 insertions(+), 1 deletion(-)
 create mode 100644 fs/nls/nls_utf8n-core.c

diff --git a/fs/nls/Makefile b/fs/nls/Makefile
index 6ff62c0fe436..3650bb58534b 100644
--- a/fs/nls/Makefile
+++ b/fs/nls/Makefile
@@ -56,7 +56,7 @@ obj-$(CONFIG_NLS_MAC_ROMANIAN) += mac-romanian.o
 obj-$(CONFIG_NLS_MAC_ROMAN) += mac-roman.o
 obj-$(CONFIG_NLS_MAC_TURKISH) += mac-turkish.o
 
-nls_utf8n-y += nls_utf8n-norm.o
+nls_utf8n-y += nls_utf8n-norm.o nls_utf8n-core.o
 obj-$(CONFIG_NLS_UTF8_NORMALIZATION) += nls_utf8n.o
 
 $(obj)/nls_utf8n-norm.o: $(obj)/utf8data.h
diff --git a/fs/nls/nls_utf8n-core.c b/fs/nls/nls_utf8n-core.c
new file mode 100644
index 000000000000..56e1dd07047c
--- /dev/null
+++ b/fs/nls/nls_utf8n-core.c
@@ -0,0 +1,338 @@
+/*
+ * Copyright (c) 2017 Collabora Ltd.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/nls.h>
+#include <linux/parser.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include "utf8n.h"
+
+static struct nls_charset utf8norm_info;
+
+/*
+ * Feed both strings through utf8byte() with @data and compare the bytes
+ * it produces.
+ *
+ * Returns 0 when all produced bytes match, 1 on the first mismatch and
+ * -EINVAL when a cursor cannot be set up or utf8byte() reports an error.
+ */
+static int utf8_compare(const struct utf8data *data,
+			const unsigned char *str1, size_t len1,
+			const unsigned char *str2, size_t len2)
+{
+	struct utf8cursor cur1, cur2;
+	int c1, c2;
+
+	if (utf8ncursor(&cur1, data, str1, len1) < 0)
+		return -EINVAL;
+	if (utf8ncursor(&cur2, data, str2, len2) < 0)
+		return -EINVAL;
+
+	do {
+		c1 = utf8byte(&cur1);
+		c2 = utf8byte(&cur2);
+
+		if (c1 < 0 || c2 < 0)
+			return -EINVAL;
+		if (c1 != c2)
+			return 1;
+	} while (c1);
+
+	return 0;
+}
+
+/* Compare two strings using the utf8nfkdi() table of the charset version. */
+static int utf8_strncmp(const struct nls_table *charset,
+			const unsigned char *str1, size_t len1,
+			const unsigned char *str2, size_t len2)
+{
+	return utf8_compare(utf8nfkdi(charset->version),
+			    str1, len1, str2, len2);
+}
+
+/* Same as utf8_strncmp(), but with the utf8nfkdicf() table. */
+static int utf8_strncasecmp(const struct nls_table *charset,
+			    const unsigned char *str1, size_t len1,
+			    const unsigned char *str2, size_t len2)
+{
+	return utf8_compare(utf8nfkdicf(charset->version),
+			    str1, len1, str2, len2);
+}
+
+/*
+ * Run @str through utf8byte() with @data and store the result in a newly
+ * allocated, NUL-terminated buffer.
+ *
+ * On success *@out points to the kmalloc()ed buffer, whose ownership
+ * passes to the caller, and the number of bytes before the terminator is
+ * returned.  On failure *@out is left untouched and -EINVAL (invalid
+ * input) or -ENOMEM is returned.
+ */
+static int utf8_transform(const struct utf8data *data,
+			  const unsigned char *str, size_t len,
+			  unsigned char **out)
+{
+	struct utf8cursor cur;
+	unsigned char *buf;
+	ssize_t nlen;
+	ssize_t i;
+
+	nlen = utf8nlen(data, str, len);
+	if (nlen < 0)
+		return -EINVAL;
+
+	if (utf8ncursor(&cur, data, str, len) < 0)
+		return -EINVAL;
+
+	buf = kmalloc(nlen + 1, GFP_NOFS);
+	if (!buf)
+		return -ENOMEM;
+
+	/* Never write more than the nlen + 1 bytes that were allocated. */
+	for (i = 0; i <= nlen; i++) {
+		int c = utf8byte(&cur);
+
+		if (c < 0)
+			break;
+
+		buf[i] = c;
+		if (!c) {
+			*out = buf;
+			return i;
+		}
+	}
+
+	/* utf8byte() failed or produced more than utf8nlen() announced. */
+	kfree(buf);
+	return -EINVAL;
+}
+
+/* Transform @str with the utf8nfkdicf() table; see utf8_transform(). */
+static int utf8_casefold(const struct nls_table *charset,
+			 const unsigned char *str, size_t len,
+			 unsigned char **folded)
+{
+	return utf8_transform(utf8nfkdicf(charset->version), str, len, folded);
+}
+
+/* Transform @str with the utf8nfkdi() table; see utf8_transform(). */
+static int utf8_normalize(const struct nls_table *charset,
+			  const unsigned char *str,
+			  size_t len, unsigned char **normalization)
+{
+	return utf8_transform(utf8nfkdi(charset->version), str, len,
+			      normalization);
+}
+
+/*
+ * Encode @uni into @out via utf32_to_utf8() and return its result.
+ * Returns -ENAMETOOLONG when @boundlen <= 0.  When utf32_to_utf8() fails,
+ * '?' is stored in the first byte of @out and -EINVAL is returned.
+ */
+static int utf8_uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	int n;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	n = utf32_to_utf8(uni, out, boundlen);
+	if (n < 0) {
+		*out = '?';
+		return -EINVAL;
+	}
+	return n;
+}
+
+/*
+ * Decode @rawstring via utf8_to_utf32() into @uni and return its result.
+ * When decoding fails or the value exceeds MAX_WCHAR_T, '?' is stored in
+ * @uni and -EINVAL is returned.
+ */
+static int utf8_char2uni(const unsigned char *rawstring, int boundlen,
+			 wchar_t *uni)
+{
+	int n;
+	unicode_t u;
+
+	n = utf8_to_utf32(rawstring, boundlen, &u);
+	if (n < 0 || u > MAX_WCHAR_T) {
+		*uni = 0x003f;	/* ? */
+		return -EINVAL;
+	}
+	*uni = (wchar_t) u;
+	return n;
+}
+
+static unsigned char utf8_tolower(const struct nls_table *table,
+				  unsigned int c)
+{
+	return c; /* Identity */
+}
+
+static unsigned char utf8_toupper(const struct nls_table *table,
+				  unsigned int c)
+{
+	return c; /* Identity */
+}
+
+static const struct nls_ops utf8_ops = {
+	.strncmp = utf8_strncmp,
+	.strncasecmp = utf8_strncasecmp,
+	.casefold = utf8_casefold,
+	.normalize = utf8_normalize,
+	.lowercase = utf8_tolower,
+	.uppercase = utf8_toupper,
+	.uni2char = utf8_uni2char,
+	.char2uni = utf8_char2uni,
+};
+
+/*
+ * Parse a "maj.min.rev" version string.
+ *
+ * Each component must fit in a u8 because that is the parameter type of
+ * utf8version_is_supported(); larger values would be silently truncated
+ * there.  Returns 0 on success, -EINVAL on malformed or out-of-range
+ * input and -ENOMEM if the scratch copy cannot be allocated.
+ */
+static int utf8_parse_version(const char *version, unsigned int *maj,
+			      unsigned int *min, unsigned int *rev)
+{
+	static const struct match_token token[] = {
+		{1, "%d.%d.%d"},
+		{0, NULL}
+	};
+	substring_t args[3];
+	int val[3];
+	char *tmp;
+	int ret = -EINVAL;
+	size_t i;
+
+	/* match_token() takes a non-const string, so work on a copy. */
+	tmp = kstrdup(version, GFP_KERNEL);
+	if (!tmp)
+		return -ENOMEM;
+
+	if (match_token(tmp, token, args) != 1)
+		goto out;
+
+	for (i = 0; i < ARRAY_SIZE(val); i++) {
+		if (match_int(&args[i], &val[i]))
+			goto out;
+		if (val[i] < 0 || val[i] > 0xff)
+			goto out;
+	}
+
+	*maj = val[0];
+	*min = val[1];
+	*rev = val[2];
+	ret = 0;
+out:
+	kfree(tmp);
+	return ret;
+}
+
+/*
+ * Return the table for @version ("maj.min.rev"), or for the latest
+ * supported version when @version is NULL.  A table already on the
+ * utf8norm_info.tables list is reused; otherwise a new one is allocated
+ * and linked in.  Returns an ERR_PTR() on failure.
+ *
+ * NOTE(review): the tables list is walked and modified without locking
+ * here; presumably the caller serializes load_table() - confirm.
+ */
+static struct nls_table *utf8_load_charset(const char *version)
+{
+	struct nls_table *tbl;
+	unsigned int nls_version;
+
+	if (version) {
+		unsigned int maj, min, rev;
+		int ret;
+
+		ret = utf8_parse_version(version, &maj, &min, &rev);
+		if (ret < 0)
+			return ERR_PTR(ret);
+
+		if (!utf8version_is_supported(maj, min, rev))
+			return ERR_PTR(-EINVAL);
+
+		nls_version = UNICODE_AGE(maj, min, rev);
+	} else {
+		nls_version = utf8version_latest();
+		pr_warn("utf8norm version not specified. Assuming latest supported version (%u.%u.%u).\n",
+			(nls_version >> 16) & 0xff, (nls_version >> 8) & 0xff,
+			(nls_version & 0xff));
+	}
+
+	/* Try an already loaded table first. */
+	for (tbl = utf8norm_info.tables; tbl; tbl = tbl->next) {
+		if (tbl->version == nls_version)
+			return tbl;
+	}
+
+	/* Zeroed so that any member not assigned below starts out as 0/NULL. */
+	tbl = kzalloc(sizeof(*tbl), GFP_KERNEL);
+	if (!tbl)
+		return ERR_PTR(-ENOMEM);
+
+	tbl->charset = &utf8norm_info;
+	tbl->version = nls_version;
+	tbl->ops = &utf8_ops;
+
+	tbl->next = utf8norm_info.tables;
+	utf8norm_info.tables = tbl;
+
+	return tbl;
+}
+
+/* Free every table on the utf8norm_info.tables list and empty the list. */
+static void utf8_cleanup_tables(void)
+{
+	struct nls_table *tmp, *tbl = utf8norm_info.tables;
+
+	while (tbl) {
+		tmp = tbl;
+		tbl = tbl->next;
+		kfree(tmp);
+	}
+	utf8norm_info.tables = NULL;
+}
+
+static struct nls_charset utf8norm_info = {
+	.charset = "utf8n",
+	.load_table = utf8_load_charset,
+};
+
+static int __init init_utf8(void)
+{
+	/* Propagate a registration failure so the module load fails too. */
+	return register_nls(&utf8norm_info);
+}
+
+static void __exit exit_utf8(void)
+{
+	unregister_nls(&utf8norm_info);
+	utf8_cleanup_tables();
+}
+
+module_init(init_utf8);
+module_exit(exit_utf8);
+MODULE_AUTHOR("SGI, Gabriel Krisman Bertazi");
+MODULE_DESCRIPTION("UTF-8 charset operations for filesystems");
+MODULE_LICENSE("GPL");
diff --git a/fs/nls/nls_utf8n-norm.c b/fs/nls/nls_utf8n-norm.c
index 64c3cc74a2ca..abee8b376a87 100644
--- a/fs/nls/nls_utf8n-norm.c
+++ b/fs/nls/nls_utf8n-norm.c
@@ -38,6 +38,15 @@ int utf8version_is_supported(u8 maj, u8 min, u8 rev)
 }
 EXPORT_SYMBOL(utf8version_is_supported);
 
+/*
+ * Return utf8vers, the latest unicode version supported by the data tables.
+ */
+int utf8version_latest(void)
+{
+	return utf8vers;
+}
+EXPORT_SYMBOL(utf8version_latest);
+
 /*
  * UTF-8 valid ranges.
  *
diff --git a/fs/nls/utf8n.h b/fs/nls/utf8n.h
index f60827663503..b4697f9bfbab 100644
--- a/fs/nls/utf8n.h
+++ b/fs/nls/utf8n.h
@@ -32,6 +32,7 @@
 
 /* Highest unicode version supported by the data tables. */
 extern int utf8version_is_supported(u8 maj, u8 min, u8 rev);
+extern int utf8version_latest(void);
 
 /*
  * Look for the correct const struct utf8data for a unicode version.
-- 
2.17.0