On 14/03/21 7:19 am, Gabriel Krisman Bertazi wrote:
Shreeya Patel <shreeya.patel@xxxxxxxxxxxxx> writes:utf8data.h_shipped has a large database table which is an auto-generated decodification trie for the unicode normalization functions. It is not necessary to carry this large table in the kernel hence make UTF-8 encoding loadable by converting it into a module. Also, modify the file called unicode-core which will act as a layer for unicode subsystem. It will load the UTF-8 module and access it's functions whenever any filesystem that needs unicode is mounted. Signed-off-by: Shreeya Patel <shreeya.patel@xxxxxxxxxxxxx>Hi Shreeya,
Hi Gabriel,
sorry, I think I messed up patches while using git send-email and that is why you might see two copies of the last patch. Let me resend the series and then it might make sense. One question--- fs/unicode/Kconfig | 7 +- fs/unicode/Makefile | 5 +- fs/unicode/unicode-core.c | 201 ++++++------------------------- fs/unicode/utf8-core.c | 112 +++++++++++++++++ fs/unicode/utf8mod.c | 246 ++++++++++++++++++++++++++++++++++++++ include/linux/unicode.h | 20 ++++ 6 files changed, 427 insertions(+), 164 deletions(-) create mode 100644 fs/unicode/utf8-core.c create mode 100644 fs/unicode/utf8mod.c diff --git a/fs/unicode/Kconfig b/fs/unicode/Kconfig index 2c27b9a5cd6c..33a27deef729 100644 --- a/fs/unicode/Kconfig +++ b/fs/unicode/Kconfig @@ -8,7 +8,12 @@ config UNICODE Say Y here to enable UTF-8 NFD normalization and NFD+CF casefolding support.+config UNICODE_UTF8+ tristate "UTF-8 module" + depends on UNICODE + default m + config UNICODE_NORMALIZATION_SELFTEST tristate "Test UTF-8 normalization support" - depends on UNICODE + depends on UNICODE_UTF8 default n diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile index fbf9a629ed0d..9dbb04194b32 100644 --- a/fs/unicode/Makefile +++ b/fs/unicode/Makefile @@ -1,11 +1,14 @@ # SPDX-License-Identifier: GPL-2.0obj-$(CONFIG_UNICODE) += unicode.o+obj-$(CONFIG_UNICODE_UTF8) += utf8.o obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o-unicode-y := utf8-norm.o unicode-core.o+unicode-y := unicode-core.o +utf8-y := utf8mod.o utf8-norm.o$(obj)/utf8-norm.o: $(obj)/utf8data.h+$(obj)/utf8mod.o: $(obj)/utf8-norm.o# In the normal build, the checked-in utf8data.h is just shipped.# diff --git a/fs/unicode/unicode-core.c b/fs/unicode/unicode-core.c index d5f09e022ac5..b832341f1e7b 100644 --- a/fs/unicode/unicode-core.c +++ b/fs/unicode/unicode-core.c @@ -7,70 +7,29 @@ #include <linux/errno.h> #include <linux/unicode.h> #include <linux/stringhash.h> +#include <linux/delay.h>-#include "utf8n.h"+struct unicode_ops *utf8_ops; + +static int unicode_load_module(void);This is unnecessaryint unicode_validate(const struct unicode_map *um, const struct qstr *str){ - const struct utf8data *data = utf8nfdi(um->version); - - if (utf8nlen(data, str->name, str->len) < 0) - return -1; - return 0; + return utf8_ops->validate(um, str); } EXPORT_SYMBOL(unicode_validate);int unicode_strncmp(const struct unicode_map *um,const struct qstr *s1, const struct qstr *s2) { - const struct utf8data *data = utf8nfdi(um->version); - struct utf8cursor cur1, cur2; - int c1, c2; - - if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0) - return -EINVAL; - - if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0) - return -EINVAL; - - do { - c1 = utf8byte(&cur1); - c2 = utf8byte(&cur2); - - if (c1 < 0 || c2 < 0) - return -EINVAL; - if (c1 != c2) - return 1; - } while (c1); - - return 0; + return utf8_ops->strncmp(um, s1, s2); }I think these would go on a header file and inlined.EXPORT_SYMBOL(unicode_strncmp);int unicode_strncasecmp(const struct unicode_map *um,const struct qstr *s1, const struct qstr *s2) { - const struct utf8data *data = utf8nfdicf(um->version); - struct utf8cursor cur1, cur2; - int c1, c2; - - if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0) - return -EINVAL; - - if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0) - return -EINVAL; - - do { - c1 = utf8byte(&cur1); - c2 = utf8byte(&cur2); - - if (c1 < 0 || c2 < 0) - return -EINVAL; - if (c1 != c2) - return 1; - } while (c1); - - return 0; + return utf8_ops->strncasecmp(um, s1, s2); } EXPORT_SYMBOL(unicode_strncasecmp);@@ -81,155 +40,73 @@ int unicode_strncasecmp_folded(const struct unicode_map *um,const struct qstr *cf, const struct qstr *s1) { - const struct utf8data *data = utf8nfdicf(um->version); - struct utf8cursor cur1; - int c1, c2; - int i = 0; - - if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0) - return -EINVAL; - - do { - c1 = utf8byte(&cur1); - c2 = cf->name[i++]; - if (c1 < 0) - return -EINVAL; - if (c1 != c2) - return 1; - } while (c1); - - return 0; + return utf8_ops->strncasecmp_folded(um, cf, s1); } EXPORT_SYMBOL(unicode_strncasecmp_folded);int unicode_casefold(const struct unicode_map *um, const struct qstr *str,unsigned char *dest, size_t dlen) { - const struct utf8data *data = utf8nfdicf(um->version); - struct utf8cursor cur; - size_t nlen = 0; - - if (utf8ncursor(&cur, data, str->name, str->len) < 0) - return -EINVAL; - - for (nlen = 0; nlen < dlen; nlen++) { - int c = utf8byte(&cur); - - dest[nlen] = c; - if (!c) - return nlen; - if (c == -1) - break; - } - return -EINVAL; + return utf8_ops->casefold(um, str, dest, dlen); } EXPORT_SYMBOL(unicode_casefold);int unicode_casefold_hash(const struct unicode_map *um, const void *salt,struct qstr *str) { - const struct utf8data *data = utf8nfdicf(um->version); - struct utf8cursor cur; - int c; - unsigned long hash = init_name_hash(salt); - - if (utf8ncursor(&cur, data, str->name, str->len) < 0) - return -EINVAL; - - while ((c = utf8byte(&cur))) { - if (c < 0) - return -EINVAL; - hash = partial_name_hash((unsigned char)c, hash); - } - str->hash = end_name_hash(hash); - return 0; + return utf8_ops->casefold_hash(um, salt, str); } EXPORT_SYMBOL(unicode_casefold_hash);int unicode_normalize(const struct unicode_map *um, const struct qstr *str,unsigned char *dest, size_t dlen) { - const struct utf8data *data = utf8nfdi(um->version); - struct utf8cursor cur; - ssize_t nlen = 0; + return utf8_ops->normalize(um, str, dest, dlen); +} +EXPORT_SYMBOL(unicode_normalize);- if (utf8ncursor(&cur, data, str->name, str->len) < 0)- return -EINVAL; +struct unicode_map *unicode_load(const char *version) +{ + int ret = unicode_load_module();- for (nlen = 0; nlen < dlen; nlen++) {- int c = utf8byte(&cur); + if (ret) + return ERR_PTR(ret);- dest[nlen] = c;- if (!c) - return nlen; - if (c == -1) - break; - } - return -EINVAL; + else + return utf8_ops->load(version); } -EXPORT_SYMBOL(unicode_normalize); +EXPORT_SYMBOL(unicode_load);-static int unicode_parse_version(const char *version, unsigned int *maj,- unsigned int *min, unsigned int *rev) +void unicode_unload(struct unicode_map *um) { - substring_t args[3]; - char version_string[12]; - static const struct match_token token[] = { - {1, "%d.%d.%d"}, - {0, NULL} - }; + kfree(um); +} +EXPORT_SYMBOL(unicode_unload);- strncpy(version_string, version, sizeof(version_string));+static int unicode_load_module(void) +{ + int ret = request_module("utf8");- if (match_token(version_string, token, args) != 1)- return -EINVAL; + msleep(100);I think I misunderstood when you mentioned you did this msleep. It was ok to debug the issue you were observing, but it is not a solution. Setting an arbitrary amount of time will either waste time, or you can still fail if things take longer than expected. There are mechanisms to load and wait on a module. See how fs/nls/nls_base.c do exactly this.- if (match_int(&args[0], maj) || match_int(&args[1], min) || - match_int(&args[2], rev)) - return -EINVAL; + if (ret) { + pr_err("Failed to load UTF-8 module\n"); + return ret; + }return 0;}-struct unicode_map *unicode_load(const char *version)+void unicode_register(struct unicode_ops *ops) { - struct unicode_map *um = NULL; - int unicode_version; - - if (version) { - unsigned int maj, min, rev; - - if (unicode_parse_version(version, &maj, &min, &rev) < 0) - return ERR_PTR(-EINVAL); - - if (!utf8version_is_supported(maj, min, rev)) - return ERR_PTR(-EINVAL); - - unicode_version = UNICODE_AGE(maj, min, rev); - } else { - unicode_version = utf8version_latest(); - printk(KERN_WARNING"UTF-8 version not specified. " - "Assuming latest supported version (%d.%d.%d).", - (unicode_version >> 16) & 0xff, - (unicode_version >> 8) & 0xff, - (unicode_version & 0xff)); - } - - um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL); - if (!um) - return ERR_PTR(-ENOMEM); - - um->charset = "UTF-8"; - um->version = unicode_version; - - return um; + utf8_ops = ops; } -EXPORT_SYMBOL(unicode_load); +EXPORT_SYMBOL(unicode_register);-void unicode_unload(struct unicode_map *um)+void unicode_unregister(void) { - kfree(um); + utf8_ops = NULL; } -EXPORT_SYMBOL(unicode_unload); +EXPORT_SYMBOL(unicode_unregister);MODULE_LICENSE("GPL v2");diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c new file mode 100644 index 000000000000..009faa68330c --- /dev/null +++ b/fs/unicode/utf8-core.c @@ -0,0 +1,112 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/parser.h> +#include <linux/errno.h> +#include <linux/unicode.h> +#include <linux/stringhash.h> +#include <linux/delay.h> + +struct unicode_ops *utf8_ops; + +static int unicode_load_module(void); + +int unicode_validate(const struct unicode_map *um, const struct qstr *str) +{ + return utf8_ops->validate(um, str); +} +EXPORT_SYMBOL(unicode_validate); + +int unicode_strncmp(const struct unicode_map *um, + const struct qstr *s1, const struct qstr *s2) +{ + return utf8_ops->strncmp(um, s1, s2); +} +EXPORT_SYMBOL(unicode_strncmp);I'm confused now. Isn't this redefining unicode_strncmp ? It was defined in unicode_core.c on the hunk above and now it is redefined on utf8_core.c. There is something odd here.
though, why would unicode_strncmp go into the header file?
I ran checkpatch.pl over the patch, but it seems there were some previously existing warnings+ +int unicode_strncasecmp(const struct unicode_map *um, + const struct qstr *s1, const struct qstr *s2) +{ + return utf8_ops->strncasecmp(um, s1, s2); +} +EXPORT_SYMBOL(unicode_strncasecmp); + +/* String cf is expected to be a valid UTF-8 casefolded + * string. + */ +int unicode_strncasecmp_folded(const struct unicode_map *um, + const struct qstr *cf, + const struct qstr *s1) +{ + return utf8_ops->strncasecmp_folded(um, cf, s1); +} +EXPORT_SYMBOL(unicode_strncasecmp_folded); + +int unicode_casefold(const struct unicode_map *um, const struct qstr *str, + unsigned char *dest, size_t dlen) +{ + return utf8_ops->casefold(um, str, dest, dlen); +} +EXPORT_SYMBOL(unicode_casefold); + +int unicode_casefold_hash(const struct unicode_map *um, const void *salt, + struct qstr *str) +{ + return utf8_ops->casefold_hash(um, salt, str); +} +EXPORT_SYMBOL(unicode_casefold_hash); + +int unicode_normalize(const struct unicode_map *um, const struct qstr *str, + unsigned char *dest, size_t dlen) +{ + return utf8_ops->normalize(um, str, dest, dlen); +} +EXPORT_SYMBOL(unicode_normalize); + +struct unicode_map *unicode_load(const char *version) +{ + int ret = unicode_load_module(); + + if (ret) + return ERR_PTR(ret); + + else + return utf8_ops->load(version); +} +EXPORT_SYMBOL(unicode_load); + +void unicode_unload(struct unicode_map *um) +{ + kfree(um); +} +EXPORT_SYMBOL(unicode_unload); + +void unicode_register(struct unicode_ops *ops) +{ + utf8_ops = ops; +} +EXPORT_SYMBOL(unicode_register); + +void unicode_unregister(void) +{ + utf8_ops = NULL; +} +EXPORT_SYMBOL(unicode_unregister); + +static int unicode_load_module(void) +{ + int ret = request_module("utf8"); + + msleep(100); + + if (ret) { + pr_err("Failed to load UTF-8 module\n"); + return ret; + } + + return 0; +} + +MODULE_LICENSE("GPL v2"); diff --git a/fs/unicode/utf8mod.c b/fs/unicode/utf8mod.c new file mode 100644 index 000000000000..8eaeeb27255c --- /dev/null +++ b/fs/unicode/utf8mod.c @@ -0,0 +1,246 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/parser.h> +#include <linux/errno.h> +#include <linux/unicode.h> +#include <linux/stringhash.h> + +#include "utf8n.h" + +static int utf8_validate(const struct unicode_map *um, const struct qstr *str) +{ + const struct utf8data *data = utf8nfdi(um->version); + + if (utf8nlen(data, str->name, str->len) < 0) + return -1; + return 0; +} + +static int utf8_strncmp(const struct unicode_map *um, + const struct qstr *s1, const struct qstr *s2) +{ + const struct utf8data *data = utf8nfdi(um->version); + struct utf8cursor cur1, cur2; + int c1, c2; + + if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0) + return -EINVAL; + + if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0) + return -EINVAL; + + do { + c1 = utf8byte(&cur1); + c2 = utf8byte(&cur2); + + if (c1 < 0 || c2 < 0) + return -EINVAL; + if (c1 != c2) + return 1; + } while (c1); + + return 0; +} + +static int utf8_strncasecmp(const struct unicode_map *um, + const struct qstr *s1, const struct qstr *s2) +{ + const struct utf8data *data = utf8nfdicf(um->version); + struct utf8cursor cur1, cur2; + int c1, c2; + + if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0) + return -EINVAL; + + if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0) + return -EINVAL; + + do { + c1 = utf8byte(&cur1); + c2 = utf8byte(&cur2); + + if (c1 < 0 || c2 < 0) + return -EINVAL; + if (c1 != c2) + return 1; + } while (c1); + + return 0; +} + +/* String cf is expected to be a valid UTF-8 casefolded + * string. + */ +static int utf8_strncasecmp_folded(const struct unicode_map *um, + const struct qstr *cf, + const struct qstr *s1) +{ + const struct utf8data *data = utf8nfdicf(um->version); + struct utf8cursor cur1; + int c1, c2; + int i = 0; + + if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0) + return -EINVAL; + + do { + c1 = utf8byte(&cur1); + c2 = cf->name[i++]; + if (c1 < 0) + return -EINVAL; + if (c1 != c2) + return 1; + } while (c1); + + return 0; +} + +static int utf8_casefold(const struct unicode_map *um, const struct qstr *str, + unsigned char *dest, size_t dlen) +{ + const struct utf8data *data = utf8nfdicf(um->version); + struct utf8cursor cur; + size_t nlen = 0; + + if (utf8ncursor(&cur, data, str->name, str->len) < 0) + return -EINVAL; + + for (nlen = 0; nlen < dlen; nlen++) { + int c = utf8byte(&cur); + + dest[nlen] = c; + if (!c) + return nlen; + if (c == -1) + break; + } + return -EINVAL; +} + +static int utf8_casefold_hash(const struct unicode_map *um, const void *salt, + struct qstr *str) +{ + const struct utf8data *data = utf8nfdicf(um->version); + struct utf8cursor cur; + int c; + unsigned long hash = init_name_hash(salt); + + if (utf8ncursor(&cur, data, str->name, str->len) < 0) + return -EINVAL; + + while ((c = utf8byte(&cur))) { + if (c < 0) + return -EINVAL; + hash = partial_name_hash((unsigned char)c, hash); + } + str->hash = end_name_hash(hash); + return 0; +} + +static int utf8_normalize(const struct unicode_map *um, const struct qstr *str, + unsigned char *dest, size_t dlen) +{ + const struct utf8data *data = utf8nfdi(um->version); + struct utf8cursor cur; + ssize_t nlen = 0; + + if (utf8ncursor(&cur, data, str->name, str->len) < 0) + return -EINVAL; + + for (nlen = 0; nlen < dlen; nlen++) { + int c = utf8byte(&cur); + + dest[nlen] = c; + if (!c) + return nlen; + if (c == -1) + break; + } + return -EINVAL; +} + +static int utf8_parse_version(const char *version, unsigned int *maj, + unsigned int *min, unsigned int *rev) +{ + substring_t args[3]; + char version_string[12]; + static const struct match_token token[] = { + {1, "%d.%d.%d"}, + {0, NULL} + }; + + strncpy(version_string, version, sizeof(version_string)); + + if (match_token(version_string, token, args) != 1) + return -EINVAL; + + if (match_int(&args[0], maj) || match_int(&args[1], min) || + match_int(&args[2], rev)) + return -EINVAL; + + return 0; +} + +static struct unicode_map *utf8_load(const char *version) +{ + struct unicode_map *um = NULL; + int unicode_version; + + if (version) { + unsigned int maj, min, rev; + + if (utf8_parse_version(version, &maj, &min, &rev) < 0) + return ERR_PTR(-EINVAL); + + if (!utf8version_is_supported(maj, min, rev)) + return ERR_PTR(-EINVAL); + + unicode_version = UNICODE_AGE(maj, min, rev); + } else { + unicode_version = utf8version_latest(); + printk(KERN_WARNING"UTF-8 version not specified. " + "Assuming latest supported version (%d.%d.%d).", + (unicode_version >> 16) & 0xff, + (unicode_version >> 8) & 0xff, + (unicode_version & 0xff)); + } + + um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL); + if (!um) + return ERR_PTR(-ENOMEM); + + um->charset = "UTF-8"; + um->version = unicode_version; + + return um; +} + +static struct unicode_ops ops = { + .validate = utf8_validate, + .strncmp = utf8_strncmp, + .strncasecmp = utf8_strncasecmp, + .strncasecmp_folded = utf8_strncasecmp_folded, + .casefold = utf8_casefold, + .casefold_hash = utf8_casefold_hash, + .normalize = utf8_normalize, + .load = utf8_load, +}; + +static int __init utf8_init(void) +{ + unicode_register(&ops); + return 0; +} + +static void __exit utf8_exit(void) +{ + unicode_unregister(); +} + +module_init(utf8_init); +module_exit(utf8_exit); + +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/unicode.h b/include/linux/unicode.h index de23f9ee720b..b0d59069e438 100644 --- a/include/linux/unicode.h +++ b/include/linux/unicode.h @@ -10,6 +10,23 @@ struct unicode_map { int version; };+struct unicode_ops {+ int (*validate)(const struct unicode_map *um, const struct qstr *str); + int (*strncmp)(const struct unicode_map *um, const struct qstr *s1, + const struct qstr *s2); + int (*strncasecmp)(const struct unicode_map *um, const struct qstr *s1, + const struct qstr *s2); + int (*strncasecmp_folded)(const struct unicode_map *um, const struct qstr *cf, + const struct qstr *s1); + int (*normalize)(const struct unicode_map *um, const struct qstr *str, + unsigned char *dest, size_t dlen); + int (*casefold)(const struct unicode_map *um, const struct qstr *str, + unsigned char *dest, size_t dlen); + int (*casefold_hash)(const struct unicode_map *um, const void *salt, + struct qstr *str); + struct unicode_map* (*load)(const char *version); +};Also, make sure you run checkpatch.pl on the patch series before submitting.
which are not introduced due to any change made in this patch series.I am not sure if I am supposed to resolve those warnings in this patch series.
+ int unicode_validate(const struct unicode_map *um, const struct qstr *str);int unicode_strncmp(const struct unicode_map *um,@@ -33,4 +50,7 @@ int unicode_casefold_hash(const struct unicode_map *um, const void *salt, struct unicode_map *unicode_load(const char *version); void unicode_unload(struct unicode_map *um);+void unicode_register(struct unicode_ops *ops);+void unicode_unregister(void); + #endif /* _LINUX_UNICODE_H */