Shreeya Patel <shreeya.patel@xxxxxxxxxxxxx> writes: > utf8data.h_shipped has a large database table which is an auto-generated > decodification trie for the unicode normalization functions. > It is not necessary to carry this large table in the kernel hence make > UTF-8 encoding loadable by converting it into a module. > Also, modify the file called unicode-core which will act as a layer for > unicode subsystem. It will load the UTF-8 module and access it's functions > whenever any filesystem that needs unicode is mounted. > > Signed-off-by: Shreeya Patel <shreeya.patel@xxxxxxxxxxxxx> Hi Shreeya, > --- > fs/unicode/Kconfig | 7 +- > fs/unicode/Makefile | 5 +- > fs/unicode/unicode-core.c | 201 ++++++------------------------- > fs/unicode/utf8-core.c | 112 +++++++++++++++++ > fs/unicode/utf8mod.c | 246 ++++++++++++++++++++++++++++++++++++++ > include/linux/unicode.h | 20 ++++ > 6 files changed, 427 insertions(+), 164 deletions(-) > create mode 100644 fs/unicode/utf8-core.c > create mode 100644 fs/unicode/utf8mod.c > > diff --git a/fs/unicode/Kconfig b/fs/unicode/Kconfig > index 2c27b9a5cd6c..33a27deef729 100644 > --- a/fs/unicode/Kconfig > +++ b/fs/unicode/Kconfig > @@ -8,7 +8,12 @@ config UNICODE > Say Y here to enable UTF-8 NFD normalization and NFD+CF casefolding > support. 
> > +config UNICODE_UTF8 > + tristate "UTF-8 module" > + depends on UNICODE > + default m > + > config UNICODE_NORMALIZATION_SELFTEST > tristate "Test UTF-8 normalization support" > - depends on UNICODE > + depends on UNICODE_UTF8 > default n > diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile > index fbf9a629ed0d..9dbb04194b32 100644 > --- a/fs/unicode/Makefile > +++ b/fs/unicode/Makefile > @@ -1,11 +1,14 @@ > # SPDX-License-Identifier: GPL-2.0 > > obj-$(CONFIG_UNICODE) += unicode.o > +obj-$(CONFIG_UNICODE_UTF8) += utf8.o > obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o > > -unicode-y := utf8-norm.o unicode-core.o > +unicode-y := unicode-core.o > +utf8-y := utf8mod.o utf8-norm.o > > $(obj)/utf8-norm.o: $(obj)/utf8data.h > +$(obj)/utf8mod.o: $(obj)/utf8-norm.o > > # In the normal build, the checked-in utf8data.h is just shipped. > # > diff --git a/fs/unicode/unicode-core.c b/fs/unicode/unicode-core.c > index d5f09e022ac5..b832341f1e7b 100644 > --- a/fs/unicode/unicode-core.c > +++ b/fs/unicode/unicode-core.c > @@ -7,70 +7,29 @@ > #include <linux/errno.h> > #include <linux/unicode.h> > #include <linux/stringhash.h> > +#include <linux/delay.h> > > -#include "utf8n.h" > +struct unicode_ops *utf8_ops; > + > +static int unicode_load_module(void); This is unnecessary > > int unicode_validate(const struct unicode_map *um, const struct qstr *str) > { > - const struct utf8data *data = utf8nfdi(um->version); > - > - if (utf8nlen(data, str->name, str->len) < 0) > - return -1; > - return 0; > + return utf8_ops->validate(um, str); > } > EXPORT_SYMBOL(unicode_validate); > > int unicode_strncmp(const struct unicode_map *um, > const struct qstr *s1, const struct qstr *s2) > { > - const struct utf8data *data = utf8nfdi(um->version); > - struct utf8cursor cur1, cur2; > - int c1, c2; > - > - if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0) > - return -EINVAL; > - > - if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0) > - return -EINVAL; > - > - do { > - c1 
= utf8byte(&cur1); > - c2 = utf8byte(&cur2); > - > - if (c1 < 0 || c2 < 0) > - return -EINVAL; > - if (c1 != c2) > - return 1; > - } while (c1); > - > - return 0; > + return utf8_ops->strncmp(um, s1, s2); > } I think these wrappers should go in a header file and be inlined. > EXPORT_SYMBOL(unicode_strncmp); > > int unicode_strncasecmp(const struct unicode_map *um, > const struct qstr *s1, const struct qstr *s2) > { > - const struct utf8data *data = utf8nfdicf(um->version); > - struct utf8cursor cur1, cur2; > - int c1, c2; > - > - if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0) > - return -EINVAL; > - > - if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0) > - return -EINVAL; > - > - do { > - c1 = utf8byte(&cur1); > - c2 = utf8byte(&cur2); > - > - if (c1 < 0 || c2 < 0) > - return -EINVAL; > - if (c1 != c2) > - return 1; > - } while (c1); > - > - return 0; > + return utf8_ops->strncasecmp(um, s1, s2); > } > EXPORT_SYMBOL(unicode_strncasecmp); > > @@ -81,155 +40,73 @@ int unicode_strncasecmp_folded(const struct unicode_map *um, > const struct qstr *cf, > const struct qstr *s1) > { > - const struct utf8data *data = utf8nfdicf(um->version); > - struct utf8cursor cur1; > - int c1, c2; > - int i = 0; > - > - if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0) > - return -EINVAL; > - > - do { > - c1 = utf8byte(&cur1); > - c2 = cf->name[i++]; > - if (c1 < 0) > - return -EINVAL; > - if (c1 != c2) > - return 1; > - } while (c1); > - > - return 0; > + return utf8_ops->strncasecmp_folded(um, cf, s1); > } > EXPORT_SYMBOL(unicode_strncasecmp_folded); > > int unicode_casefold(const struct unicode_map *um, const struct qstr *str, > unsigned char *dest, size_t dlen) > { > - const struct utf8data *data = utf8nfdicf(um->version); > - struct utf8cursor cur; > - size_t nlen = 0; > - > - if (utf8ncursor(&cur, data, str->name, str->len) < 0) > - return -EINVAL; > - > - for (nlen = 0; nlen < dlen; nlen++) { > - int c = utf8byte(&cur); > - > - dest[nlen] = c; > - if (!c) > - return nlen; > - if
(c == -1) > - break; > - } > - return -EINVAL; > + return utf8_ops->casefold(um, str, dest, dlen); > } > EXPORT_SYMBOL(unicode_casefold); > > int unicode_casefold_hash(const struct unicode_map *um, const void *salt, > struct qstr *str) > { > - const struct utf8data *data = utf8nfdicf(um->version); > - struct utf8cursor cur; > - int c; > - unsigned long hash = init_name_hash(salt); > - > - if (utf8ncursor(&cur, data, str->name, str->len) < 0) > - return -EINVAL; > - > - while ((c = utf8byte(&cur))) { > - if (c < 0) > - return -EINVAL; > - hash = partial_name_hash((unsigned char)c, hash); > - } > - str->hash = end_name_hash(hash); > - return 0; > + return utf8_ops->casefold_hash(um, salt, str); > } > EXPORT_SYMBOL(unicode_casefold_hash); > > int unicode_normalize(const struct unicode_map *um, const struct qstr *str, > unsigned char *dest, size_t dlen) > { > - const struct utf8data *data = utf8nfdi(um->version); > - struct utf8cursor cur; > - ssize_t nlen = 0; > + return utf8_ops->normalize(um, str, dest, dlen); > +} > +EXPORT_SYMBOL(unicode_normalize); > > - if (utf8ncursor(&cur, data, str->name, str->len) < 0) > - return -EINVAL; > +struct unicode_map *unicode_load(const char *version) > +{ > + int ret = unicode_load_module(); > > - for (nlen = 0; nlen < dlen; nlen++) { > - int c = utf8byte(&cur); > + if (ret) > + return ERR_PTR(ret); > > - dest[nlen] = c; > - if (!c) > - return nlen; > - if (c == -1) > - break; > - } > - return -EINVAL; > + else > + return utf8_ops->load(version); > } > -EXPORT_SYMBOL(unicode_normalize); > +EXPORT_SYMBOL(unicode_load); > > -static int unicode_parse_version(const char *version, unsigned int *maj, > - unsigned int *min, unsigned int *rev) > +void unicode_unload(struct unicode_map *um) > { > - substring_t args[3]; > - char version_string[12]; > - static const struct match_token token[] = { > - {1, "%d.%d.%d"}, > - {0, NULL} > - }; > + kfree(um); > +} > +EXPORT_SYMBOL(unicode_unload); > > - strncpy(version_string, version, 
sizeof(version_string)); > +static int unicode_load_module(void) > +{ > + int ret = request_module("utf8"); > > - if (match_token(version_string, token, args) != 1) > - return -EINVAL; > + msleep(100); I think I misunderstood when you mentioned you did this msleep. It was fine as a way to debug the issue you were observing, but it is not a solution. Sleeping for an arbitrary amount of time will either waste time or still fail if things take longer than expected. There are mechanisms to load a module and wait for it. See how fs/nls/nls_base.c does exactly this. > - if (match_int(&args[0], maj) || match_int(&args[1], min) || > - match_int(&args[2], rev)) > - return -EINVAL; > + if (ret) { > + pr_err("Failed to load UTF-8 module\n"); > + return ret; > + } > > return 0; > } > > -struct unicode_map *unicode_load(const char *version) > +void unicode_register(struct unicode_ops *ops) > { > - struct unicode_map *um = NULL; > - int unicode_version; > - > - if (version) { > - unsigned int maj, min, rev; > - > - if (unicode_parse_version(version, &maj, &min, &rev) < 0) > - return ERR_PTR(-EINVAL); > - > - if (!utf8version_is_supported(maj, min, rev)) > - return ERR_PTR(-EINVAL); > - > - unicode_version = UNICODE_AGE(maj, min, rev); > - } else { > - unicode_version = utf8version_latest(); > - printk(KERN_WARNING"UTF-8 version not specified. 
" > - "Assuming latest supported version (%d.%d.%d).", > - (unicode_version >> 16) & 0xff, > - (unicode_version >> 8) & 0xff, > - (unicode_version & 0xff)); > - } > - > - um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL); > - if (!um) > - return ERR_PTR(-ENOMEM); > - > - um->charset = "UTF-8"; > - um->version = unicode_version; > - > - return um; > + utf8_ops = ops; > } > -EXPORT_SYMBOL(unicode_load); > +EXPORT_SYMBOL(unicode_register); > > -void unicode_unload(struct unicode_map *um) > +void unicode_unregister(void) > { > - kfree(um); > + utf8_ops = NULL; > } > -EXPORT_SYMBOL(unicode_unload); > +EXPORT_SYMBOL(unicode_unregister); > > MODULE_LICENSE("GPL v2"); > diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c > new file mode 100644 > index 000000000000..009faa68330c > --- /dev/null > +++ b/fs/unicode/utf8-core.c > @@ -0,0 +1,112 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +#include <linux/module.h> > +#include <linux/kernel.h> > +#include <linux/string.h> > +#include <linux/slab.h> > +#include <linux/parser.h> > +#include <linux/errno.h> > +#include <linux/unicode.h> > +#include <linux/stringhash.h> > +#include <linux/delay.h> > + > +struct unicode_ops *utf8_ops; > + > +static int unicode_load_module(void); > + > +int unicode_validate(const struct unicode_map *um, const struct qstr *str) > +{ > + return utf8_ops->validate(um, str); > +} > +EXPORT_SYMBOL(unicode_validate); > + > +int unicode_strncmp(const struct unicode_map *um, > + const struct qstr *s1, const struct qstr *s2) > +{ > + return utf8_ops->strncmp(um, s1, s2); > +} > +EXPORT_SYMBOL(unicode_strncmp); I'm confused now. Isn't this redefining unicode_strncmp? It was defined in unicode-core.c in the hunk above, and now it is redefined in utf8-core.c. There is something odd here. 
> + > +int unicode_strncasecmp(const struct unicode_map *um, > + const struct qstr *s1, const struct qstr *s2) > +{ > + return utf8_ops->strncasecmp(um, s1, s2); > +} > +EXPORT_SYMBOL(unicode_strncasecmp); > + > +/* String cf is expected to be a valid UTF-8 casefolded > + * string. > + */ > +int unicode_strncasecmp_folded(const struct unicode_map *um, > + const struct qstr *cf, > + const struct qstr *s1) > +{ > + return utf8_ops->strncasecmp_folded(um, cf, s1); > +} > +EXPORT_SYMBOL(unicode_strncasecmp_folded); > + > +int unicode_casefold(const struct unicode_map *um, const struct qstr *str, > + unsigned char *dest, size_t dlen) > +{ > + return utf8_ops->casefold(um, str, dest, dlen); > +} > +EXPORT_SYMBOL(unicode_casefold); > + > +int unicode_casefold_hash(const struct unicode_map *um, const void *salt, > + struct qstr *str) > +{ > + return utf8_ops->casefold_hash(um, salt, str); > +} > +EXPORT_SYMBOL(unicode_casefold_hash); > + > +int unicode_normalize(const struct unicode_map *um, const struct qstr *str, > + unsigned char *dest, size_t dlen) > +{ > + return utf8_ops->normalize(um, str, dest, dlen); > +} > +EXPORT_SYMBOL(unicode_normalize); > + > +struct unicode_map *unicode_load(const char *version) > +{ > + int ret = unicode_load_module(); > + > + if (ret) > + return ERR_PTR(ret); > + > + else > + return utf8_ops->load(version); > +} > +EXPORT_SYMBOL(unicode_load); > + > +void unicode_unload(struct unicode_map *um) > +{ > + kfree(um); > +} > +EXPORT_SYMBOL(unicode_unload); > + > +void unicode_register(struct unicode_ops *ops) > +{ > + utf8_ops = ops; > +} > +EXPORT_SYMBOL(unicode_register); > + > +void unicode_unregister(void) > +{ > + utf8_ops = NULL; > +} > +EXPORT_SYMBOL(unicode_unregister); > + > +static int unicode_load_module(void) > +{ > + int ret = request_module("utf8"); > + > + msleep(100); > + > + if (ret) { > + pr_err("Failed to load UTF-8 module\n"); > + return ret; > + } > + > + return 0; > +} > + > +MODULE_LICENSE("GPL v2"); > diff --git 
a/fs/unicode/utf8mod.c b/fs/unicode/utf8mod.c > new file mode 100644 > index 000000000000..8eaeeb27255c > --- /dev/null > +++ b/fs/unicode/utf8mod.c > @@ -0,0 +1,246 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +#include <linux/module.h> > +#include <linux/kernel.h> > +#include <linux/string.h> > +#include <linux/slab.h> > +#include <linux/parser.h> > +#include <linux/errno.h> > +#include <linux/unicode.h> > +#include <linux/stringhash.h> > + > +#include "utf8n.h" > + > +static int utf8_validate(const struct unicode_map *um, const struct qstr *str) > +{ > + const struct utf8data *data = utf8nfdi(um->version); > + > + if (utf8nlen(data, str->name, str->len) < 0) > + return -1; > + return 0; > +} > + > +static int utf8_strncmp(const struct unicode_map *um, > + const struct qstr *s1, const struct qstr *s2) > +{ > + const struct utf8data *data = utf8nfdi(um->version); > + struct utf8cursor cur1, cur2; > + int c1, c2; > + > + if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0) > + return -EINVAL; > + > + if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0) > + return -EINVAL; > + > + do { > + c1 = utf8byte(&cur1); > + c2 = utf8byte(&cur2); > + > + if (c1 < 0 || c2 < 0) > + return -EINVAL; > + if (c1 != c2) > + return 1; > + } while (c1); > + > + return 0; > +} > + > +static int utf8_strncasecmp(const struct unicode_map *um, > + const struct qstr *s1, const struct qstr *s2) > +{ > + const struct utf8data *data = utf8nfdicf(um->version); > + struct utf8cursor cur1, cur2; > + int c1, c2; > + > + if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0) > + return -EINVAL; > + > + if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0) > + return -EINVAL; > + > + do { > + c1 = utf8byte(&cur1); > + c2 = utf8byte(&cur2); > + > + if (c1 < 0 || c2 < 0) > + return -EINVAL; > + if (c1 != c2) > + return 1; > + } while (c1); > + > + return 0; > +} > + > +/* String cf is expected to be a valid UTF-8 casefolded > + * string. 
> + */ > +static int utf8_strncasecmp_folded(const struct unicode_map *um, > + const struct qstr *cf, > + const struct qstr *s1) > +{ > + const struct utf8data *data = utf8nfdicf(um->version); > + struct utf8cursor cur1; > + int c1, c2; > + int i = 0; > + > + if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0) > + return -EINVAL; > + > + do { > + c1 = utf8byte(&cur1); > + c2 = cf->name[i++]; > + if (c1 < 0) > + return -EINVAL; > + if (c1 != c2) > + return 1; > + } while (c1); > + > + return 0; > +} > + > +static int utf8_casefold(const struct unicode_map *um, const struct qstr *str, > + unsigned char *dest, size_t dlen) > +{ > + const struct utf8data *data = utf8nfdicf(um->version); > + struct utf8cursor cur; > + size_t nlen = 0; > + > + if (utf8ncursor(&cur, data, str->name, str->len) < 0) > + return -EINVAL; > + > + for (nlen = 0; nlen < dlen; nlen++) { > + int c = utf8byte(&cur); > + > + dest[nlen] = c; > + if (!c) > + return nlen; > + if (c == -1) > + break; > + } > + return -EINVAL; > +} > + > +static int utf8_casefold_hash(const struct unicode_map *um, const void *salt, > + struct qstr *str) > +{ > + const struct utf8data *data = utf8nfdicf(um->version); > + struct utf8cursor cur; > + int c; > + unsigned long hash = init_name_hash(salt); > + > + if (utf8ncursor(&cur, data, str->name, str->len) < 0) > + return -EINVAL; > + > + while ((c = utf8byte(&cur))) { > + if (c < 0) > + return -EINVAL; > + hash = partial_name_hash((unsigned char)c, hash); > + } > + str->hash = end_name_hash(hash); > + return 0; > +} > + > +static int utf8_normalize(const struct unicode_map *um, const struct qstr *str, > + unsigned char *dest, size_t dlen) > +{ > + const struct utf8data *data = utf8nfdi(um->version); > + struct utf8cursor cur; > + ssize_t nlen = 0; > + > + if (utf8ncursor(&cur, data, str->name, str->len) < 0) > + return -EINVAL; > + > + for (nlen = 0; nlen < dlen; nlen++) { > + int c = utf8byte(&cur); > + > + dest[nlen] = c; > + if (!c) > + return nlen; > + if (c == -1) 
> + break; > + } > + return -EINVAL; > +} > + > +static int utf8_parse_version(const char *version, unsigned int *maj, > + unsigned int *min, unsigned int *rev) > +{ > + substring_t args[3]; > + char version_string[12]; > + static const struct match_token token[] = { > + {1, "%d.%d.%d"}, > + {0, NULL} > + }; > + > + strncpy(version_string, version, sizeof(version_string)); > + > + if (match_token(version_string, token, args) != 1) > + return -EINVAL; > + > + if (match_int(&args[0], maj) || match_int(&args[1], min) || > + match_int(&args[2], rev)) > + return -EINVAL; > + > + return 0; > +} > + > +static struct unicode_map *utf8_load(const char *version) > +{ > + struct unicode_map *um = NULL; > + int unicode_version; > + > + if (version) { > + unsigned int maj, min, rev; > + > + if (utf8_parse_version(version, &maj, &min, &rev) < 0) > + return ERR_PTR(-EINVAL); > + > + if (!utf8version_is_supported(maj, min, rev)) > + return ERR_PTR(-EINVAL); > + > + unicode_version = UNICODE_AGE(maj, min, rev); > + } else { > + unicode_version = utf8version_latest(); > + printk(KERN_WARNING"UTF-8 version not specified. 
" > + "Assuming latest supported version (%d.%d.%d).", > + (unicode_version >> 16) & 0xff, > + (unicode_version >> 8) & 0xff, > + (unicode_version & 0xff)); > + } > + > + um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL); > + if (!um) > + return ERR_PTR(-ENOMEM); > + > + um->charset = "UTF-8"; > + um->version = unicode_version; > + > + return um; > +} > + > +static struct unicode_ops ops = { > + .validate = utf8_validate, > + .strncmp = utf8_strncmp, > + .strncasecmp = utf8_strncasecmp, > + .strncasecmp_folded = utf8_strncasecmp_folded, > + .casefold = utf8_casefold, > + .casefold_hash = utf8_casefold_hash, > + .normalize = utf8_normalize, > + .load = utf8_load, > +}; > + > +static int __init utf8_init(void) > +{ > + unicode_register(&ops); > + return 0; > +} > + > +static void __exit utf8_exit(void) > +{ > + unicode_unregister(); > +} > + > +module_init(utf8_init); > +module_exit(utf8_exit); > + > +MODULE_LICENSE("GPL v2"); > diff --git a/include/linux/unicode.h b/include/linux/unicode.h > index de23f9ee720b..b0d59069e438 100644 > --- a/include/linux/unicode.h > +++ b/include/linux/unicode.h > @@ -10,6 +10,23 @@ struct unicode_map { > int version; > }; > > +struct unicode_ops { > + int (*validate)(const struct unicode_map *um, const struct qstr *str); > + int (*strncmp)(const struct unicode_map *um, const struct qstr *s1, > + const struct qstr *s2); > + int (*strncasecmp)(const struct unicode_map *um, const struct qstr *s1, > + const struct qstr *s2); > + int (*strncasecmp_folded)(const struct unicode_map *um, const struct qstr *cf, > + const struct qstr *s1); > + int (*normalize)(const struct unicode_map *um, const struct qstr *str, > + unsigned char *dest, size_t dlen); > + int (*casefold)(const struct unicode_map *um, const struct qstr *str, > + unsigned char *dest, size_t dlen); > + int (*casefold_hash)(const struct unicode_map *um, const void *salt, > + struct qstr *str); > + struct unicode_map* (*load)(const char *version); > +}; Also, make sure you run 
checkpatch.pl on the patch series before submitting. > + > int unicode_validate(const struct unicode_map *um, const struct qstr *str); > > int unicode_strncmp(const struct unicode_map *um, > @@ -33,4 +50,7 @@ int unicode_casefold_hash(const struct unicode_map *um, const void *salt, > struct unicode_map *unicode_load(const char *version); > void unicode_unload(struct unicode_map *um); > > +void unicode_register(struct unicode_ops *ops); > +void unicode_unregister(void); > + > #endif /* _LINUX_UNICODE_H */ -- Gabriel Krisman Bertazi