Christoph Hellwig <hch@xxxxxx> writes: > utf8data.h contains a large database table which is an auto-generated > decodification trie for the unicode normalization functions. > > Allow building it into a separate module. > > Based on a patch from Shreeya Patel <shreeya.patel@xxxxxxxxxxxxx>. > > Signed-off-by: Christoph Hellwig <hch@xxxxxx> > --- > fs/unicode/Kconfig | 13 ++++- > fs/unicode/Makefile | 13 ++--- > fs/unicode/mkutf8data.c | 24 ++++++++-- > fs/unicode/utf8-core.c | 35 +++++++++++--- > fs/unicode/utf8-norm.c | 48 ++++--------------- > fs/unicode/utf8-selftest.c | 16 +++---- > ...{utf8data.h_shipped => utf8data.c_shipped} | 22 +++++++-- > fs/unicode/utf8n.h | 40 ++++++++-------- > include/linux/unicode.h | 2 + > 9 files changed, 123 insertions(+), 90 deletions(-) > rename fs/unicode/{utf8data.h_shipped => utf8data.c_shipped} (99%) > > diff --git a/fs/unicode/Kconfig b/fs/unicode/Kconfig > index 2c27b9a5cd6ce..610d7bc05d6e3 100644 > --- a/fs/unicode/Kconfig > +++ b/fs/unicode/Kconfig > @@ -8,7 +8,16 @@ config UNICODE > Say Y here to enable UTF-8 NFD normalization and NFD+CF casefolding > support. > > +config UNICODE_UTF8_DATA > + tristate "UTF-8 normalization and casefolding tables" > + depends on UNICODE > + default UNICODE > + help > + This contains a large table of case foldings, which can be loaded as > + a separate module if you say M here. To be on the safe side stick > + to the default of Y. Saying N here makes no sense, if you do not want > + utf8 casefolding support, disable CONFIG_UNICODE instead. 
> + > config UNICODE_NORMALIZATION_SELFTEST > tristate "Test UTF-8 normalization support" > - depends on UNICODE > - default n > + depends on UNICODE_UTF8_DATA > diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile > index b88aecc865502..2f9d9188852b5 100644 > --- a/fs/unicode/Makefile > +++ b/fs/unicode/Makefile > @@ -2,14 +2,15 @@ > > obj-$(CONFIG_UNICODE) += unicode.o > obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o > +obj-$(CONFIG_UNICODE_UTF8_DATA) += utf8data.o > > unicode-y := utf8-norm.o utf8-core.o > > -$(obj)/utf8-norm.o: $(obj)/utf8data.h > +$(obj)/utf8-data.o: $(obj)/utf8data.c > > -# In the normal build, the checked-in utf8data.h is just shipped. > +# In the normal build, the checked-in utf8data.c is just shipped. > # > -# To generate utf8data.h from UCD, put *.txt files in this directory > +# To generate utf8data.c from UCD, put *.txt files in this directory > # and pass REGENERATE_UTF8DATA=1 from the command line. > ifdef REGENERATE_UTF8DATA > > @@ -24,15 +25,15 @@ quiet_cmd_utf8data = GEN $@ > -t $(srctree)/$(src)/NormalizationTest.txt \ > -o $@ > > -$(obj)/utf8data.h: $(obj)/mkutf8data $(filter %.txt, $(cmd_utf8data)) FORCE > +$(obj)/utf8data.c: $(obj)/mkutf8data $(filter %.txt, $(cmd_utf8data)) FORCE > $(call if_changed,utf8data) > > else > > -$(obj)/utf8data.h: $(src)/utf8data.h_shipped FORCE > +$(obj)/utf8data.c: $(src)/utf8data.c_shipped FORCE > $(call if_changed,shipped) > > endif > > -targets += utf8data.h > +targets += utf8data.c > hostprogs += mkutf8data > diff --git a/fs/unicode/mkutf8data.c b/fs/unicode/mkutf8data.c > index ff2025ac5a325..bc1a7c8b5c8df 100644 > --- a/fs/unicode/mkutf8data.c > +++ b/fs/unicode/mkutf8data.c > @@ -3287,12 +3287,10 @@ static void write_file(void) > open_fail(utf8_name, errno); > > fprintf(file, "/* This file is generated code, do not edit. 
*/\n"); > - fprintf(file, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n"); > - fprintf(file, "#error Only nls_utf8-norm.c should include this file.\n"); > - fprintf(file, "#endif\n"); > fprintf(file, "\n"); > - fprintf(file, "static const unsigned int utf8vers = %#x;\n", > - unicode_maxage); > + fprintf(file, "#include <linux/module.h>\n"); > + fprintf(file, "#include <linux/kernel.h>\n"); > + fprintf(file, "#include \"utf8n.h\"\n"); > fprintf(file, "\n"); > fprintf(file, "static const unsigned int utf8agetab[] = {\n"); > for (i = 0; i != ages_count; i++) > @@ -3339,6 +3337,22 @@ static void write_file(void) > fprintf(file, "\n"); > } > fprintf(file, "};\n"); > + fprintf(file, "\n"); > + fprintf(file, "struct utf8data_table utf8_data_table = {\n"); > + fprintf(file, "\t.utf8agetab = utf8agetab,\n"); > + fprintf(file, "\t.utf8agetab_size = ARRAY_SIZE(utf8agetab),\n"); > + fprintf(file, "\n"); > + fprintf(file, "\t.utf8nfdicfdata = utf8nfdicfdata,\n"); > + fprintf(file, "\t.utf8nfdicfdata_size = ARRAY_SIZE(utf8nfdicfdata),\n"); > + fprintf(file, "\n"); > + fprintf(file, "\t.utf8nfdidata = utf8nfdidata,\n"); > + fprintf(file, "\t.utf8nfdidata_size = ARRAY_SIZE(utf8nfdidata),\n"); > + fprintf(file, "\n"); > + fprintf(file, "\t.utf8data = utf8data,\n"); > + fprintf(file, "};\n"); > + fprintf(file, "EXPORT_SYMBOL_GPL(utf8_data_table);"); > + fprintf(file, "\n"); > + fprintf(file, "MODULE_LICENSE(\"GPL v2\");\n"); > fclose(file); > } > > diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c > index d9f713d38c0ad..38ca824f10158 100644 > --- a/fs/unicode/utf8-core.c > +++ b/fs/unicode/utf8-core.c > @@ -160,25 +160,45 @@ int utf8_normalize(const struct unicode_map *um, const struct qstr *str, > } > EXPORT_SYMBOL(utf8_normalize); > > +static const struct utf8data *find_table_version(const struct utf8data *table, > + size_t nr_entries, unsigned int version) > +{ > + size_t i = nr_entries - 1; > + > + while (version < table[i].maxage) > + i--; > + if (version > 
table[i].maxage) > + return NULL; > + return &table[i]; > +} > + > struct unicode_map *utf8_load(unsigned int version) > { > struct unicode_map *um; > > - if (!utf8version_is_supported(version)) > - return ERR_PTR(-EINVAL); > - > um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL); > if (!um) > return ERR_PTR(-ENOMEM); > um->version = version; > - um->ntab[UTF8_NFDI] = utf8nfdi(version); > - if (!um->ntab[UTF8_NFDI]) > + > + um->tables = symbol_request(utf8_data_table); > + if (!um->tables) > goto out_free_um; > - um->ntab[UTF8_NFDICF] = utf8nfdicf(version); > + > + if (!utf8version_is_supported(um, version)) > + goto out_symbol_put; > + um->ntab[UTF8_NFDI] = find_table_version(um->tables->utf8nfdidata, > + um->tables->utf8nfdidata_size, um->version); > + if (!um->ntab[UTF8_NFDI]) > + goto out_symbol_put; > + um->ntab[UTF8_NFDICF] = find_table_version(um->tables->utf8nfdicfdata, > + um->tables->utf8nfdicfdata_size, um->version); > if (!um->ntab[UTF8_NFDICF]) > - goto out_free_um; > + goto out_symbol_put; > return um; > > +out_symbol_put: > + symbol_put(um->tables); > out_free_um: > kfree(um); > return ERR_PTR(-EINVAL); > @@ -187,6 +207,7 @@ EXPORT_SYMBOL(utf8_load); > > void utf8_unload(struct unicode_map *um) > { > + symbol_put(utf8_data_table); This triggers a BUG_ON if the symbol isn't loaded or loadable, e.g. when ext4_fill_super fails early and calls utf8_unload(). I'm not sure how to fix it, though. The resulting log: Failed to find symbol utf8_data_table ------------[ cut here ]------------ kernel BUG at kernel/module.c:1022! 
invalid opcode: 0000 [#1] SMP CPU: 1 PID: 387 Comm: mount Not tainted 5.15.0-rc4-for-next_5.15 #5 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 RIP: 0010:__symbol_put+0x88/0x90 Code: 84 c0 74 26 48 8b 7c 24 10 e8 44 f9 ff ff 65 ff 0d 1d 44 ea 7e 48 8b 44 24 30 65 48 33 04 25 28 00 00 00 75 07 48 83 c4 38 c3 <0f> 0b e8 51 ca a9 00 90 0f 1f 44 00 00 48 63 46 04 48 8d 74 RSP: 0018:ffffc90000623cc0 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff888102e91490 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffff88813b9d7860 RDI: ffff88813b9d7868 RBP: ffffc90000623de0 R08: 0000000000000000 R09: c0000000ffffefff R10: ffffc900006239d8 R11: ffffc900006239d0 R12: 00000000ffffffea R13: 0000000000000000 R14: ffff888102e94000 R15: ffff888102e91000 FS: 00007efcab508800(0000) GS:ffff88813b800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ff08eec56f4 CR3: 0000000102f31000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ext4_fill_super+0x289/0x32b0 ? bdev_name.isra.7+0x53/0xd0 ? vsnprintf+0x379/0x520 ? ext4_enable_quotas+0x260/0x260 ? mount_bdev+0x18a/0x1c0 ? ext4_enable_quotas+0x260/0x260 mount_bdev+0x18a/0x1c0 legacy_get_tree+0x30/0x50 vfs_get_tree+0x23/0x90 ? ns_capable_common+0x2b/0x50 path_mount+0x6da/0xa50 ? 
kmem_cache_free+0xf4/0x140 do_mount+0x75/0x90 __x64_sys_mount+0xc4/0xe0 do_syscall_64+0x3a/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7efcab71f6ba Code: 48 8b 0d b1 f7 0b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 7e f7 0b 00 f7 d8 64 89 RSP: 002b:00007ffefb824338 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5 RAX: ffffffffffffffda RBX: 00007efcab873264 RCX: 00007efcab71f6ba RDX: 000055a2867dad10 RSI: 000055a2867d40f0 RDI: 000055a2867d40d0 RBP: 000055a2867d3ea0 R08: 0000000000000000 R09: 000055a2867d3010 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 000055a2867d40d0 R14: 000055a2867dad10 R15: 000055a2867d3ea0 Modules linked in: ---[ end trace abcd43d820168730 ]--- > kfree(um); > } > EXPORT_SYMBOL(utf8_unload); > diff --git a/fs/unicode/utf8-norm.c b/fs/unicode/utf8-norm.c > index 7c1f28ab31a80..829c7e2ad764a 100644 > --- a/fs/unicode/utf8-norm.c > +++ b/fs/unicode/utf8-norm.c > @@ -6,21 +6,12 @@ > > #include "utf8n.h" > > -struct utf8data { > - unsigned int maxage; > - unsigned int offset; > -}; > - > -#define __INCLUDED_FROM_UTF8NORM_C__ > -#include "utf8data.h" > -#undef __INCLUDED_FROM_UTF8NORM_C__ > - > -int utf8version_is_supported(unsigned int version) > +int utf8version_is_supported(const struct unicode_map *um, unsigned int version) > { > - int i = ARRAY_SIZE(utf8agetab) - 1; > + int i = um->tables->utf8agetab_size - 1; > > - while (i >= 0 && utf8agetab[i] != 0) { > - if (version == utf8agetab[i]) > + while (i >= 0 && um->tables->utf8agetab[i] != 0) { > + if (version == um->tables->utf8agetab[i]) > return 1; > i--; > } > @@ -161,7 +152,7 @@ typedef const unsigned char utf8trie_t; > * underlying datatype: unsigned char. > * > * leaf[0]: The unicode version, stored as a generation number that is > - * an index into utf8agetab[]. With this we can filter code > + * an index into ->utf8agetab[]. 
With this we can filter code > * points based on the unicode version in which they were > * defined. The CCC of a non-defined code point is 0. > * leaf[1]: Canonical Combining Class. During normalization, we need > @@ -313,7 +304,7 @@ static utf8leaf_t *utf8nlookup(const struct unicode_map *um, > enum utf8_normalization n, unsigned char *hangul, const char *s, > size_t len) > { > - utf8trie_t *trie = utf8data + um->ntab[n]->offset; > + utf8trie_t *trie = um->tables->utf8data + um->ntab[n]->offset; > int offlen; > int offset; > int mask; > @@ -404,7 +395,8 @@ ssize_t utf8nlen(const struct unicode_map *um, enum utf8_normalization n, > leaf = utf8nlookup(um, n, hangul, s, len); > if (!leaf) > return -1; > - if (utf8agetab[LEAF_GEN(leaf)] > um->ntab[n]->maxage) > + if (um->tables->utf8agetab[LEAF_GEN(leaf)] > > + um->ntab[n]->maxage) > ret += utf8clen(s); > else if (LEAF_CCC(leaf) == DECOMPOSE) > ret += strlen(LEAF_STR(leaf)); > @@ -520,7 +512,7 @@ int utf8byte(struct utf8cursor *u8c) > > ccc = LEAF_CCC(leaf); > /* Characters that are too new have CCC 0. 
*/ > - if (utf8agetab[LEAF_GEN(leaf)] > > + if (u8c->um->tables->utf8agetab[LEAF_GEN(leaf)] > > u8c->um->ntab[u8c->n]->maxage) { > ccc = STOPPER; > } else if (ccc == DECOMPOSE) { > @@ -597,25 +589,3 @@ int utf8byte(struct utf8cursor *u8c) > } > } > EXPORT_SYMBOL(utf8byte); > - > -const struct utf8data *utf8nfdi(unsigned int maxage) > -{ > - int i = ARRAY_SIZE(utf8nfdidata) - 1; > - > - while (maxage < utf8nfdidata[i].maxage) > - i--; > - if (maxage > utf8nfdidata[i].maxage) > - return NULL; > - return &utf8nfdidata[i]; > -} > - > -const struct utf8data *utf8nfdicf(unsigned int maxage) > -{ > - int i = ARRAY_SIZE(utf8nfdicfdata) - 1; > - > - while (maxage < utf8nfdicfdata[i].maxage) > - i--; > - if (maxage > utf8nfdicfdata[i].maxage) > - return NULL; > - return &utf8nfdicfdata[i]; > -} > diff --git a/fs/unicode/utf8-selftest.c b/fs/unicode/utf8-selftest.c > index cfa3832b75f42..eb2bbdd688d71 100644 > --- a/fs/unicode/utf8-selftest.c > +++ b/fs/unicode/utf8-selftest.c > @@ -255,21 +255,21 @@ static void check_utf8_comparisons(struct unicode_map *table) > } > } > > -static void check_supported_versions(void) > +static void check_supported_versions(struct unicode_map *um) > { > /* Unicode 7.0.0 should be supported. */ > - test(utf8version_is_supported(UNICODE_AGE(7, 0, 0))); > + test(utf8version_is_supported(um, UNICODE_AGE(7, 0, 0))); > > /* Unicode 9.0.0 should be supported. */ > - test(utf8version_is_supported(UNICODE_AGE(9, 0, 0))); > + test(utf8version_is_supported(um, UNICODE_AGE(9, 0, 0))); > > /* Unicode 1x.0.0 (the latest version) should be supported. */ > - test(utf8version_is_supported(UTF8_LATEST)); > + test(utf8version_is_supported(um, UTF8_LATEST)); > > /* Next versions don't exist. 
*/ > - test(!utf8version_is_supported(UNICODE_AGE(13, 0, 0))); > - test(!utf8version_is_supported(UNICODE_AGE(0, 0, 0))); > - test(!utf8version_is_supported(UNICODE_AGE(-1, -1, -1))); > + test(!utf8version_is_supported(um, UNICODE_AGE(13, 0, 0))); > + test(!utf8version_is_supported(um, UNICODE_AGE(0, 0, 0))); > + test(!utf8version_is_supported(um, UNICODE_AGE(-1, -1, -1))); > } > > static int __init init_test_ucd(void) > @@ -285,7 +285,7 @@ static int __init init_test_ucd(void) > return PTR_ERR(um); > } > > - check_supported_versions(); > + check_supported_versions(um); > check_utf8_nfdi(um); > check_utf8_nfdicf(um); > check_utf8_comparisons(um); > diff --git a/fs/unicode/utf8data.h_shipped b/fs/unicode/utf8data.c_shipped > similarity index 99% > rename from fs/unicode/utf8data.h_shipped > rename to fs/unicode/utf8data.c_shipped > index 76e4f0e1b0891..d9b62901aa96b 100644 > --- a/fs/unicode/utf8data.h_shipped > +++ b/fs/unicode/utf8data.c_shipped > @@ -1,9 +1,8 @@ > /* This file is generated code, do not edit. */ > -#ifndef __INCLUDED_FROM_UTF8NORM_C__ > -#error Only nls_utf8-norm.c should include this file. 
> -#endif > > -static const unsigned int utf8vers = 0xc0100; > +#include <linux/module.h> > +#include <linux/kernel.h> > +#include "utf8n.h" > > static const unsigned int utf8agetab[] = { > 0, > @@ -4107,3 +4106,18 @@ static const unsigned char utf8data[64256] = { > 0x52,0x04,0x00,0x00,0x11,0x04,0x00,0x00,0x02,0x00,0xcf,0x86,0xcf,0x06,0x02,0x00, > 0x81,0x80,0xcf,0x86,0x85,0x84,0xcf,0x86,0xcf,0x06,0x02,0x00,0x00,0x00,0x00,0x00 > }; > + > +struct utf8data_table utf8_data_table = { > + .utf8agetab = utf8agetab, > + .utf8agetab_size = ARRAY_SIZE(utf8agetab), > + > + .utf8nfdicfdata = utf8nfdicfdata, > + .utf8nfdicfdata_size = ARRAY_SIZE(utf8nfdicfdata), > + > + .utf8nfdidata = utf8nfdidata, > + .utf8nfdidata_size = ARRAY_SIZE(utf8nfdidata), > + > + .utf8data = utf8data, > +}; > +EXPORT_SYMBOL_GPL(utf8_data_table); > +MODULE_LICENSE("GPL v2"); > diff --git a/fs/unicode/utf8n.h b/fs/unicode/utf8n.h > index 206c89f0dbf71..bd00d587747a7 100644 > --- a/fs/unicode/utf8n.h > +++ b/fs/unicode/utf8n.h > @@ -13,25 +13,7 @@ > #include <linux/module.h> > #include <linux/unicode.h> > > -int utf8version_is_supported(unsigned int version); > - > -/* > - * Look for the correct const struct utf8data for a unicode version. > - * Returns NULL if the version requested is too new. > - * > - * Two normalization forms are supported: nfdi and nfdicf. > - * > - * nfdi: > - * - Apply unicode normalization form NFD. > - * - Remove any Default_Ignorable_Code_Point. > - * > - * nfdicf: > - * - Apply unicode normalization form NFD. > - * - Remove any Default_Ignorable_Code_Point. > - * - Apply a full casefold (C + F). 
> - */ > -extern const struct utf8data *utf8nfdi(unsigned int maxage); > -extern const struct utf8data *utf8nfdicf(unsigned int maxage); > +int utf8version_is_supported(const struct unicode_map *um, unsigned int version); > > /* > * Determine the length of the normalized from of the string, > @@ -78,4 +60,24 @@ int utf8ncursor(struct utf8cursor *u8c, const struct unicode_map *um, > */ > extern int utf8byte(struct utf8cursor *u8c); > > +struct utf8data { > + unsigned int maxage; > + unsigned int offset; > +}; > + > +struct utf8data_table { > + const unsigned int *utf8agetab; > + int utf8agetab_size; > + > + const struct utf8data *utf8nfdicfdata; > + int utf8nfdicfdata_size; > + > + const struct utf8data *utf8nfdidata; > + int utf8nfdidata_size; > + > + const unsigned char *utf8data; > +}; > + > +extern struct utf8data_table utf8_data_table; > + > #endif /* UTF8NORM_H */ > diff --git a/include/linux/unicode.h b/include/linux/unicode.h > index 526ca8b8391a5..4d39e6e11a950 100644 > --- a/include/linux/unicode.h > +++ b/include/linux/unicode.h > @@ -6,6 +6,7 @@ > #include <linux/dcache.h> > > struct utf8data; > +struct utf8data_table; > > #define UNICODE_MAJ_SHIFT 16 > #define UNICODE_MIN_SHIFT 8 > @@ -49,6 +50,7 @@ enum utf8_normalization { > struct unicode_map { > unsigned int version; > const struct utf8data *ntab[UTF8_NMAX]; > + const struct utf8data_table *tables; > }; > > int utf8_validate(const struct unicode_map *um, const struct qstr *str); -- Gabriel Krisman Bertazi