[PATCH v2] fs: unicode: Add utf8-data module

Shreeya Patel <shreeya.patel@xxxxxxxxxxxxx> · Wed, 18 Aug 2021 03:04:11 +0530

utf8data.h_shipped has a large database table which is an auto-generated
decodification trie for the unicode normalization functions.
We can avoid carrying this large table in the kernel unless it is required
by the filesystem during the boot process.

Hence, add utf8-data module which will be loaded only when UTF-8 encoding
support is needed by the filesystem, provided it is selected as M.
utf8-data will provide access to the data tables present in utf8data.h.

Also, add support for enabling utf8-data as a built-in option so that
filesystems that require UTF-8 encoding during the boot process can access
the data tables without any failure.

Signed-off-by: Shreeya Patel <shreeya.patel@xxxxxxxxxxxxx>
---
Changes in v2
 - Since there are no function pointer fields anymore, use utf8_data
as the name instead of utf8_ops
 - Remove unnecessary variable utf8data_loaded

 fs/unicode/Kconfig         | 23 +++++++++++--
 fs/unicode/Makefile        |  3 +-
 fs/unicode/utf8-core.c     | 50 ++++++++++++++++++++++++++--
 fs/unicode/utf8-data.c     | 42 +++++++++++++++++++++++
 fs/unicode/utf8-norm.c     | 68 ++++++++++++++++++++++----------------
 fs/unicode/utf8-selftest.c | 25 ++++++--------
 fs/unicode/utf8n.h         | 30 +++++++++++++++++
 7 files changed, 193 insertions(+), 48 deletions(-)
 create mode 100644 fs/unicode/utf8-data.c

diff --git a/fs/unicode/Kconfig b/fs/unicode/Kconfig
index 2c27b9a5cd6c..80341fae5e63 100644
--- a/fs/unicode/Kconfig
+++ b/fs/unicode/Kconfig
@@ -2,13 +2,30 @@
 #
 # UTF-8 normalization
 #
+# This config option will be automatically selected when UNICODE_UTF8_DATA
+# is enabled. UNICODE config will provide all the UTF-8 core and normalization
+# functions which will use UTF-8 data tables.
 config UNICODE
 	bool "UTF-8 normalization and casefolding support"
+
+config UNICODE_UTF8_DATA
+	tristate "UTF-8 support for native Case-Insensitive filesystems"
+	select UNICODE
 	help
-	  Say Y here to enable UTF-8 NFD normalization and NFD+CF casefolding
-	  support.
+	  Say M here to enable UTF-8 NFD normalization and NFD+CF casefolding
+	  support as a loadable module or say Y for building it into the kernel.
+	  It is currently supported by EXT4 and F2FS filesystems.
+
+	  utf8data.h_shipped has a large database table which is an
+	  auto-generated decodification trie for the unicode normalization
+	  functions. Enabling UNICODE_UTF8_DATA as M lets you avoid carrying
+	  this large table in the kernel image; the module, together with
+	  its data tables, is then loaded only when a filesystem requires it.
+	  If your filesystem needs the utf8-data tables at boot time, you
+	  should build them into the kernel by saying Y here to avoid any
+	  boot failure.
 
 config UNICODE_NORMALIZATION_SELFTEST
 	tristate "Test UTF-8 normalization support"
-	depends on UNICODE
+	depends on UNICODE_UTF8_DATA
 	default n
diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile
index b88aecc86550..fc28a6e2c56f 100644
--- a/fs/unicode/Makefile
+++ b/fs/unicode/Makefile
@@ -2,10 +2,11 @@
 
 obj-$(CONFIG_UNICODE) += unicode.o
 obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o
+obj-$(CONFIG_UNICODE_UTF8_DATA) += utf8-data.o
 
 unicode-y := utf8-norm.o utf8-core.o
 
-$(obj)/utf8-norm.o: $(obj)/utf8data.h
+$(obj)/utf8-data.o: $(obj)/utf8data.h
 
 # In the normal build, the checked-in utf8data.h is just shipped.
 #
diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c
index dc25823bfed9..4eb08385e680 100644
--- a/fs/unicode/utf8-core.c
+++ b/fs/unicode/utf8-core.c
@@ -192,7 +192,7 @@ static int utf8_parse_version(const char *version, unsigned int *maj,
 	return 0;
 }
 
-struct unicode_map *utf8_load(const char *version)
+static struct unicode_map *utf8_load_core(const char *version)
 {
 	struct unicode_map *um = NULL;
 	int unicode_version;
@@ -225,11 +225,57 @@ struct unicode_map *utf8_load(const char *version)
 
 	return um;
 }
+
+static void utf8_unload_core(struct unicode_map *um)
+{
+	kfree(um);
+}
+
+static int utf8mod_get(void)
+{
+	int ret;
+
+	spin_lock(&utf8_lock);
+	ret = utf8_data && try_module_get(utf8_data->owner);
+	spin_unlock(&utf8_lock);
+	return ret;
+}
+
+struct unicode_map *utf8_load(const char *version)
+{
+	struct unicode_map *um;
+
+	/*
+	 * try_then_request_module() is used here instead of using
+	 * request_module() because of the following problems that
+	 * could occur with the usage of request_module().
+	 * 1) Multiple calls in parallel to utf8_load() would fail if
+	 * kmod_concurrent_max == 0
+	 * 2) There would be unnecessary memory allocation and userspace
+	 * invocation in call_modprobe() that would always happen even if
+	 * the module is already loaded.
+	 * Hence, try_then_request_module() first checks whether the module
+	 * is already loaded; if not, it calls request_module() and finally
+	 * acquires a reference to the loaded module.
+	 */
+	if (!try_then_request_module(utf8mod_get(), "utf8-data")) {
+		pr_err("Failed to load UTF-8 module\n");
+		return ERR_PTR(-ENODEV);
+	}
+	um = utf8_load_core(version);
+	if (IS_ERR(um))
+		module_put(utf8_data->owner);
+
+	return um;
+}
 EXPORT_SYMBOL(utf8_load);
 
 void utf8_unload(struct unicode_map *um)
 {
-	kfree(um);
+	if (um) {
+		utf8_unload_core(um);
+		module_put(utf8_data->owner);
+	}
 }
 EXPORT_SYMBOL(utf8_unload);
 
diff --git a/fs/unicode/utf8-data.c b/fs/unicode/utf8-data.c
new file mode 100644
index 000000000000..1ae3c5dda6c7
--- /dev/null
+++ b/fs/unicode/utf8-data.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include "utf8n.h"
+
+#define __INCLUDED_FROM_UTF8NORM_C__
+#include "utf8data.h"
+#undef __INCLUDED_FROM_UTF8NORM_C__
+
+/* Passed to unicode_register() by utf8_init(); static keeps it file-local. */
+static struct utf8data_table data = {
+	.owner = THIS_MODULE,
+	.utf8vers = utf8vers,
+
+	.utf8agetab = utf8agetab,
+	.utf8agetab_size = ARRAY_SIZE(utf8agetab),
+
+	.utf8nfdicfdata = utf8nfdicfdata,
+	.utf8nfdicfdata_size = ARRAY_SIZE(utf8nfdicfdata),
+
+	.utf8nfdidata = utf8nfdidata,
+	.utf8nfdidata_size = ARRAY_SIZE(utf8nfdidata),
+
+	.utf8data = utf8data,
+	.utf8data_size = ARRAY_SIZE(utf8data),
+};
+
+static int __init utf8_init(void)
+{
+	unicode_register(&data);
+	return 0;
+}
+
+static void __exit utf8_exit(void)
+{
+	unicode_unregister();
+}
+
+module_init(utf8_init);
+module_exit(utf8_exit);
+
+MODULE_LICENSE("GPL v2");
diff --git a/fs/unicode/utf8-norm.c b/fs/unicode/utf8-norm.c
index 1d2d2e5b906a..a6276f50a18f 100644
--- a/fs/unicode/utf8-norm.c
+++ b/fs/unicode/utf8-norm.c
@@ -6,22 +6,18 @@
 
 #include "utf8n.h"
 
-struct utf8data {
-	unsigned int maxage;
-	unsigned int offset;
-};
+/* Spinlock for protecting utf8_data */
+DEFINE_SPINLOCK(utf8_lock);
 
-#define __INCLUDED_FROM_UTF8NORM_C__
-#include "utf8data.h"
-#undef __INCLUDED_FROM_UTF8NORM_C__
+struct utf8data_table *utf8_data;
 
 int utf8version_is_supported(u8 maj, u8 min, u8 rev)
 {
-	int i = ARRAY_SIZE(utf8agetab) - 1;
+	int i = utf8_data->utf8agetab_size - 1;
 	unsigned int sb_utf8version = UNICODE_AGE(maj, min, rev);
 
-	while (i >= 0 && utf8agetab[i] != 0) {
-		if (sb_utf8version == utf8agetab[i])
+	while (i >= 0 && utf8_data->utf8agetab[i] != 0) {
+		if (sb_utf8version == utf8_data->utf8agetab[i])
 			return 1;
 		i--;
 	}
@@ -31,7 +27,7 @@ EXPORT_SYMBOL(utf8version_is_supported);
 
 int utf8version_latest(void)
 {
-	return utf8vers;
+	return utf8_data->utf8vers;
 }
 EXPORT_SYMBOL(utf8version_latest);
 
@@ -168,7 +164,7 @@ typedef const unsigned char utf8trie_t;
  * underlying datatype: unsigned char.
  *
  * leaf[0]: The unicode version, stored as a generation number that is
- *          an index into utf8agetab[].  With this we can filter code
+ *          an index into utf8_data->utf8agetab[].  With this we can filter code
  *          points based on the unicode version in which they were
  *          defined.  The CCC of a non-defined code point is 0.
  * leaf[1]: Canonical Combining Class. During normalization, we need
@@ -330,7 +326,7 @@ static utf8leaf_t *utf8nlookup(const struct utf8data *data,
 	if (len == 0)
 		return NULL;
 
-	trie = utf8data + data->offset;
+	trie = utf8_data->utf8data + data->offset;
 	node = 1;
 	while (node) {
 		offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
@@ -418,7 +414,7 @@ int utf8agemax(const struct utf8data *data, const char *s)
 		if (!leaf)
 			return -1;
 
-		leaf_age = utf8agetab[LEAF_GEN(leaf)];
+		leaf_age = utf8_data->utf8agetab[LEAF_GEN(leaf)];
 		if (leaf_age <= data->maxage && leaf_age > age)
 			age = leaf_age;
 		s += utf8clen(s);
@@ -446,7 +442,7 @@ int utf8agemin(const struct utf8data *data, const char *s)
 		leaf = utf8lookup(data, hangul, s);
 		if (!leaf)
 			return -1;
-		leaf_age = utf8agetab[LEAF_GEN(leaf)];
+		leaf_age = utf8_data->utf8agetab[LEAF_GEN(leaf)];
 		if (leaf_age <= data->maxage && leaf_age < age)
 			age = leaf_age;
 		s += utf8clen(s);
@@ -473,7 +469,7 @@ int utf8nagemax(const struct utf8data *data, const char *s, size_t len)
 		leaf = utf8nlookup(data, hangul, s, len);
 		if (!leaf)
 			return -1;
-		leaf_age = utf8agetab[LEAF_GEN(leaf)];
+		leaf_age = utf8_data->utf8agetab[LEAF_GEN(leaf)];
 		if (leaf_age <= data->maxage && leaf_age > age)
 			age = leaf_age;
 		len -= utf8clen(s);
@@ -501,7 +497,7 @@ int utf8nagemin(const struct utf8data *data, const char *s, size_t len)
 		leaf = utf8nlookup(data, hangul, s, len);
 		if (!leaf)
 			return -1;
-		leaf_age = utf8agetab[LEAF_GEN(leaf)];
+		leaf_age = utf8_data->utf8agetab[LEAF_GEN(leaf)];
 		if (leaf_age <= data->maxage && leaf_age < age)
 			age = leaf_age;
 		len -= utf8clen(s);
@@ -529,7 +525,7 @@ ssize_t utf8len(const struct utf8data *data, const char *s)
 		leaf = utf8lookup(data, hangul, s);
 		if (!leaf)
 			return -1;
-		if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
+		if (utf8_data->utf8agetab[LEAF_GEN(leaf)] > data->maxage)
 			ret += utf8clen(s);
 		else if (LEAF_CCC(leaf) == DECOMPOSE)
 			ret += strlen(LEAF_STR(leaf));
@@ -557,7 +553,7 @@ ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len)
 		leaf = utf8nlookup(data, hangul, s, len);
 		if (!leaf)
 			return -1;
-		if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
+		if (utf8_data->utf8agetab[LEAF_GEN(leaf)] > data->maxage)
 			ret += utf8clen(s);
 		else if (LEAF_CCC(leaf) == DECOMPOSE)
 			ret += strlen(LEAF_STR(leaf));
@@ -690,7 +686,7 @@ int utf8byte(struct utf8cursor *u8c)
 
 		ccc = LEAF_CCC(leaf);
 		/* Characters that are too new have CCC 0. */
-		if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) {
+		if (utf8_data->utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) {
 			ccc = STOPPER;
 		} else if (ccc == DECOMPOSE) {
 			u8c->len -= utf8clen(u8c->s);
@@ -769,24 +765,40 @@ EXPORT_SYMBOL(utf8byte);
 
 const struct utf8data *utf8nfdi(unsigned int maxage)
 {
-	int i = ARRAY_SIZE(utf8nfdidata) - 1;
+	int i = utf8_data->utf8nfdidata_size - 1;
 
-	while (maxage < utf8nfdidata[i].maxage)
+	while (maxage < utf8_data->utf8nfdidata[i].maxage)
 		i--;
-	if (maxage > utf8nfdidata[i].maxage)
+	if (maxage > utf8_data->utf8nfdidata[i].maxage)
 		return NULL;
-	return &utf8nfdidata[i];
+	return &utf8_data->utf8nfdidata[i];
 }
 EXPORT_SYMBOL(utf8nfdi);
 
 const struct utf8data *utf8nfdicf(unsigned int maxage)
 {
-	int i = ARRAY_SIZE(utf8nfdicfdata) - 1;
+	int i = utf8_data->utf8nfdicfdata_size - 1;
 
-	while (maxage < utf8nfdicfdata[i].maxage)
+	while (maxage < utf8_data->utf8nfdicfdata[i].maxage)
 		i--;
-	if (maxage > utf8nfdicfdata[i].maxage)
+	if (maxage > utf8_data->utf8nfdicfdata[i].maxage)
 		return NULL;
-	return &utf8nfdicfdata[i];
+	return &utf8_data->utf8nfdicfdata[i];
 }
 EXPORT_SYMBOL(utf8nfdicf);
+
+void unicode_register(struct utf8data_table *data)
+{
+	spin_lock(&utf8_lock);
+	utf8_data = data;
+	spin_unlock(&utf8_lock);
+}
+EXPORT_SYMBOL(unicode_register);
+
+void unicode_unregister(void)
+{
+	spin_lock(&utf8_lock);
+	utf8_data = NULL;
+	spin_unlock(&utf8_lock);
+}
+EXPORT_SYMBOL(unicode_unregister);
diff --git a/fs/unicode/utf8-selftest.c b/fs/unicode/utf8-selftest.c
index 6fe8af7edccb..d8069f4ad452 100644
--- a/fs/unicode/utf8-selftest.c
+++ b/fs/unicode/utf8-selftest.c
@@ -16,6 +16,7 @@
 
 unsigned int failed_tests;
 unsigned int total_tests;
+struct unicode_map *table;
 
 /* Tests will be based on this version. */
 #define latest_maj 12
@@ -232,16 +233,9 @@ static void check_utf8_nfdicf(void)
 	}
 }
 
-static void check_utf8_comparisons(void)
+static void check_utf8_comparisons(struct unicode_map *table)
 {
 	int i;
-	struct unicode_map *table = utf8_load("12.1.0");
-
-	if (IS_ERR(table)) {
-		pr_err("%s: Unable to load utf8 %d.%d.%d. Skipping.\n",
-		       __func__, latest_maj, latest_min, latest_rev);
-		return;
-	}
 
 	for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) {
 		const struct qstr s1 = {.name = nfdi_test_data[i].str,
@@ -262,8 +256,6 @@ static void check_utf8_comparisons(void)
 		test_f(!utf8_strncasecmp(table, &s1, &s2),
 		       "%s %s comparison mismatch\n", s1.name, s2.name);
 	}
-
-	utf8_unload(table);
 }
 
 static void check_supported_versions(void)
@@ -274,9 +266,6 @@ static void check_supported_versions(void)
 	/* Unicode 9.0.0 should be supported. */
 	test(utf8version_is_supported(9, 0, 0));
 
-	/* Unicode 1x.0.0 (the latest version) should be supported. */
-	test(utf8version_is_supported(latest_maj, latest_min, latest_rev));
-
 	/* Next versions don't exist. */
 	test(!utf8version_is_supported(13, 0, 0));
 	test(!utf8version_is_supported(0, 0, 0));
@@ -288,10 +277,17 @@ static int __init init_test_ucd(void)
 	failed_tests = 0;
 	total_tests = 0;
 
+	table = utf8_load("12.1.0");
+	if (IS_ERR(table)) {
+		pr_err("%s: Unable to load utf8 %d.%d.%d. Could not run the tests\n",
+		       __func__, latest_maj, latest_min, latest_rev);
+		return -EINVAL;
+	}
+
 	check_supported_versions();
 	check_utf8_nfdi();
 	check_utf8_nfdicf();
-	check_utf8_comparisons();
+	check_utf8_comparisons(table);
 
 	if (!failed_tests)
 		pr_info("All %u tests passed\n", total_tests);
@@ -303,6 +299,7 @@ static int __init init_test_ucd(void)
 
 static void __exit exit_test_ucd(void)
 {
+	utf8_unload(table);
 }
 
 module_init(init_test_ucd);
diff --git a/fs/unicode/utf8n.h b/fs/unicode/utf8n.h
index 0acd530c2c79..eb73fee9efc4 100644
--- a/fs/unicode/utf8n.h
+++ b/fs/unicode/utf8n.h
@@ -11,6 +11,7 @@
 #include <linux/export.h>
 #include <linux/string.h>
 #include <linux/module.h>
+#include <linux/spinlock.h>
 
 /* Encoding a unicode version number as a single unsigned int. */
 #define UNICODE_MAJ_SHIFT		(16)
@@ -21,6 +22,9 @@
 	 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |	\
 	 ((unsigned int)(REV)))
 
+extern spinlock_t utf8_lock;
+extern struct utf8data_table *utf8_data;
+
 /* Highest unicode version supported by the data tables. */
 extern int utf8version_is_supported(u8 maj, u8 min, u8 rev);
 extern int utf8version_latest(void);
@@ -105,4 +109,30 @@ extern int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
  */
 extern int utf8byte(struct utf8cursor *u8c);
 
+struct utf8data {
+	unsigned int maxage;
+	unsigned int offset;
+};
+
+struct utf8data_table {
+	struct module *owner;
+
+	const unsigned int utf8vers;
+
+	const unsigned int *utf8agetab;
+	int utf8agetab_size;
+
+	const struct utf8data *utf8nfdicfdata;
+	int utf8nfdicfdata_size;
+
+	const struct utf8data *utf8nfdidata;
+	int utf8nfdidata_size;
+
+	const unsigned char *utf8data;
+	int utf8data_size;
+};
+
+void unicode_register(struct utf8data_table *data);
+void unicode_unregister(void);
+
 #endif /* UTF8NORM_H */
-- 
2.30.2