[PATCH 7/7] module: add debug stats to help identify memory pressure

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Loading modules with finit_module() can end up using vmalloc(), vmap()
and vmalloc() again, for a total of up to 3 separate allocations in the
worse case for a single module. We always kernel_read*() the module,
that's a vmalloc(). Then vmap() is used for the module decompression,
and if so the last read buffer is freed as we use the now decompressed
module buffer to stuff data into our copy module. The last one is
specific to architectures but pretty much that's generally a series
of vmalloc() for different ELF sections...

Evaluation with new stress-ng module support [1] with just 100 ops
us proving that you can end up using GiBs of data easily even if
we are trying to be very careful not to load modules which are already
loaded. 100 ops seems to resemble the sort of pressure a system with
about 400 CPUs can create on modules. Although those issues for so
many concurrent loads per CPU is silly and are being fixed, we lack
proper tooling to help diagnose easily what happened, when it happened
and what likely are the culprits -- userspace or kernel module
autoloading.

Provide an initial set of stats for debugfs which let us easily scrape
post-boot information about failed loads. This sort of information can
be used on production worklaods to try to optimize *avoiding* redundant
memory pressure using finit_module().

Screen shot:

root@kmod ~ # cat /sys/kernel/debug/modules/stats
           Modules loaded       67
        Total module size       11464704
      Total mod text size       4194304
       Failed kread bytes       890064
        Failed kmod bytes       890064
      Invalid kread bytes       890064
 Invalid decompress bytes       0
        Invalid mod bytes       890064
         Average mod size       171115
    Average mod text size       62602
Failed modules:
                kvm_intel
                      kvm
                irqbypass
         crct10dif_pclmul
      ghash_clmulni_intel
             sha512_ssse3
           sha512_generic
              aesni_intel
              crypto_simd
                   cryptd
                    evdev
                serio_raw
               virtio_pci
                     nvme
                nvme_core
    virtio_pci_legacy_dev
                   t10_pi
           crc64_rocksoft
    virtio_pci_modern_dev
             crc32_pclmul
                   virtio
             crc32c_intel
              virtio_ring
                    crc64

[0] https://github.com/ColinIanKing/stress-ng.git
[1] echo 0 > /proc/sys/vm/oom_dump_tasks
    ./stress-ng --module 100 --module-name xfs

Signed-off-by: Luis Chamberlain <mcgrof@xxxxxxxxxx>
---
 kernel/module/Kconfig    |  32 +++++++
 kernel/module/Makefile   |   4 +
 kernel/module/debug.c    |  16 ++++
 kernel/module/internal.h |  35 +++++++
 kernel/module/main.c     |  45 ++++++++-
 kernel/module/stats.c    | 200 +++++++++++++++++++++++++++++++++++++++
 kernel/module/tracking.c |   7 +-
 7 files changed, 331 insertions(+), 8 deletions(-)
 create mode 100644 kernel/module/debug.c
 create mode 100644 kernel/module/stats.c

diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
index 424b3bc58f3f..fbf7b92cb3d0 100644
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -22,6 +22,12 @@ menuconfig MODULES
 
 if MODULES
 
+config MODULE_DEBUG
+	bool "Module debugging"
+	default n
+	help
+	  Allows to help debug module functionality.
+
 config MODULE_FORCE_LOAD
 	bool "Forced module loading"
 	default n
@@ -48,6 +54,8 @@ config MODULE_FORCE_UNLOAD
 	  rmmod).  This is mainly for kernel developers and desperate users.
 	  If unsure, say N.
 
+if MODULE_DEBUG
+
 config MODULE_UNLOAD_TAINT_TRACKING
 	bool "Tainted module unload tracking"
 	depends on MODULE_UNLOAD
@@ -59,6 +67,30 @@ config MODULE_UNLOAD_TAINT_TRACKING
 	  page (see bad_page()), the aforementioned details are also
 	  shown. If unsure, say N.
 
+config MODULE_STATS
+	bool "Module statistics"
+	depends on DEBUG_FS
+	default n
+	help
+	  This option allows you to maintain a record of module statistics.
+	  For example each all modules size, average size, text size, and
+	  failed modules and the size for each of those. For failed
+	  modules we keep track of module which failed due to either the
+	  existing module taking too long to load or that module already
+	  was loaded.
+
+	  You should enable this if you are debugging production loads
+	  and want to see if userspace or the kernel is doing stupid things
+	  with loading modules when it shouldn't or if you want to help
+	  optimize userspace / kernel space module autoloading schemes.
+	  You might want to do this because failed modules tend to use
+	  use up significan amount of memory, and so you'd be doing everyone
+	  a favor in avoiding these failure proactively.
+
+	  If unsure, say N.
+
+endif # MODULE_DEBUG
+
 config MODVERSIONS
 	bool "Module versioning support"
 	help
diff --git a/kernel/module/Makefile b/kernel/module/Makefile
index 5b1d26b53b8d..fe97047f3807 100644
--- a/kernel/module/Makefile
+++ b/kernel/module/Makefile
@@ -20,4 +20,8 @@ obj-$(CONFIG_PROC_FS) += procfs.o
 obj-$(CONFIG_SYSFS) += sysfs.o
 obj-$(CONFIG_KGDB_KDB) += kdb.o
 obj-$(CONFIG_MODVERSIONS) += version.o
+
+# Link order matters here, keep debug.o first.
+obj-$(CONFIG_MODULE_DEBUG) += debug.o
+obj-$(CONFIG_MODULE_STATS) += stats.o
 obj-$(CONFIG_MODULE_UNLOAD_TAINT_TRACKING) += tracking.o
diff --git a/kernel/module/debug.c b/kernel/module/debug.c
new file mode 100644
index 000000000000..ef580d70b751
--- /dev/null
+++ b/kernel/module/debug.c
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2023 Luis Chamberlain <mcgrof@xxxxxxxxxx>
+ */
+
+#include <linux/debugfs.h>
+#include "internal.h"
+
+struct dentry *mod_debugfs_root;
+
+static int module_debugfs_init(void)
+{
+	mod_debugfs_root = debugfs_create_dir("modules", NULL);
+	return 0;
+}
+module_init(module_debugfs_init);
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index 6ae29bb8836f..a645cb3fafc7 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -143,6 +143,41 @@ static inline bool set_livepatch_module(struct module *mod)
 #endif
 }
 
+#ifdef CONFIG_MODULE_STATS
+
+#define mod_stat_add64(count, var) atomic64_add(count, var)
+#define mod_stat_inc(name) atomic_inc(name)
+
+extern atomic64_t total_mod_size;
+extern atomic64_t total_text_size;
+extern atomic64_t invalid_kread_bytes;
+extern atomic64_t invalid_decompress_bytes;
+extern atomic64_t invalid_mod_becoming_bytes;
+extern atomic64_t invalid_mod_bytes;
+
+extern atomic_t modcount;
+extern atomic_t failed_kreads;
+extern atomic_t failed_decompress;
+extern atomic_t failed_load_modules;
+struct mod_fail_load {
+	struct list_head list;
+	char name[MODULE_NAME_LEN];
+	atomic_t count;
+};
+
+int try_add_failed_module(const char *name);
+
+#else
+
+#define mod_stat_inc64(name)
+#define mod_stat_inc(name) atomic_inc(name)
+
+static inline int try_add_failed_module(const char *name)
+{
+	return 0;
+}
+#endif /* CONFIG_MODULE_STATS */
+
 #ifdef CONFIG_MODULE_UNLOAD_TAINT_TRACKING
 struct mod_unload_taint {
 	struct list_head list;
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 145e15f19576..8b851042b7f9 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -2495,6 +2495,16 @@ static noinline int do_init_module(struct module *mod)
 {
 	int ret = 0;
 	struct mod_initfree *freeinit;
+	unsigned int text_size = 0, total_size = 0;
+
+	for_each_mod_mem_type(type) {
+		const struct module_memory *mod_mem = &mod->mem[type];
+		if (mod_mem->size) {
+			total_size += mod_mem->size;
+			if (type == MOD_TEXT || type == MOD_INIT_TEXT)
+				text_size += mod->mem[type].size;
+		}
+	}
 
 	freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL);
 	if (!freeinit) {
@@ -2556,6 +2566,7 @@ static noinline int do_init_module(struct module *mod)
 		mod->mem[type].base = NULL;
 		mod->mem[type].size = 0;
 	}
+
 #ifdef CONFIG_DEBUG_INFO_BTF_MODULES
 	/* .BTF is not SHF_ALLOC and will get removed, so sanitize pointer */
 	mod->btf_data = NULL;
@@ -2579,6 +2590,11 @@ static noinline int do_init_module(struct module *mod)
 	mutex_unlock(&module_mutex);
 	wake_up_all(&module_wq);
 
+	mod_stat_add64(text_size, &total_text_size);
+	mod_stat_add64(total_size, &total_mod_size);
+
+	mod_stat_inc(&modcount);
+
 	return 0;
 
 fail_free_freeinit:
@@ -2594,6 +2610,7 @@ static noinline int do_init_module(struct module *mod)
 	ftrace_release_mod(mod);
 	free_module(mod);
 	wake_up_all(&module_wq);
+
 	return ret;
 }
 
@@ -2650,6 +2667,9 @@ static int module_patient_check_exists(const char *name)
 		old = find_module_all(name, strlen(name), true);
 	}
 
+	if (try_add_failed_module(name))
+		pr_warn("Could not add fail-tracking for module: %s\n", name);
+
 	/*
 	 * We are here only when the same module was being loaded. Do
 	 * not try to load it again right now. It prevents long delays
@@ -2800,6 +2820,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
 		       int flags)
 {
 	struct module *mod;
+	bool module_allocated = false;
 	long err = 0;
 	char *after_dashes;
 
@@ -2839,6 +2860,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
 		goto free_copy;
 	}
 
+	module_allocated = true;
+
 	audit_log_kern_module(mod->name);
 
 	/* Reserve our place in the list. */
@@ -2983,6 +3006,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
 	synchronize_rcu();
 	mutex_unlock(&module_mutex);
  free_module:
+	mod_stat_add64(info->len, &invalid_mod_bytes);
 	/* Free lock-classes; relies on the preceding sync_rcu() */
 	for_class_mod_mem_type(type, core_data) {
 		lockdep_free_key_range(mod->mem[type].base,
@@ -2991,6 +3015,13 @@ static int load_module(struct load_info *info, const char __user *uargs,
 
 	module_deallocate(mod, info);
  free_copy:
+	/*
+	 * The info->len is always set. We distinguish between
+	 * failures once the proper module was allocated and
+	 * before that.
+	 */
+	if (!module_allocated)
+		mod_stat_add64(info->len, &invalid_mod_becoming_bytes);
 	free_copy(info, flags);
 	return err;
 }
@@ -3009,8 +3040,10 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
 	       umod, len, uargs);
 
 	err = copy_module_from_user(umod, len, &info);
-	if (err)
+	if (err) {
+		mod_stat_inc(&failed_kreads);
 		return err;
+	}
 
 	return load_module(&info, uargs, 0);
 }
@@ -3035,14 +3068,20 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
 
 	len = kernel_read_file_from_fd(fd, 0, &buf, INT_MAX, NULL,
 				       READING_MODULE);
-	if (len < 0)
+	if (len < 0) {
+		mod_stat_inc(&failed_kreads);
+		mod_stat_add64(len, &invalid_kread_bytes);
 		return len;
+	}
 
 	if (flags & MODULE_INIT_COMPRESSED_FILE) {
 		err = module_decompress(&info, buf, len);
 		vfree(buf); /* compressed data is no longer needed */
-		if (err)
+		if (err) {
+			mod_stat_inc(&failed_decompress);
+			mod_stat_add64(len, &invalid_decompress_bytes);
 			return err;
+		}
 	} else {
 		info.hdr = buf;
 		info.len = len;
diff --git a/kernel/module/stats.c b/kernel/module/stats.c
new file mode 100644
index 000000000000..bbf0ddb8b589
--- /dev/null
+++ b/kernel/module/stats.c
@@ -0,0 +1,200 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Let's keep keep tabs on a few important module stats, useful
+ * for debugging production loads and interactions between userspace
+ * and kernelspace for loading modules.
+ *
+ * Copyright (C) 2023 Luis Chamberlain <mcgrof@xxxxxxxxxx>
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/debugfs.h>
+#include <linux/rculist.h>
+
+#include "internal.h"
+
+extern struct dentry *mod_debugfs_root;
+
+/*
+ * Tracks modules which failed to be loaded as they were being processed.
+ * These require modulesed consumed vmalloc space for all finit_module()
+ * calls as kernel_read*() is used. Then if compression is used vmap()
+ * is used to allocate space for the decompressed version of what userspace
+ * has on the filesystem, we vfree() the compressed data which kerne_read*()
+ * fetched for us. Finally, a final module is allocated as well which we
+ * use to keep around, and that *can* use vmalloc() too.
+ *
+ * In the worst case, when module compression is used then we use the vmap
+ * space three times.
+ *
+ * We really should strive to get this list to be empty. This not being empty
+ * is a reflection of us needing to do more work to ensure either the kernel
+ * or usersapce does not do unnecessary calls to load modules which it should
+ * know are already loaded or on its way to be loaded.
+ */
+static LIST_HEAD(failed_modules);
+
+/* Total bytes used by all modules we've dealt with on this system */
+atomic64_t total_mod_size;
+
+/* Total .text section sizes we've dealt with on this system */
+atomic64_t total_text_size;
+
+/* Failures happen on the initial kernel_read_*() call. They use vmalloc() */
+atomic64_t invalid_kread_bytes;
+
+/* Failures happen on the module decompression path, these use use vmap(). */
+atomic64_t invalid_decompress_bytes;
+
+/*
+ * The invalid_mod_becoming_bytes only keeps tabs of failures in between kread
+ * success and right before we allocate the module to process it. These
+ * can be failures due to:
+ *
+ *  o module_sig_check() - module signature checks
+ *  o elf_validity_cache_copy() - ELF does not add up
+ *  o early_mod_check():
+ *  	- blacklist
+ *  	- failed to rewrite section headers
+ *  	- verion magic
+ *  	- live patch requirements didn't check out
+ *  	- the module was detected as being already present, this
+ *  	  first check avoids a new vmalloc for the full size of
+ *  	  the module.
+ */
+atomic64_t invalid_mod_becoming_bytes;
+
+/*
+ * These are failures after we did all the sanity checks of the
+ * module userspace passed to us. This can still fail if we detect
+ * the module is loaded, we do this check after we allocate space
+ * for the module.
+ */
+atomic64_t invalid_mod_bytes;
+
+/* How many modules we've loaded in our kernel life time */
+atomic_t modcount;
+
+/* How many modules failed due to failed kernel_read*() */
+atomic_t failed_kreads;
+
+/* How many failed decompression attempts we've had */
+atomic_t failed_decompress;
+
+/* How many modules failed once we've allocated space for our module */
+atomic_t failed_load_modules;
+
+int try_add_failed_module(const char *name)
+{
+	struct mod_fail_load *mod_fail;
+
+	list_for_each_entry_rcu(mod_fail, &failed_modules, list,
+				lockdep_is_held(&module_mutex)) {
+                if (!strcmp(mod_fail->name, name)) {
+                        atomic_inc(&mod_fail->count);
+                        goto out;
+                }
+        }
+
+	mod_fail = kmalloc(sizeof(*mod_fail), GFP_KERNEL);
+	if (!mod_fail)
+		return -ENOMEM;
+	strscpy(mod_fail->name, name, MODULE_NAME_LEN);
+        atomic_inc(&mod_fail->count);
+        list_add_rcu(&mod_fail->list, &failed_modules);
+out:
+	return 0;
+}
+
+static ssize_t read_file_mod_stats(struct file *file, char __user *user_buf,
+				   size_t count, loff_t *ppos)
+{
+	struct mod_fail_load *mod_fail;
+	unsigned int len;
+	const unsigned int size = 1024;
+	char *buf;
+	u32 live_mod_count, fkreads, floads;
+	u64 total_size, text_size, ikread_bytes, idecomp_bytes, imod_bytes;
+
+	live_mod_count = atomic_read(&modcount);
+	fkreads = atomic_read(&failed_kreads);
+	floads = atomic_read(&failed_load_modules);
+
+	total_size = atomic64_read(&total_mod_size);
+	text_size = atomic64_read(&total_text_size);
+	ikread_bytes = atomic64_read(&invalid_mod_bytes);
+	idecomp_bytes = atomic64_read(&invalid_decompress_bytes);
+	imod_bytes = atomic64_read(&invalid_mod_bytes);
+
+	buf = kzalloc(size, GFP_KERNEL);
+	if (buf == NULL)
+		return -ENOMEM;
+
+	len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Modules loaded", live_mod_count);
+	len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Total module size", total_size);
+	len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Total mod text size", text_size);
+
+	/*
+	 * Failed kmod bytes do not contain any failed kreads bytes as those
+	 * failures would happen earlier on kread. Failed kread bytes are wasted
+	 * vmalloc() space allocations and are indicative of invalid modules.
+	 */
+	len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Failed kread bytes", ikread_bytes);
+
+	/*
+	 * Failed kmod bytes are modules which for whatever reason fail to load
+	 * on the load_module() effort. They are good signs the kernel or userspace
+	 * is doing something stupid or that could be improved.
+	 */
+	len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Failed kmod bytes", imod_bytes);
+
+	len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Invalid kread bytes", ikread_bytes);
+	len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Invalid decompress bytes", idecomp_bytes);
+	len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Invalid mod bytes", imod_bytes);
+
+	if (live_mod_count && total_size) {
+		len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Average mod size",
+				 DIV_ROUND_UP(total_size, live_mod_count));
+	}
+
+	if (live_mod_count && text_size) {
+		len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Average mod text size",
+				 DIV_ROUND_UP(text_size, live_mod_count));
+	}
+
+	if (list_empty(&failed_modules))
+		goto out;
+
+	len += scnprintf(buf + len, size - len, "Failed modules:\n");
+	list_for_each_entry_rcu(mod_fail, &failed_modules, list)
+		len += scnprintf(buf + len, size - len, "%25s\n", mod_fail->name);
+out:
+        return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+
+static const struct file_operations fops_mod_stats = {
+	.read = read_file_mod_stats,
+	.open = simple_open,
+	.owner = THIS_MODULE,
+	.llseek = default_llseek,
+};
+
+static int __init module_stats_init(void)
+{
+	debugfs_create_atomic64_t("total_mod_size", 0400, mod_debugfs_root, &total_mod_size);
+	debugfs_create_atomic64_t("total_text_size", 0400, mod_debugfs_root, &total_text_size);
+	debugfs_create_atomic64_t("invalid_kread_bytes", 0400, mod_debugfs_root, &invalid_kread_bytes);
+	debugfs_create_atomic64_t("invalid_decompress_bytes", 0400, mod_debugfs_root, &invalid_decompress_bytes);
+	debugfs_create_atomic64_t("invalid_mod_bytes", 0400, mod_debugfs_root, &invalid_mod_bytes);
+	debugfs_create_atomic_t("modcount", 0400, mod_debugfs_root, &modcount);
+	debugfs_create_atomic_t("failed_kreads", 0400, mod_debugfs_root, &failed_kreads);
+	debugfs_create_atomic_t("failed_load_modules", 0400, mod_debugfs_root, &failed_load_modules);
+	debugfs_create_file("stats", 0400, mod_debugfs_root, mod_debugfs_root, &fops_mod_stats);
+
+	return 0;
+}
+module_init(module_stats_init);
diff --git a/kernel/module/tracking.c b/kernel/module/tracking.c
index 26d812e07615..cbeb9330db9b 100644
--- a/kernel/module/tracking.c
+++ b/kernel/module/tracking.c
@@ -15,6 +15,7 @@
 #include "internal.h"
 
 static LIST_HEAD(unloaded_tainted_modules);
+extern struct dentry *mod_debugfs_root
 
 int try_add_tainted_module(struct module *mod)
 {
@@ -120,12 +121,8 @@ static const struct file_operations unloaded_tainted_modules_fops = {
 
 static int __init unloaded_tainted_modules_init(void)
 {
-	struct dentry *dir;
-
-	dir = debugfs_create_dir("modules", NULL);
-	debugfs_create_file("unloaded_tainted", 0444, dir, NULL,
+	debugfs_create_file("unloaded_tainted", 0444, mod_debugfs_root, NULL,
 			    &unloaded_tainted_modules_fops);
-
 	return 0;
 }
 module_init(unloaded_tainted_modules_init);
-- 
2.39.2





[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux