[RFC PATCH v2 13/25] mm: memlayout+dnuma: add debugfs interface

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Add a debugfs interface to dnuma/memlayout. It keeps track of a
variable backlog of memory layouts, provides some statistics on dnuma
moved pages & cache performance, and allows the setting of a new global
memlayout.

TODO: split out statistics, backlog, & write interfaces from eachother.

Signed-off-by: Cody P Schafer <cody@xxxxxxxxxxxxxxxxxx>
---
 include/linux/dnuma.h     |   2 +-
 include/linux/memlayout.h |   7 +
 mm/Kconfig                |  30 ++++
 mm/Makefile               |   1 +
 mm/dnuma.c                |   4 +-
 mm/memlayout-debugfs.c    | 339 ++++++++++++++++++++++++++++++++++++++++++++++
 mm/memlayout-debugfs.h    |  39 ++++++
 mm/memlayout.c            |  20 ++-
 8 files changed, 436 insertions(+), 6 deletions(-)
 create mode 100644 mm/memlayout-debugfs.c
 create mode 100644 mm/memlayout-debugfs.h

diff --git a/include/linux/dnuma.h b/include/linux/dnuma.h
index 029a984..7a33131 100644
--- a/include/linux/dnuma.h
+++ b/include/linux/dnuma.h
@@ -64,7 +64,7 @@ static inline int dnuma_page_needs_move(struct page *page)
 	return new_nid;
 }
 
-void dnuma_post_free_to_new_zone(struct page *page, int order);
+void dnuma_post_free_to_new_zone(int order);
 void dnuma_prior_free_to_new_zone(struct page *page, int order,
 				  struct zone *dest_zone,
 				  int dest_nid);
diff --git a/include/linux/memlayout.h b/include/linux/memlayout.h
index 6c26c52..14dbf35 100644
--- a/include/linux/memlayout.h
+++ b/include/linux/memlayout.h
@@ -56,6 +56,7 @@ struct memlayout {
 };
 
 extern __rcu struct memlayout *pfn_to_node_map;
+extern struct mutex memlayout_lock; /* update-side lock */
 
 /* FIXME: overflow potential in completion check */
 #define ml_for_each_pfn_in_range(rme, pfn)	\
@@ -90,7 +91,13 @@ static inline struct rangemap_entry *rme_first(struct memlayout *ml)
 	     rme = rme_next(rme))
 
 struct memlayout *memlayout_create(enum memlayout_type);
+
+/*
+ * In most cases, these should only be used by the memlayout debugfs code (or
+ * internally within memlayout)
+ */
 void memlayout_destroy(struct memlayout *ml);
+void memlayout_destroy_mem(struct memlayout *ml);
 
 int memlayout_new_range(struct memlayout *ml,
 		unsigned long pfn_start, unsigned long pfn_end, int nid);
diff --git a/mm/Kconfig b/mm/Kconfig
index 86f0984..3820b3c 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -193,6 +193,36 @@ config DYNAMIC_NUMA
 	 Choose Y if you have are running linux under a hypervisor that uses
 	 this feature, otherwise choose N if unsure.
 
+config DNUMA_DEBUGFS
+	bool "Export DNUMA & memlayout internals via debugfs"
+	depends on DYNAMIC_NUMA
+	help
+	 Export some dynamic numa info via debugfs in <debugfs>/memlayout.
+
+	 Enables the tracking and export of statistics and the exporting of the
+	 current memory layout.
+
+	 If you are not debugging Dynamic NUMA or memlayout, choose N.
+
+config DNUMA_BACKLOG
+	int "Number of old memlayouts to keep (0 = None, -1 = unlimited)"
+	depends on DNUMA_DEBUGFS
+	help
+	 Allows access to old memory layouts & statistics in debugfs.
+
+	 Each memlayout will consume some memory, and when set to -1
+	 (unlimited), this can result in unbounded kernel memory use.
+
+config DNUMA_DEBUGFS_WRITE
+	bool "Change NUMA layout via debugfs"
+	depends on DNUMA_DEBUGFS
+	help
+	 Enable the use of <debugfs>/memlayout/{start,end,node,commit}
+
+	 Write a PFN to 'start' & 'end', then a node id to 'node'.
+	 Repeat this until you are satisfied with your memory layout, then
+	 write '1' to 'commit'.
+
 # eventually, we can have this option just 'select SPARSEMEM'
 config MEMORY_HOTPLUG
 	bool "Allow for memory hot-add"
diff --git a/mm/Makefile b/mm/Makefile
index 82fe7c9b..b07926c 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -59,3 +59,4 @@ obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
 obj-$(CONFIG_CLEANCACHE) += cleancache.o
 obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
 obj-$(CONFIG_DYNAMIC_NUMA) += dnuma.o memlayout.o
+obj-$(CONFIG_DNUMA_DEBUGFS) += memlayout-debugfs.o
diff --git a/mm/dnuma.c b/mm/dnuma.c
index 2ee0903..eb00b7b 100644
--- a/mm/dnuma.c
+++ b/mm/dnuma.c
@@ -11,6 +11,7 @@
 #include <linux/types.h>
 
 #include "internal.h"
+#include "memlayout-debugfs.h"
 
 /* - must be called under lock_memory_hotplug() */
 /* TODO: avoid iterating over all PFNs. */
@@ -117,8 +118,9 @@ static void node_states_set_node(int node, struct memory_notify *arg)
 }
 #endif
 
-void dnuma_post_free_to_new_zone(struct page *page, int order)
+void dnuma_post_free_to_new_zone(int order)
 {
+	ml_stat_count_moved_pages(order);
 }
 
 static void dnuma_prior_return_to_new_zone(struct page *page, int order,
diff --git a/mm/memlayout-debugfs.c b/mm/memlayout-debugfs.c
new file mode 100644
index 0000000..a4fc2cb
--- /dev/null
+++ b/mm/memlayout-debugfs.c
@@ -0,0 +1,339 @@
+#include <linux/debugfs.h>
+
+#include <linux/slab.h> /* kmalloc */
+#include <linux/module.h> /* THIS_MODULE, needed for DEFINE_SIMPLE_ATTR */
+
+#include "memlayout-debugfs.h"
+
+#if CONFIG_DNUMA_BACKLOG > 0
+/* Fixed size backlog */
+#include <linux/kfifo.h>
+#include <linux/log2.h> /* roundup_pow_of_two */
+DEFINE_KFIFO(ml_backlog, struct memlayout *,
+		roundup_pow_of_two(CONFIG_DNUMA_BACKLOG));
+void ml_backlog_feed(struct memlayout *ml)
+{
+	if (kfifo_is_full(&ml_backlog)) {
+		struct memlayout *old_ml;
+		BUG_ON(!kfifo_get(&ml_backlog, &old_ml));
+		memlayout_destroy(old_ml);
+	}
+
+	kfifo_put(&ml_backlog, (const struct memlayout **)&ml);
+}
+#elif CONFIG_DNUMA_BACKLOG < 0
+/* Unlimited backlog */
+void ml_backlog_feed(struct memlayout *ml)
+{
+	/* we never use the rme_tree, so we destroy the non-debugfs portions to
+	 * save memory */
+	memlayout_destroy_mem(ml);
+}
+#else /* CONFIG_DNUMA_BACKLOG == 0 */
+/* No backlog */
+void ml_backlog_feed(struct memlayout *ml)
+{
+	memlayout_destroy(ml);
+}
+#endif
+
+static atomic64_t dnuma_moved_page_ct;
+void ml_stat_count_moved_pages(int order)
+{
+	atomic64_add(1 << order, &dnuma_moved_page_ct);
+}
+
+static atomic_t ml_seq = ATOMIC_INIT(0);
+static struct dentry *root_dentry, *current_dentry;
+#define ML_LAYOUT_NAME_SZ \
+	((size_t)(DIV_ROUND_UP(sizeof(unsigned) * 8, 3) \
+				+ 1 + strlen("layout.")))
+#define ML_REGION_NAME_SZ ((size_t)(2 * BITS_PER_LONG / 4 + 2))
+
+static void ml_layout_name(struct memlayout *ml, char *name)
+{
+	sprintf(name, "layout.%u", ml->seq);
+}
+
+static int dfs_range_get(void *data, u64 *val)
+{
+	*val = (uintptr_t)data;
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(range_fops, dfs_range_get, NULL, "%lld\n");
+
+static void _ml_dbgfs_create_range(struct dentry *base,
+		struct rangemap_entry *rme, char *name)
+{
+	struct dentry *rd;
+	sprintf(name, "%05lx-%05lx", rme->pfn_start, rme->pfn_end);
+	rd = debugfs_create_file(name, 0400, base,
+				(void *)(uintptr_t)rme->nid, &range_fops);
+	if (!rd)
+		pr_devel("debugfs: failed to create "RME_FMT"\n",
+				RME_EXP(rme));
+	else
+		pr_devel("debugfs: created "RME_FMT"\n", RME_EXP(rme));
+}
+
+/* Must be called with memlayout_lock held */
+static void _ml_dbgfs_set_current(struct memlayout *ml, char *name)
+{
+	ml_layout_name(ml, name);
+	debugfs_remove(current_dentry);
+	current_dentry = debugfs_create_symlink("current", root_dentry, name);
+}
+
+static void ml_dbgfs_create_layout_assume_root(struct memlayout *ml)
+{
+	char name[ML_LAYOUT_NAME_SZ];
+	ml_layout_name(ml, name);
+	WARN_ON(!root_dentry);
+	ml->d = debugfs_create_dir(name, root_dentry);
+	WARN_ON(!ml->d);
+}
+
+# if defined(CONFIG_DNUMA_DEBUGFS_WRITE)
+
+#define DEFINE_DEBUGFS_GET(___type)					\
+	static int debugfs_## ___type ## _get(void *data, u64 *val)	\
+	{								\
+		*val = *(___type *)data;				\
+		return 0;						\
+	}
+
+DEFINE_DEBUGFS_GET(u32);
+DEFINE_DEBUGFS_GET(u8);
+
+#define DEFINE_WATCHED_ATTR(___type, ___var)			\
+	static int ___var ## _watch_set(void *data, u64 val)	\
+	{							\
+		___type old_val = *(___type *)data;		\
+		int ret = ___var ## _watch(old_val, val);	\
+		if (!ret)					\
+			*(___type *)data = val;			\
+		return ret;					\
+	}							\
+	DEFINE_SIMPLE_ATTRIBUTE(___var ## _fops,		\
+			debugfs_ ## ___type ## _get,		\
+			___var ## _watch_set, "%llu\n");
+
+#define DEFINE_ACTION_ATTR(___name)
+
+static u64 dnuma_user_start;
+static u64 dnuma_user_end;
+static u32 dnuma_user_node; /* XXX: I don't care about this var, remove? */
+static u8  dnuma_user_commit, dnuma_user_clear; /* same here */
+static struct memlayout *user_ml;
+static DEFINE_MUTEX(dnuma_user_lock);
+static int dnuma_user_node_watch(u32 old_val, u32 new_val)
+{
+	int ret = 0;
+	mutex_lock(&dnuma_user_lock);
+	if (!user_ml)
+		user_ml = memlayout_create(ML_USER_DEBUG);
+
+	if (WARN_ON(!user_ml)) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (new_val >= nr_node_ids) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (dnuma_user_start > dnuma_user_end) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = memlayout_new_range(user_ml, dnuma_user_start, dnuma_user_end,
+				  new_val);
+
+	if (!ret) {
+		dnuma_user_start = 0;
+		dnuma_user_end = 0;
+	}
+out:
+	mutex_unlock(&dnuma_user_lock);
+	return ret;
+}
+
+static int dnuma_user_commit_watch(u8 old_val, u8 new_val)
+{
+	mutex_lock(&dnuma_user_lock);
+	if (user_ml)
+		memlayout_commit(user_ml);
+	user_ml = NULL;
+	mutex_unlock(&dnuma_user_lock);
+	return 0;
+}
+
+static int dnuma_user_clear_watch(u8 old_val, u8 new_val)
+{
+	mutex_lock(&dnuma_user_lock);
+	if (user_ml)
+		memlayout_destroy(user_ml);
+	user_ml = NULL;
+	mutex_unlock(&dnuma_user_lock);
+	return 0;
+}
+
+DEFINE_WATCHED_ATTR(u32, dnuma_user_node);
+DEFINE_WATCHED_ATTR(u8, dnuma_user_commit);
+DEFINE_WATCHED_ATTR(u8, dnuma_user_clear);
+# endif /* defined(CONFIG_DNUMA_DEBUGFS_WRITE) */
+
+/* create the entire current memlayout.
+ * only used for the layout which exsists prior to fs initialization
+ */
+static void ml_dbgfs_create_initial_layout(void)
+{
+	struct rangemap_entry *rme;
+	char name[max(ML_REGION_NAME_SZ, ML_LAYOUT_NAME_SZ)];
+	struct memlayout *old_ml, *new_ml;
+
+	new_ml = kmalloc(sizeof(*new_ml), GFP_KERNEL);
+	if (WARN(!new_ml, "memlayout allocation failed\n"))
+		return;
+
+	mutex_lock(&memlayout_lock);
+
+	old_ml = rcu_dereference_protected(pfn_to_node_map,
+			mutex_is_locked(&memlayout_lock));
+	if (WARN_ON(!old_ml))
+		goto e_out;
+	*new_ml = *old_ml;
+
+	if (WARN_ON(new_ml->d))
+		goto e_out;
+
+	/* this assumption holds as ml_dbgfs_create_initial_layout() (this
+	 * function) is only called by ml_dbgfs_create_root() */
+	ml_dbgfs_create_layout_assume_root(new_ml);
+	if (!new_ml->d)
+		goto e_out;
+
+	ml_for_each_range(new_ml, rme) {
+		_ml_dbgfs_create_range(new_ml->d, rme, name);
+	}
+
+	_ml_dbgfs_set_current(new_ml, name);
+	rcu_assign_pointer(pfn_to_node_map, new_ml);
+	mutex_unlock(&memlayout_lock);
+
+	synchronize_rcu();
+	kfree(old_ml);
+	return;
+e_out:
+	mutex_unlock(&memlayout_lock);
+	kfree(new_ml);
+}
+
+static atomic64_t ml_cache_hits;
+static atomic64_t ml_cache_misses;
+
+void ml_stat_cache_miss(void)
+{
+	atomic64_inc(&ml_cache_misses);
+}
+
+void ml_stat_cache_hit(void)
+{
+	atomic64_inc(&ml_cache_hits);
+}
+
+/* returns 0 if root_dentry has been created */
+static int ml_dbgfs_create_root(void)
+{
+	if (root_dentry)
+		return 0;
+
+	if (!debugfs_initialized()) {
+		pr_devel("debugfs not registered or disabled.\n");
+		return -EINVAL;
+	}
+
+	root_dentry = debugfs_create_dir("memlayout", NULL);
+	if (!root_dentry) {
+		pr_devel("root dir creation failed\n");
+		return -EINVAL;
+	}
+
+	/* TODO: place in a different dir? (to keep memlayout & dnuma seperate)
+	 */
+	/* FIXME: use debugfs_create_atomic64() [does not yet exsist]. */
+	debugfs_create_u64("moved-pages", 0400, root_dentry,
+			   (uint64_t *)&dnuma_moved_page_ct.counter);
+	debugfs_create_u64("pfn-lookup-cache-misses", 0400, root_dentry,
+			   (uint64_t *)&ml_cache_misses.counter);
+	debugfs_create_u64("pfn-lookup-cache-hits", 0400, root_dentry,
+			   (uint64_t *)&ml_cache_hits.counter);
+
+# if defined(CONFIG_DNUMA_DEBUGFS_WRITE)
+	/* Set node last: on write, it adds the range. */
+	debugfs_create_x64("start", 0600, root_dentry, &dnuma_user_start);
+	debugfs_create_x64("end",   0600, root_dentry, &dnuma_user_end);
+	debugfs_create_file("node",  0200, root_dentry,
+			&dnuma_user_node, &dnuma_user_node_fops);
+	debugfs_create_file("commit",  0200, root_dentry,
+			&dnuma_user_commit, &dnuma_user_commit_fops);
+	debugfs_create_file("clear",  0200, root_dentry,
+			&dnuma_user_clear, &dnuma_user_clear_fops);
+# endif
+
+	/* uses root_dentry */
+	ml_dbgfs_create_initial_layout();
+
+	return 0;
+}
+
+static void ml_dbgfs_create_layout(struct memlayout *ml)
+{
+	if (ml_dbgfs_create_root()) {
+		ml->d = NULL;
+		return;
+	}
+	ml_dbgfs_create_layout_assume_root(ml);
+}
+
+static int ml_dbgfs_init_root(void)
+{
+	ml_dbgfs_create_root();
+	return 0;
+}
+
+void ml_dbgfs_init(struct memlayout *ml)
+{
+	ml->seq = atomic_inc_return(&ml_seq) - 1;
+	ml_dbgfs_create_layout(ml);
+}
+
+void ml_dbgfs_create_range(struct memlayout *ml, struct rangemap_entry *rme)
+{
+	char name[ML_REGION_NAME_SZ];
+	if (ml->d)
+		_ml_dbgfs_create_range(ml->d, rme, name);
+}
+
+void ml_dbgfs_set_current(struct memlayout *ml)
+{
+	char name[ML_LAYOUT_NAME_SZ];
+	_ml_dbgfs_set_current(ml, name);
+}
+
+void ml_destroy_dbgfs(struct memlayout *ml)
+{
+	if (ml && ml->d)
+		debugfs_remove_recursive(ml->d);
+}
+
+static void __exit ml_dbgfs_exit(void)
+{
+	debugfs_remove_recursive(root_dentry);
+	root_dentry = NULL;
+}
+
+module_init(ml_dbgfs_init_root);
+module_exit(ml_dbgfs_exit);
diff --git a/mm/memlayout-debugfs.h b/mm/memlayout-debugfs.h
new file mode 100644
index 0000000..12dc1eb
--- /dev/null
+++ b/mm/memlayout-debugfs.h
@@ -0,0 +1,39 @@
+#ifndef LINUX_MM_MEMLAYOUT_DEBUGFS_H_
+#define LINUX_MM_MEMLAYOUT_DEBUGFS_H_
+
+#include <linux/memlayout.h>
+
+#ifdef CONFIG_DNUMA_DEBUGFS
+void ml_stat_count_moved_pages(int order);
+void ml_stat_cache_hit(void);
+void ml_stat_cache_miss(void);
+void ml_dbgfs_init(struct memlayout *ml);
+void ml_dbgfs_create_range(struct memlayout *ml, struct rangemap_entry *rme);
+void ml_destroy_dbgfs(struct memlayout *ml);
+void ml_dbgfs_set_current(struct memlayout *ml);
+void ml_backlog_feed(struct memlayout *ml);
+#else /* !defined(CONFIG_DNUMA_DEBUGFS) */
+static inline void ml_stat_count_moved_pages(int order)
+{}
+static inline void ml_stat_cache_hit(void)
+{}
+static inline void ml_stat_cache_miss(void)
+{}
+
+static inline void ml_dbgfs_init(struct memlayout *ml)
+{}
+static inline void ml_dbgfs_create_range(struct memlayout *ml,
+		struct rangemap_entry *rme)
+{}
+static inline void ml_destroy_dbgfs(struct memlayout *ml)
+{}
+static inline void ml_dbgfs_set_current(struct memlayout *ml)
+{}
+
+static inline void ml_backlog_feed(struct memlayout *ml)
+{
+	memlayout_destroy(ml);
+}
+#endif
+
+#endif
diff --git a/mm/memlayout.c b/mm/memlayout.c
index 7d2905b..45e7df6 100644
--- a/mm/memlayout.c
+++ b/mm/memlayout.c
@@ -14,6 +14,8 @@
 #include <linux/rcupdate.h>
 #include <linux/slab.h>
 
+#include "memlayout-debugfs.h"
+
 /* protected by memlayout_lock */
 __rcu struct memlayout *pfn_to_node_map;
 DEFINE_MUTEX(memlayout_lock);
@@ -26,7 +28,7 @@ static void free_rme_tree(struct rb_root *root)
 	}
 }
 
-static void ml_destroy_mem(struct memlayout *ml)
+void memlayout_destroy_mem(struct memlayout *ml)
 {
 	if (!ml)
 		return;
@@ -88,6 +90,8 @@ int memlayout_new_range(struct memlayout *ml, unsigned long pfn_start,
 
 	rb_link_node(&rme->node, parent, new);
 	rb_insert_color(&rme->node, &ml->root);
+
+	ml_dbgfs_create_range(ml, rme);
 	return 0;
 }
 
@@ -104,9 +108,12 @@ int memlayout_pfn_to_nid(unsigned long pfn)
 	rme = ACCESS_ONCE(ml->cache);
 	if (rme && rme_bounds_pfn(rme, pfn)) {
 		rcu_read_unlock();
+		ml_stat_cache_hit();
 		return rme->nid;
 	}
 
+	ml_stat_cache_miss();
+
 	node = ml->root.rb_node;
 	while (node) {
 		struct rangemap_entry *rme = rb_entry(node, typeof(*rme), node);
@@ -135,7 +142,8 @@ out:
 
 void memlayout_destroy(struct memlayout *ml)
 {
-	ml_destroy_mem(ml);
+	ml_destroy_dbgfs(ml);
+	memlayout_destroy_mem(ml);
 }
 
 struct memlayout *memlayout_create(enum memlayout_type type)
@@ -153,6 +161,7 @@ struct memlayout *memlayout_create(enum memlayout_type type)
 	ml->type = type;
 	ml->cache = NULL;
 
+	ml_dbgfs_init(ml);
 	return ml;
 }
 
@@ -163,12 +172,12 @@ void memlayout_commit(struct memlayout *ml)
 	if (ml->type == ML_INITIAL) {
 		if (WARN(dnuma_has_memlayout(),
 				"memlayout marked first is not first, ignoring.\n")) {
-			memlayout_destroy(ml);
 			ml_backlog_feed(ml);
 			return;
 		}
 
 		mutex_lock(&memlayout_lock);
+		ml_dbgfs_set_current(ml);
 		rcu_assign_pointer(pfn_to_node_map, ml);
 		mutex_unlock(&memlayout_lock);
 		return;
@@ -179,13 +188,16 @@ void memlayout_commit(struct memlayout *ml)
 	unlock_memory_hotplug();
 
 	mutex_lock(&memlayout_lock);
+
+	ml_dbgfs_set_current(ml);
+
 	old_ml = rcu_dereference_protected(pfn_to_node_map,
 			mutex_is_locked(&memlayout_lock));
 
 	rcu_assign_pointer(pfn_to_node_map, ml);
 
 	synchronize_rcu();
-	memlayout_destroy(old_ml);
+	ml_backlog_feed(old_ml);
 
 	/* Must be called only after the new value for pfn_to_node_map has
 	 * propogated to all tasks, otherwise some pages may lookup the old
-- 
1.8.2.1

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>




[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]