Add a debugfs interface to dnuma/memlayout. It keeps track of a variable backlog of memory layouts, provides some statistics on dnuma moved pages & cache performance, and allows the setting of a new global memlayout. TODO: split out statistics, backlog, & write interfaces from each other. Signed-off-by: Cody P Schafer <cody@xxxxxxxxxxxxxxxxxx> --- include/linux/dnuma.h | 2 +- include/linux/memlayout.h | 7 + mm/Kconfig | 30 ++++ mm/Makefile | 1 + mm/dnuma.c | 4 +- mm/memlayout-debugfs.c | 339 ++++++++++++++++++++++++++++++++++++++++++++++ mm/memlayout-debugfs.h | 39 ++++++ mm/memlayout.c | 23 +++- 8 files changed, 438 insertions(+), 7 deletions(-) create mode 100644 mm/memlayout-debugfs.c create mode 100644 mm/memlayout-debugfs.h diff --git a/include/linux/dnuma.h b/include/linux/dnuma.h index 029a984..7a33131 100644 --- a/include/linux/dnuma.h +++ b/include/linux/dnuma.h @@ -64,7 +64,7 @@ static inline int dnuma_page_needs_move(struct page *page) return new_nid; } -void dnuma_post_free_to_new_zone(struct page *page, int order); +void dnuma_post_free_to_new_zone(int order); void dnuma_prior_free_to_new_zone(struct page *page, int order, struct zone *dest_zone, int dest_nid); diff --git a/include/linux/memlayout.h b/include/linux/memlayout.h index adab685..c09ecdb 100644 --- a/include/linux/memlayout.h +++ b/include/linux/memlayout.h @@ -56,6 +56,7 @@ struct memlayout { }; extern __rcu struct memlayout *pfn_to_node_map; +extern struct mutex memlayout_lock; /* update-side lock */ /* FIXME: overflow potential in completion check */ #define ml_for_each_pfn_in_range(rme, pfn) \ @@ -90,7 +91,13 @@ static inline struct rangemap_entry *rme_first(struct memlayout *ml) rme = rme_next(rme)) struct memlayout *memlayout_create(enum memlayout_type); + +/* + * In most cases, these should only be used by the memlayout debugfs code (or + * internally within memlayout) + */ void memlayout_destroy(struct memlayout *ml); +void memlayout_destroy_mem(struct memlayout *ml); int 
memlayout_new_range(struct memlayout *ml, unsigned long pfn_start, unsigned long pfn_end, int nid); diff --git a/mm/Kconfig b/mm/Kconfig index bfbe300..3ddf6e3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -193,6 +193,36 @@ config DYNAMIC_NUMA Choose Y if you have are running linux under a hypervisor that uses this feature, otherwise choose N if unsure. +config DNUMA_DEBUGFS + bool "Export DNUMA & memlayout internals via debugfs" + depends on DYNAMIC_NUMA + help + Export some dynamic numa info via debugfs in <debugfs>/memlayout. + + Enables the tracking and export of statistics and the exporting of the + current memory layout. + + If you are not debugging Dynamic NUMA or memlayout, choose N. + +config DNUMA_BACKLOG + int "Number of old memlayouts to keep (0 = None, -1 = unlimited)" + depends on DNUMA_DEBUGFS + help + Allows access to old memory layouts & statistics in debugfs. + + Each memlayout will consume some memory, and when set to -1 + (unlimited), this can result in unbounded kernel memory use. + +config DNUMA_DEBUGFS_WRITE + bool "Change NUMA layout via debugfs" + depends on DNUMA_DEBUGFS + help + Enable the use of <debugfs>/memlayout/{start,end,node,commit} + + Write a PFN to 'start' & 'end', then a node id to 'node'. + Repeat this until you are satisfied with your memory layout, then + write '1' to 'commit'. 
+ # eventually, we can have this option just 'select SPARSEMEM' config MEMORY_HOTPLUG bool "Allow for memory hot-add" diff --git a/mm/Makefile b/mm/Makefile index c538e1e..7ce2b26 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -59,3 +59,4 @@ obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o obj-$(CONFIG_CLEANCACHE) += cleancache.o obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o obj-$(CONFIG_DYNAMIC_NUMA) += dnuma.o memlayout.o +obj-$(CONFIG_DNUMA_DEBUGFS) += memlayout-debugfs.o diff --git a/mm/dnuma.c b/mm/dnuma.c index 2b6e13e..7ee77a0 100644 --- a/mm/dnuma.c +++ b/mm/dnuma.c @@ -11,6 +11,7 @@ #include <linux/types.h> #include "internal.h" +#include "memlayout-debugfs.h" /* - must be called under lock_memory_hotplug() */ /* TODO: avoid iterating over all PFNs. */ @@ -113,8 +114,9 @@ static void node_states_set_node(int node, struct memory_notify *arg) } #endif -void dnuma_post_free_to_new_zone(struct page *page, int order) +void dnuma_post_free_to_new_zone(int order) { + ml_stat_count_moved_pages(order); } static void dnuma_prior_return_to_new_zone(struct page *page, int order, diff --git a/mm/memlayout-debugfs.c b/mm/memlayout-debugfs.c new file mode 100644 index 0000000..a4fc2cb --- /dev/null +++ b/mm/memlayout-debugfs.c @@ -0,0 +1,339 @@ +#include <linux/debugfs.h> + +#include <linux/slab.h> /* kmalloc */ +#include <linux/module.h> /* THIS_MODULE, needed for DEFINE_SIMPLE_ATTR */ + +#include "memlayout-debugfs.h" + +#if CONFIG_DNUMA_BACKLOG > 0 +/* Fixed size backlog */ +#include <linux/kfifo.h> +#include <linux/log2.h> /* roundup_pow_of_two */ +DEFINE_KFIFO(ml_backlog, struct memlayout *, + roundup_pow_of_two(CONFIG_DNUMA_BACKLOG)); +void ml_backlog_feed(struct memlayout *ml) +{ + if (kfifo_is_full(&ml_backlog)) { + struct memlayout *old_ml; + BUG_ON(!kfifo_get(&ml_backlog, &old_ml)); + memlayout_destroy(old_ml); + } + + kfifo_put(&ml_backlog, (const struct memlayout **)&ml); +} +#elif CONFIG_DNUMA_BACKLOG < 0 +/* Unlimited backlog */ +void 
ml_backlog_feed(struct memlayout *ml) +{ + /* we never use the rme_tree, so we destroy the non-debugfs portions to + * save memory */ + memlayout_destroy_mem(ml); +} +#else /* CONFIG_DNUMA_BACKLOG == 0 */ +/* No backlog */ +void ml_backlog_feed(struct memlayout *ml) +{ + memlayout_destroy(ml); +} +#endif + +static atomic64_t dnuma_moved_page_ct; +void ml_stat_count_moved_pages(int order) +{ + atomic64_add(1 << order, &dnuma_moved_page_ct); +} + +static atomic_t ml_seq = ATOMIC_INIT(0); +static struct dentry *root_dentry, *current_dentry; +#define ML_LAYOUT_NAME_SZ \ + ((size_t)(DIV_ROUND_UP(sizeof(unsigned) * 8, 3) \ + + 1 + strlen("layout."))) +#define ML_REGION_NAME_SZ ((size_t)(2 * BITS_PER_LONG / 4 + 2)) + +static void ml_layout_name(struct memlayout *ml, char *name) +{ + sprintf(name, "layout.%u", ml->seq); +} + +static int dfs_range_get(void *data, u64 *val) +{ + *val = (uintptr_t)data; + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(range_fops, dfs_range_get, NULL, "%lld\n"); + +static void _ml_dbgfs_create_range(struct dentry *base, + struct rangemap_entry *rme, char *name) +{ + struct dentry *rd; + sprintf(name, "%05lx-%05lx", rme->pfn_start, rme->pfn_end); + rd = debugfs_create_file(name, 0400, base, + (void *)(uintptr_t)rme->nid, &range_fops); + if (!rd) + pr_devel("debugfs: failed to create "RME_FMT"\n", + RME_EXP(rme)); + else + pr_devel("debugfs: created "RME_FMT"\n", RME_EXP(rme)); +} + +/* Must be called with memlayout_lock held */ +static void _ml_dbgfs_set_current(struct memlayout *ml, char *name) +{ + ml_layout_name(ml, name); + debugfs_remove(current_dentry); + current_dentry = debugfs_create_symlink("current", root_dentry, name); +} + +static void ml_dbgfs_create_layout_assume_root(struct memlayout *ml) +{ + char name[ML_LAYOUT_NAME_SZ]; + ml_layout_name(ml, name); + WARN_ON(!root_dentry); + ml->d = debugfs_create_dir(name, root_dentry); + WARN_ON(!ml->d); +} + +# if defined(CONFIG_DNUMA_DEBUGFS_WRITE) + +#define DEFINE_DEBUGFS_GET(___type) \ + static 
int debugfs_## ___type ## _get(void *data, u64 *val) \ + { \ + *val = *(___type *)data; \ + return 0; \ + } + +DEFINE_DEBUGFS_GET(u32); +DEFINE_DEBUGFS_GET(u8); + +#define DEFINE_WATCHED_ATTR(___type, ___var) \ + static int ___var ## _watch_set(void *data, u64 val) \ + { \ + ___type old_val = *(___type *)data; \ + int ret = ___var ## _watch(old_val, val); \ + if (!ret) \ + *(___type *)data = val; \ + return ret; \ + } \ + DEFINE_SIMPLE_ATTRIBUTE(___var ## _fops, \ + debugfs_ ## ___type ## _get, \ + ___var ## _watch_set, "%llu\n"); + +#define DEFINE_ACTION_ATTR(___name) + +static u64 dnuma_user_start; +static u64 dnuma_user_end; +static u32 dnuma_user_node; /* XXX: I don't care about this var, remove? */ +static u8 dnuma_user_commit, dnuma_user_clear; /* same here */ +static struct memlayout *user_ml; +static DEFINE_MUTEX(dnuma_user_lock); +static int dnuma_user_node_watch(u32 old_val, u32 new_val) +{ + int ret = 0; + mutex_lock(&dnuma_user_lock); + if (!user_ml) + user_ml = memlayout_create(ML_USER_DEBUG); + + if (WARN_ON(!user_ml)) { + ret = -ENOMEM; + goto out; + } + + if (new_val >= nr_node_ids) { + ret = -EINVAL; + goto out; + } + + if (dnuma_user_start > dnuma_user_end) { + ret = -EINVAL; + goto out; + } + + ret = memlayout_new_range(user_ml, dnuma_user_start, dnuma_user_end, + new_val); + + if (!ret) { + dnuma_user_start = 0; + dnuma_user_end = 0; + } +out: + mutex_unlock(&dnuma_user_lock); + return ret; +} + +static int dnuma_user_commit_watch(u8 old_val, u8 new_val) +{ + mutex_lock(&dnuma_user_lock); + if (user_ml) + memlayout_commit(user_ml); + user_ml = NULL; + mutex_unlock(&dnuma_user_lock); + return 0; +} + +static int dnuma_user_clear_watch(u8 old_val, u8 new_val) +{ + mutex_lock(&dnuma_user_lock); + if (user_ml) + memlayout_destroy(user_ml); + user_ml = NULL; + mutex_unlock(&dnuma_user_lock); + return 0; +} + +DEFINE_WATCHED_ATTR(u32, dnuma_user_node); +DEFINE_WATCHED_ATTR(u8, dnuma_user_commit); +DEFINE_WATCHED_ATTR(u8, dnuma_user_clear); +# endif /* 
defined(CONFIG_DNUMA_DEBUGFS_WRITE) */ + +/* create the entire current memlayout. + * only used for the layout which exsists prior to fs initialization + */ +static void ml_dbgfs_create_initial_layout(void) +{ + struct rangemap_entry *rme; + char name[max(ML_REGION_NAME_SZ, ML_LAYOUT_NAME_SZ)]; + struct memlayout *old_ml, *new_ml; + + new_ml = kmalloc(sizeof(*new_ml), GFP_KERNEL); + if (WARN(!new_ml, "memlayout allocation failed\n")) + return; + + mutex_lock(&memlayout_lock); + + old_ml = rcu_dereference_protected(pfn_to_node_map, + mutex_is_locked(&memlayout_lock)); + if (WARN_ON(!old_ml)) + goto e_out; + *new_ml = *old_ml; + + if (WARN_ON(new_ml->d)) + goto e_out; + + /* this assumption holds as ml_dbgfs_create_initial_layout() (this + * function) is only called by ml_dbgfs_create_root() */ + ml_dbgfs_create_layout_assume_root(new_ml); + if (!new_ml->d) + goto e_out; + + ml_for_each_range(new_ml, rme) { + _ml_dbgfs_create_range(new_ml->d, rme, name); + } + + _ml_dbgfs_set_current(new_ml, name); + rcu_assign_pointer(pfn_to_node_map, new_ml); + mutex_unlock(&memlayout_lock); + + synchronize_rcu(); + kfree(old_ml); + return; +e_out: + mutex_unlock(&memlayout_lock); + kfree(new_ml); +} + +static atomic64_t ml_cache_hits; +static atomic64_t ml_cache_misses; + +void ml_stat_cache_miss(void) +{ + atomic64_inc(&ml_cache_misses); +} + +void ml_stat_cache_hit(void) +{ + atomic64_inc(&ml_cache_hits); +} + +/* returns 0 if root_dentry has been created */ +static int ml_dbgfs_create_root(void) +{ + if (root_dentry) + return 0; + + if (!debugfs_initialized()) { + pr_devel("debugfs not registered or disabled.\n"); + return -EINVAL; + } + + root_dentry = debugfs_create_dir("memlayout", NULL); + if (!root_dentry) { + pr_devel("root dir creation failed\n"); + return -EINVAL; + } + + /* TODO: place in a different dir? (to keep memlayout & dnuma seperate) + */ + /* FIXME: use debugfs_create_atomic64() [does not yet exsist]. 
*/ + debugfs_create_u64("moved-pages", 0400, root_dentry, + (uint64_t *)&dnuma_moved_page_ct.counter); + debugfs_create_u64("pfn-lookup-cache-misses", 0400, root_dentry, + (uint64_t *)&ml_cache_misses.counter); + debugfs_create_u64("pfn-lookup-cache-hits", 0400, root_dentry, + (uint64_t *)&ml_cache_hits.counter); + +# if defined(CONFIG_DNUMA_DEBUGFS_WRITE) + /* Set node last: on write, it adds the range. */ + debugfs_create_x64("start", 0600, root_dentry, &dnuma_user_start); + debugfs_create_x64("end", 0600, root_dentry, &dnuma_user_end); + debugfs_create_file("node", 0200, root_dentry, + &dnuma_user_node, &dnuma_user_node_fops); + debugfs_create_file("commit", 0200, root_dentry, + &dnuma_user_commit, &dnuma_user_commit_fops); + debugfs_create_file("clear", 0200, root_dentry, + &dnuma_user_clear, &dnuma_user_clear_fops); +# endif + + /* uses root_dentry */ + ml_dbgfs_create_initial_layout(); + + return 0; +} + +static void ml_dbgfs_create_layout(struct memlayout *ml) +{ + if (ml_dbgfs_create_root()) { + ml->d = NULL; + return; + } + ml_dbgfs_create_layout_assume_root(ml); +} + +static int ml_dbgfs_init_root(void) +{ + ml_dbgfs_create_root(); + return 0; +} + +void ml_dbgfs_init(struct memlayout *ml) +{ + ml->seq = atomic_inc_return(&ml_seq) - 1; + ml_dbgfs_create_layout(ml); +} + +void ml_dbgfs_create_range(struct memlayout *ml, struct rangemap_entry *rme) +{ + char name[ML_REGION_NAME_SZ]; + if (ml->d) + _ml_dbgfs_create_range(ml->d, rme, name); +} + +void ml_dbgfs_set_current(struct memlayout *ml) +{ + char name[ML_LAYOUT_NAME_SZ]; + _ml_dbgfs_set_current(ml, name); +} + +void ml_destroy_dbgfs(struct memlayout *ml) +{ + if (ml && ml->d) + debugfs_remove_recursive(ml->d); +} + +static void __exit ml_dbgfs_exit(void) +{ + debugfs_remove_recursive(root_dentry); + root_dentry = NULL; +} + +module_init(ml_dbgfs_init_root); +module_exit(ml_dbgfs_exit); diff --git a/mm/memlayout-debugfs.h b/mm/memlayout-debugfs.h new file mode 100644 index 0000000..12dc1eb --- /dev/null 
+++ b/mm/memlayout-debugfs.h @@ -0,0 +1,39 @@ +#ifndef LINUX_MM_MEMLAYOUT_DEBUGFS_H_ +#define LINUX_MM_MEMLAYOUT_DEBUGFS_H_ + +#include <linux/memlayout.h> + +#ifdef CONFIG_DNUMA_DEBUGFS +void ml_stat_count_moved_pages(int order); +void ml_stat_cache_hit(void); +void ml_stat_cache_miss(void); +void ml_dbgfs_init(struct memlayout *ml); +void ml_dbgfs_create_range(struct memlayout *ml, struct rangemap_entry *rme); +void ml_destroy_dbgfs(struct memlayout *ml); +void ml_dbgfs_set_current(struct memlayout *ml); +void ml_backlog_feed(struct memlayout *ml); +#else /* !defined(CONFIG_DNUMA_DEBUGFS) */ +static inline void ml_stat_count_moved_pages(int order) +{} +static inline void ml_stat_cache_hit(void) +{} +static inline void ml_stat_cache_miss(void) +{} + +static inline void ml_dbgfs_init(struct memlayout *ml) +{} +static inline void ml_dbgfs_create_range(struct memlayout *ml, + struct rangemap_entry *rme) +{} +static inline void ml_destroy_dbgfs(struct memlayout *ml) +{} +static inline void ml_dbgfs_set_current(struct memlayout *ml) +{} + +static inline void ml_backlog_feed(struct memlayout *ml) +{ + memlayout_destroy(ml); +} +#endif + +#endif diff --git a/mm/memlayout.c b/mm/memlayout.c index 132dbff..0a1a602 100644 --- a/mm/memlayout.c +++ b/mm/memlayout.c @@ -14,6 +14,8 @@ #include <linux/rcupdate.h> #include <linux/slab.h> +#include "memlayout-debugfs.h" + /* protected by memlayout_lock */ __rcu struct memlayout *pfn_to_node_map; DEFINE_MUTEX(memlayout_lock); @@ -26,7 +28,7 @@ static void free_rme_tree(struct rb_root *root) } } -static void ml_destroy_mem(struct memlayout *ml) +void memlayout_destroy_mem(struct memlayout *ml) { if (!ml) return; @@ -88,6 +90,8 @@ int memlayout_new_range(struct memlayout *ml, unsigned long pfn_start, rb_link_node(&rme->node, parent, new); rb_insert_color(&rme->node, &ml->root); + + ml_dbgfs_create_range(ml, rme); return 0; } @@ -114,8 +118,12 @@ struct rangemap_entry *memlayout_pfn_to_rme_higher(struct memlayout *ml, unsigne rme = 
ACCESS_ONCE(ml->cache); smp_read_barrier_depends(); - if (rme && rme_bounds_pfn(rme, pfn)) + if (rme && rme_bounds_pfn(rme, pfn)) { + ml_stat_cache_hit(); return rme; + } + + ml_stat_cache_miss(); node = ml->root.rb_node; while (node) { @@ -217,7 +225,8 @@ static void memlayout_expand(struct memlayout *ml) void memlayout_destroy(struct memlayout *ml) { - ml_destroy_mem(ml); + ml_destroy_dbgfs(ml); + memlayout_destroy_mem(ml); } struct memlayout *memlayout_create(enum memlayout_type type) @@ -235,6 +244,7 @@ struct memlayout *memlayout_create(enum memlayout_type type) ml->type = type; ml->cache = NULL; + ml_dbgfs_init(ml); return ml; } @@ -246,12 +256,12 @@ void memlayout_commit(struct memlayout *ml) if (ml->type == ML_INITIAL) { if (WARN(dnuma_has_memlayout(), "memlayout marked first is not first, ignoring.\n")) { - memlayout_destroy(ml); ml_backlog_feed(ml); return; } mutex_lock(&memlayout_lock); + ml_dbgfs_set_current(ml); rcu_assign_pointer(pfn_to_node_map, ml); mutex_unlock(&memlayout_lock); return; @@ -264,13 +274,16 @@ void memlayout_commit(struct memlayout *ml) unlock_memory_hotplug(); mutex_lock(&memlayout_lock); + + ml_dbgfs_set_current(ml); + old_ml = rcu_dereference_protected(pfn_to_node_map, mutex_is_locked(&memlayout_lock)); rcu_assign_pointer(pfn_to_node_map, ml); synchronize_rcu(); - memlayout_destroy(old_ml); + ml_backlog_feed(old_ml); /* Must be called only after the new value for pfn_to_node_map has * propogated to all tasks, otherwise some pages may lookup the old -- 1.8.2.2 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>