On Mon, May 23, 2022 at 11:24:10AM -0700, Roman Gushchin wrote: > On Sun, May 22, 2022 at 06:36:56PM +0800, Muchun Song wrote: > > On Mon, May 09, 2022 at 11:38:16AM -0700, Roman Gushchin wrote: > > > This commit introduces the /sys/kernel/debug/shrinker debugfs > > > interface which provides an ability to observe the state of > > > individual kernel memory shrinkers. > > > > > > Because the feature adds some memory overhead (which shouldn't be > > > large unless there is a huge amount of registered shrinkers), it's > > > guarded by a config option (enabled by default). > > > > > > This commit introduces the "count" interface for each shrinker > > > registered in the system. > > > > > > The output is in the following format: > > > > Hi Roman, > > Hi Muchun! > > Thank you for taking a look! > > > > > Should we print a title to show what those numbers mean? In this case, > > it is more understandable. > > No, I don't think so: this interface is not supposed to be used by > an average user and those who will be using it can refer to the provided > documentation. Printing the header each time will add some overhead for > no good reason. > Got it. Makes sense. > > > <cgroup inode id> <nr of objects on node 0> <nr of objects on node 1>... > > > <cgroup inode id> <nr of objects on node 0> <nr of objects on node 1>... > > > ... > > > > > > To reduce the size of output on machines with many thousands of cgroups, > > > if the total number of objects on all nodes is 0, the line is omitted. > > > > > > If the shrinker is not memcg-aware or CONFIG_MEMCG is off, 0 is > > > printed as cgroup inode id. If the shrinker is not numa-aware, 0's are > > > printed for all nodes except the first one. > > > > > > This commit gives debugfs entries simple numeric names, which are not > > > very convenient. The following commit in the series will provide > > > shrinkers with more meaningful names. 
> > > > > > Signed-off-by: Roman Gushchin <roman.gushchin@xxxxxxxxx> > > > --- > > > include/linux/shrinker.h | 19 ++++- > > > lib/Kconfig.debug | 9 +++ > > > mm/Makefile | 1 + > > > mm/shrinker_debug.c | 171 +++++++++++++++++++++++++++++++++++++++ > > > mm/vmscan.c | 6 +- > > > 5 files changed, 203 insertions(+), 3 deletions(-) > > > create mode 100644 mm/shrinker_debug.c > > > > > > diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h > > > index 76fbf92b04d9..2ced8149c513 100644 > > > --- a/include/linux/shrinker.h > > > +++ b/include/linux/shrinker.h > > > @@ -72,6 +72,10 @@ struct shrinker { > > > #ifdef CONFIG_MEMCG > > > /* ID in shrinker_idr */ > > > int id; > > > +#endif > > > +#ifdef CONFIG_SHRINKER_DEBUG > > > + int debugfs_id; > > > + struct dentry *debugfs_entry; > > > #endif > > > /* objs pending delete, per node */ > > > atomic_long_t *nr_deferred; > > > @@ -94,4 +98,17 @@ extern int register_shrinker(struct shrinker *shrinker); > > > extern void unregister_shrinker(struct shrinker *shrinker); > > > extern void free_prealloced_shrinker(struct shrinker *shrinker); > > > extern void synchronize_shrinkers(void); > > > -#endif > > > + > > > +#ifdef CONFIG_SHRINKER_DEBUG > > > +extern int shrinker_debugfs_add(struct shrinker *shrinker); > > > +extern void shrinker_debugfs_remove(struct shrinker *shrinker); > > > +#else /* CONFIG_SHRINKER_DEBUG */ > > > +static inline int shrinker_debugfs_add(struct shrinker *shrinker) > > > +{ > > > + return 0; > > > +} > > > +static inline void shrinker_debugfs_remove(struct shrinker *shrinker) > > > +{ > > > +} > > > +#endif /* CONFIG_SHRINKER_DEBUG */ > > > +#endif /* _LINUX_SHRINKER_H */ > > > diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug > > > index 3fd7a2e9eaf1..5fa65a649798 100644 > > > --- a/lib/Kconfig.debug > > > +++ b/lib/Kconfig.debug > > > @@ -733,6 +733,15 @@ config SLUB_STATS > > > out which slabs are relevant to a particular load. 
> > > Try running: slabinfo -DA > > > > > > +config SHRINKER_DEBUG > > > + default y > > > + bool "Enable shrinker debugging support" > > > + depends on DEBUG_FS > > > + help > > > + Say Y to enable the shrinker debugfs interface which provides > > > + visibility into the kernel memory shrinkers subsystem. > > > + Disable it to avoid an extra memory footprint. > > > + > > > config HAVE_DEBUG_KMEMLEAK > > > bool > > > > > > diff --git a/mm/Makefile b/mm/Makefile > > > index 298c9991ab75..8083fa85a348 100644 > > > --- a/mm/Makefile > > > +++ b/mm/Makefile > > > @@ -133,3 +133,4 @@ obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o > > > obj-$(CONFIG_IO_MAPPING) += io-mapping.o > > > obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o > > > obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o > > > +obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o > > > diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c > > > new file mode 100644 > > > index 000000000000..fd1f805a581a > > > --- /dev/null > > > +++ b/mm/shrinker_debug.c > > > @@ -0,0 +1,171 @@ > > > +// SPDX-License-Identifier: GPL-2.0 > > > +#include <linux/idr.h> > > > +#include <linux/slab.h> > > > +#include <linux/debugfs.h> > > > +#include <linux/seq_file.h> > > > +#include <linux/shrinker.h> > > > +#include <linux/memcontrol.h> > > > + > > > +/* defined in vmscan.c */ > > > +extern struct rw_semaphore shrinker_rwsem; > > > +extern struct list_head shrinker_list; > > > + > > > +static DEFINE_IDA(shrinker_debugfs_ida); > > > +static struct dentry *shrinker_debugfs_root; > > > + > > > +static unsigned long shrinker_count_objects(struct shrinker *shrinker, > > > + struct mem_cgroup *memcg, > > > + unsigned long *count_per_node) > > > +{ > > > + unsigned long nr, total = 0; > > > + int nid; > > > + > > > + for_each_node(nid) { > > > + if (nid == 0 || (shrinker->flags & SHRINKER_NUMA_AWARE)) { > > > + struct shrink_control sc = { > > > + .gfp_mask = GFP_KERNEL, > > > + .nid = nid, > > > + .memcg = memcg, > > > + }; > > > + 
> > > + nr = shrinker->count_objects(shrinker, &sc); > > > + if (nr == SHRINK_EMPTY) > > > + nr = 0; > > > + } else { > > > + nr = 0; > > > > For efficiency, we could break here, right? > > Not really, we need to fill count_per_node[] with zeros. > I thought count_per_node[] was initialized with zero by the caller when allocated. However, I was wrong, because it is reused on each iteration of the loop. You are right. > > > > > + } > > > + > > > + count_per_node[nid] = nr; > > > + total += nr; > > > + } > > > + > > > + return total; > > > +} > > > + > > > +static int shrinker_debugfs_count_show(struct seq_file *m, void *v) > > > +{ > > > + struct shrinker *shrinker = (struct shrinker *)m->private; > > > > Maybe we could drop the cast since m->private is a void * type. > > Ok. > > > > > > + unsigned long *count_per_node = NULL; > > > > It does not need to be initialized, right? > > Right, will fix in v4. > > > > > > + struct mem_cgroup *memcg; > > > + unsigned long total; > > > + bool memcg_aware; > > > + int ret, nid; > > > + > > > + count_per_node = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL); > > > + if (!count_per_node) > > > + return -ENOMEM; > > > + > > > + ret = down_read_killable(&shrinker_rwsem); > > > + if (ret) { > > > + kfree(count_per_node); > > > + return ret; > > > + } > > > + rcu_read_lock(); > > > + > > > + memcg_aware = shrinker->flags & SHRINKER_MEMCG_AWARE; > > > + > > > + memcg = mem_cgroup_iter(NULL, NULL, NULL); > > > + do { > > > + if (memcg && !mem_cgroup_online(memcg)) > > > + continue; > > > + > > > + total = shrinker_count_objects(shrinker, > > > + memcg_aware ? memcg : NULL, > > > + count_per_node); > > > + if (total) { > > > + seq_printf(m, "%lu", mem_cgroup_ino(memcg)); > > > + for_each_node(nid) > > > + seq_printf(m, " %lu", count_per_node[nid]); > > > + seq_puts(m, "\n"); > > > > seq_putc(m, '\n') is more efficient. > > Ok. 
> > > > > > + } > > > + > > > + if (!memcg_aware) { > > > + mem_cgroup_iter_break(NULL, memcg); > > > + break; > > > + } > > > + > > > + if (signal_pending(current)) { > > > + mem_cgroup_iter_break(NULL, memcg); > > > + ret = -EINTR; > > > + break; > > > + } > > > + > > > + cond_resched(); > > > > We are in rcu read lock, cannot be scheduled, right? > > This is a good one, thanks. Fixed. > > > > > > + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); > > > + > > > + rcu_read_unlock(); > > > + up_read(&shrinker_rwsem); > > > + > > > + kfree(count_per_node); > > > + return ret; > > > +} > > > +DEFINE_SHOW_ATTRIBUTE(shrinker_debugfs_count); > > > + > > > +int shrinker_debugfs_add(struct shrinker *shrinker) > > > +{ > > > + struct dentry *entry; > > > + char buf[16]; > > > + int id; > > > + > > > + lockdep_assert_held(&shrinker_rwsem); > > > + > > > + /* debugfs isn't initialized yet, add debugfs entries later. */ > > > + if (!shrinker_debugfs_root) > > > + return 0; > > > + > > > + id = ida_alloc(&shrinker_debugfs_ida, GFP_KERNEL); > > > + if (id < 0) > > > + return id; > > > + shrinker->debugfs_id = id; > > > + > > > + snprintf(buf, sizeof(buf), "%d", id); > > > + > > > + /* create debugfs entry */ > > > + entry = debugfs_create_dir(buf, shrinker_debugfs_root); > > > + if (IS_ERR(entry)) { > > > + ida_free(&shrinker_debugfs_ida, id); > > > + return PTR_ERR(entry); > > > + } > > > + shrinker->debugfs_entry = entry; > > > + > > > + debugfs_create_file("count", 0220, entry, shrinker, > > > + &shrinker_debugfs_count_fops); > > > + return 0; > > > +} > > > + > > > +void shrinker_debugfs_remove(struct shrinker *shrinker) > > > +{ > > > + lockdep_assert_held(&shrinker_rwsem); > > > + > > > + if (!shrinker->debugfs_entry) > > > + return; > > > + > > > + debugfs_remove_recursive(shrinker->debugfs_entry); > > > + ida_free(&shrinker_debugfs_ida, shrinker->debugfs_id); > > > +} > > > + > > > +static int __init shrinker_debugfs_init(void) > > > +{ > > > + struct 
shrinker *shrinker; > > > + int ret; > > > + > > > + if (!debugfs_initialized()) > > > + return -ENODEV; > > > + > > > > Redundant check since it is checked in debugfs_create_dir(). > > So I think we could remove this. > > > > > + shrinker_debugfs_root = debugfs_create_dir("shrinker", NULL); > > > > We should use IS_ERR() to detect the error code. So the following > > check is wrong. > > Right, will fix in the next version. > > > > > > + if (!shrinker_debugfs_root) > > > + return -ENOMEM; > > > + > > > + /* Create debugfs entries for shrinkers registered at boot */ > > > + ret = down_write_killable(&shrinker_rwsem); > > > > How could we kill this process? IIUC, late_initcall() is called > > from early init process, there is no way to kill this. Right? > > If yes, I think we could just use down_write(). > > Ok, agree. > > Thanks! >