On Mon, Sep 11, 2023 at 12:55 AM Yakunin, Dmitry (Nebius) <zeil@xxxxxxxxxx> wrote: > > After removing cgroup subsystem state could leak or live in background > forever because it is pinned by some reference. For example memory cgroup > could be pinned by pages in cache or tmpfs. > > This patch adds common debugfs interface for listing basic state for each > controller. Controller could define callback for dumping own attributes. > > In file /sys/kernel/debug/cgroup/<controller> each line shows state in > format: <common_attr>=<value>... [-- <controller_attr>=<value>... ] > > Common attributes: > > css - css pointer > cgroup - cgroup pointer > id - css id > ino - cgroup inode > flags - css flags > refcnt - css atomic refcount, for online shows huge bias > path - cgroup path > > This patch adds memcg attributes: > > mem_id - 16-bit memory cgroup id > memory - charged pages > memsw - charged memory+swap for v1 and swap for v2 > kmem - charged kernel pages > tcpmem - charged tcp pages > shmem - shmem/tmpfs pages > > Link: https://lore.kernel.org/lkml/153414348591.737150.14229960913953276515.stgit@buzz > Suggested-by: Konstantin Khlebnikov <khlebnikov@xxxxxxxxxxxxxx> > Reviewed-by: Andrey Ryabinin <arbn@xxxxxxxxxxxxxxx> > Signed-off-by: Dmitry Yakunin <zeil@xxxxxxxxxx> FWIW, I was just recently working on a debugfs directory that exposes a list of all zombie memcgs as well as the "memory.stat" output for all of them. This entails a file at /sys/kernel/debug/zombie_memcgs/all that contains a list of zombie memcgs (with indentation to reflect the hierarchy) and an id for each of them. This id can be used to index per-memcg directories at /sys/kernel/debug/zombie_memcgs/<id>/, which include debug files. The only one we have so far is /sys/kernel/debug/zombie_memcgs/<id>/memory.stat. If there is interest in this, I can share more information. 
> --- > include/linux/cgroup-defs.h | 1 + > kernel/cgroup/cgroup.c | 101 ++++++++++++++++++++++++++++++++++++ > mm/memcontrol.c | 14 +++++ > 3 files changed, 116 insertions(+) > > diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h > index 8a0d5466c7be..810bd300cbee 100644 > --- a/include/linux/cgroup-defs.h > +++ b/include/linux/cgroup-defs.h > @@ -673,6 +673,7 @@ struct cgroup_subsys { > void (*exit)(struct task_struct *task); > void (*release)(struct task_struct *task); > void (*bind)(struct cgroup_subsys_state *root_css); > + void (*css_dump)(struct cgroup_subsys_state *css, struct seq_file *m); > > bool early_init:1; > > diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c > index 625d7483951c..fb9931ff7570 100644 > --- a/kernel/cgroup/cgroup.c > +++ b/kernel/cgroup/cgroup.c > @@ -40,6 +40,7 @@ > #include <linux/mount.h> > #include <linux/pagemap.h> > #include <linux/proc_fs.h> > +#include <linux/debugfs.h> > #include <linux/rcupdate.h> > #include <linux/sched.h> > #include <linux/sched/task.h> > @@ -7068,3 +7069,103 @@ static int __init cgroup_sysfs_init(void) > subsys_initcall(cgroup_sysfs_init); > > #endif /* CONFIG_SYSFS */ > + > +#ifdef CONFIG_DEBUG_FS > +void *css_debugfs_seqfile_start(struct seq_file *m, loff_t *pos) > +{ > + struct cgroup_subsys *ss = m->private; > + struct cgroup_subsys_state *css; > + int id = *pos; > + > + rcu_read_lock(); > + css = idr_get_next(&ss->css_idr, &id); > + *pos = id; > + return css; > +} > + > +void *css_debugfs_seqfile_next(struct seq_file *m, void *v, loff_t *pos) > +{ > + struct cgroup_subsys *ss = m->private; > + struct cgroup_subsys_state *css; > + int id = *pos + 1; > + > + css = idr_get_next(&ss->css_idr, &id); > + *pos = id; > + return css; > +} > + > +void css_debugfs_seqfile_stop(struct seq_file *m, void *v) > +{ > + rcu_read_unlock(); > +} > + > +int css_debugfs_seqfile_show(struct seq_file *m, void *v) > +{ > + struct cgroup_subsys *ss = m->private; > + struct cgroup_subsys_state 
*css = v; > + /* data is NULL for root cgroup_subsys_state */ > + struct percpu_ref_data *data = css->refcnt.data; > + size_t buflen; > + char *buf; > + int len; > + > + seq_printf(m, "css=%pK cgroup=%pK id=%d ino=%lu flags=%#x refcnt=%lu path=", > + css, css->cgroup, css->id, cgroup_ino(css->cgroup), > + css->flags, data ? atomic_long_read(&data->count) : 0); > + > + buflen = seq_get_buf(m, &buf); > + if (buf) { > + len = cgroup_path(css->cgroup, buf, buflen); > + seq_commit(m, len < buflen ? len : -1); > + } > + > + if (ss->css_dump) { > + seq_puts(m, " -- "); > + ss->css_dump(css, m); > + } > + > + seq_putc(m, '\n'); > + return 0; > +} > + > +static const struct seq_operations css_debug_seq_ops = { > + .start = css_debugfs_seqfile_start, > + .next = css_debugfs_seqfile_next, > + .stop = css_debugfs_seqfile_stop, > + .show = css_debugfs_seqfile_show, > +}; > + > +static int css_debugfs_open(struct inode *inode, struct file *file) > +{ > + int ret = seq_open(file, &css_debug_seq_ops); > + struct seq_file *m = file->private_data; > + > + if (!ret) > + m->private = inode->i_private; > + return ret; > +} > + > +static const struct file_operations css_debugfs_fops = { > + .open = css_debugfs_open, > + .read = seq_read, > + .llseek = seq_lseek, > + .release = seq_release, > +}; > + > +static int __init css_debugfs_init(void) > +{ > + struct cgroup_subsys *ss; > + struct dentry *dir; > + int ssid; > + > + dir = debugfs_create_dir("cgroup", NULL); > + if (dir) { > + for_each_subsys(ss, ssid) > + debugfs_create_file(ss->name, 0644, dir, ss, > + &css_debugfs_fops); > + } > + > + return 0; > +} > +late_initcall(css_debugfs_init); > +#endif /* CONFIG_DEBUG_FS */ > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index 4b27e245a055..7b3d4a10ac63 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -5654,6 +5654,20 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) > } > } > > +static void mem_cgroup_css_dump(struct cgroup_subsys_state 
*css, > + struct seq_file *m) > +{ > + struct mem_cgroup *memcg = mem_cgroup_from_css(css); > + > + seq_printf(m, "mem_id=%u memory=%lu memsw=%lu kmem=%lu tcpmem=%lu shmem=%lu", > + mem_cgroup_id(memcg), > + page_counter_read(&memcg->memory), > + page_counter_read(&memcg->memsw), > + page_counter_read(&memcg->kmem), > + page_counter_read(&memcg->tcpmem), > + memcg_page_state(memcg, NR_SHMEM)); > +} > + > #ifdef CONFIG_MMU > /* Handlers for move charge at task migration. */ > static int mem_cgroup_do_precharge(unsigned long count) > -- > 2.25.1 > >