On Wed, Apr 07, 2021 at 08:28:07AM +1000, Dave Chinner wrote: > > Another approach may be to identify filesystem types that do not > need memcg awareness and feed that into alloc_super() to set/clear > the SHRINKER_MEMCG_AWARE flag. This could be based on fstype - most > virtual filesystems that expose system information do not really > need full memcg awareness because they are generally only visible to > a single memcg instance... Would something like the patch below be appropriate? >From f314083ad69fde2a420a1b74febd6d3f7a25085f Mon Sep 17 00:00:00 2001 From: Bharata B Rao <bharata@xxxxxxxxxxxxx> Date: Wed, 14 Apr 2021 11:21:24 +0530 Subject: [PATCH 1/1] fs: Let filesystems opt out of memcg awareness All filesystem mounts are memcg aware by default and hence end up creating shrinker list_lrus for all the memcgs. Due to the way memcg_nr_cache_ids grows and the list_lru heads are allocated for all memcgs, a huge amount of memory gets consumed by the kmalloc-32 slab cache when running thousands of containers. Improve this situation by allowing filesystems to opt out of memcg awareness. In this patch, tmpfs, proc and ramfs opt out of memcg awareness. This leads to considerable memory savings when running 10k containers. 
Signed-off-by: Bharata B Rao <bharata@xxxxxxxxxxxxx> --- fs/proc/root.c | 1 + fs/ramfs/inode.c | 1 + fs/super.c | 27 +++++++++++++++++++-------- include/linux/fs_context.h | 2 ++ mm/shmem.c | 1 + 5 files changed, 24 insertions(+), 8 deletions(-) diff --git a/fs/proc/root.c b/fs/proc/root.c index c7e3b1350ef8..7856bc2ca9f4 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -257,6 +257,7 @@ static int proc_init_fs_context(struct fs_context *fc) fc->user_ns = get_user_ns(ctx->pid_ns->user_ns); fc->fs_private = ctx; fc->ops = &proc_fs_context_ops; + fc->memcg_optout = true; return 0; } diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 9ebd17d7befb..576a88bb7407 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -278,6 +278,7 @@ int ramfs_init_fs_context(struct fs_context *fc) fsi->mount_opts.mode = RAMFS_DEFAULT_MODE; fc->s_fs_info = fsi; fc->ops = &ramfs_context_ops; + fc->memcg_optout = true; return 0; } diff --git a/fs/super.c b/fs/super.c index 8c1baca35c16..59aa22c678e6 100644 --- a/fs/super.c +++ b/fs/super.c @@ -198,7 +198,8 @@ static void destroy_unused_super(struct super_block *s) * returns a pointer new superblock or %NULL if allocation had failed. 
*/ static struct super_block *alloc_super(struct file_system_type *type, int flags, - struct user_namespace *user_ns) + struct user_namespace *user_ns, + bool memcg_optout) { struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER); static const struct super_operations default_op; @@ -266,13 +267,22 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, s->s_shrink.scan_objects = super_cache_scan; s->s_shrink.count_objects = super_cache_count; s->s_shrink.batch = 1024; - s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE; + s->s_shrink.flags = SHRINKER_NUMA_AWARE; + if (!memcg_optout) + s->s_shrink.flags |= SHRINKER_MEMCG_AWARE; if (prealloc_shrinker(&s->s_shrink)) goto fail; - if (list_lru_init_memcg(&s->s_dentry_lru, &s->s_shrink)) - goto fail; - if (list_lru_init_memcg(&s->s_inode_lru, &s->s_shrink)) - goto fail; + if (memcg_optout) { + if (list_lru_init(&s->s_dentry_lru)) + goto fail; + if (list_lru_init(&s->s_inode_lru)) + goto fail; + } else { + if (list_lru_init_memcg(&s->s_dentry_lru, &s->s_shrink)) + goto fail; + if (list_lru_init_memcg(&s->s_inode_lru, &s->s_shrink)) + goto fail; + } return s; fail: @@ -527,7 +537,8 @@ struct super_block *sget_fc(struct fs_context *fc, } if (!s) { spin_unlock(&sb_lock); - s = alloc_super(fc->fs_type, fc->sb_flags, user_ns); + s = alloc_super(fc->fs_type, fc->sb_flags, user_ns, + fc->memcg_optout); if (!s) return ERR_PTR(-ENOMEM); goto retry; @@ -610,7 +621,7 @@ struct super_block *sget(struct file_system_type *type, } if (!s) { spin_unlock(&sb_lock); - s = alloc_super(type, (flags & ~SB_SUBMOUNT), user_ns); + s = alloc_super(type, (flags & ~SB_SUBMOUNT), user_ns, false); if (!s) return ERR_PTR(-ENOMEM); goto retry; diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index 37e1e8f7f08d..73388c0b6950 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -110,6 +110,8 @@ struct fs_context { bool need_free:1; /* Need to call 
ops->free() */ bool global:1; /* Goes into &init_user_ns */ bool oldapi:1; /* Coming from mount(2) */ + bool memcg_optout:1; /* Opt out from per-memcg + lru handling */ }; struct fs_context_operations { diff --git a/mm/shmem.c b/mm/shmem.c index b2db4ed0fbc7..0c9b2af52825 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3915,6 +3915,7 @@ int shmem_init_fs_context(struct fs_context *fc) fc->fs_private = ctx; fc->ops = &shmem_fs_context_ops; + fc->memcg_optout = true; return 0; } -- 2.26.2