On Tue, Feb 20, 2018 at 11:41 AM, Shakeel Butt <shakeelb@xxxxxxxxxx> wrote: > A lot of memory can be consumed by the events generated for the huge or > unlimited queues if there is either no or slow listener. This can cause > system level memory pressure or OOMs. So, it's better to account the > fsnotify kmem caches to the memcg of the listener. > > There are seven fsnotify kmem caches and among them allocations from > dnotify_struct_cache, dnotify_mark_cache, fanotify_mark_cache and > inotify_inode_mark_cachep happens in the context of syscall from the > listener. So, SLAB_ACCOUNT is enough for these caches. > > The objects from fsnotify_mark_connector_cachep are not accounted as > they are small compared to the notification mark or events and it is > unclear whom to account connector to since it is shared by all events > attached to the inode. > > The allocations from the event caches happen in the context of the event > producer. For such caches we will need to remote charge the allocations > to the listener's memcg. Thus we save the memcg reference in the > fsnotify_group structure of the listener. > > This patch has also moved the members of fsnotify_group to keep the > size same, at least for 64 bit build, even with additional member by > filling the holes. > > Signed-off-by: Shakeel Butt <shakeelb@xxxxxxxxxx> Andrew, please don't send this patch to Linus until Jan Kara's changes are merged. I will let you know when that happens. > --- > Changelog since v1: > - no more charging fsnotify_mark_connector objects > > fs/notify/dnotify/dnotify.c | 5 +++-- > fs/notify/fanotify/fanotify.c | 12 +++++++----- > fs/notify/fanotify/fanotify.h | 3 ++- > fs/notify/fanotify/fanotify_user.c | 7 +++++-- > fs/notify/group.c | 4 ++++ > fs/notify/inotify/inotify_fsnotify.c | 2 +- > fs/notify/inotify/inotify_user.c | 5 ++++- > fs/notify/mark.c | 6 ++++-- > include/linux/fsnotify_backend.h | 12 ++++++++---- > include/linux/memcontrol.h | 7 +++++++ > mm/memcontrol.c | 2 +- > 11 files changed, 46 insertions(+), 19 deletions(-) > > diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c > index 63a1ca4b9dee..eb5c41284649 100644 > --- a/fs/notify/dnotify/dnotify.c > +++ b/fs/notify/dnotify/dnotify.c > @@ -384,8 +384,9 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) > > static int __init dnotify_init(void) > { > - dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC); > - dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC); > + dnotify_struct_cache = KMEM_CACHE(dnotify_struct, > + SLAB_PANIC|SLAB_ACCOUNT); > + dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC|SLAB_ACCOUNT); > > dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops); > if (IS_ERR(dnotify_group)) > diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c > index 6702a6a0bbb5..0d9493ebc7cd 100644 > --- a/fs/notify/fanotify/fanotify.c > +++ b/fs/notify/fanotify/fanotify.c > @@ -140,22 +140,24 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, > } > > struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask, > - const struct path *path) > + const struct path *path, > + struct mem_cgroup *memcg) > { > struct fanotify_event_info *event; > > if (fanotify_is_perm_event(mask)) { > struct fanotify_perm_event_info *pevent; > > - pevent = kmem_cache_alloc(fanotify_perm_event_cachep, > - GFP_KERNEL); > + pevent = kmem_cache_alloc_memcg(fanotify_perm_event_cachep, > + GFP_KERNEL, memcg); > if (!pevent) > return NULL; > event = &pevent->fae; > pevent->response = 0; > goto init; > } > - event = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL); > + event = kmem_cache_alloc_memcg(fanotify_event_cachep, GFP_KERNEL, > + memcg); > if (!event) > return NULL; > init: __maybe_unused > @@ -210,7 +212,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, > return 0; > } > > - event = fanotify_alloc_event(inode, mask, data); > + event = fanotify_alloc_event(inode, mask, data, group->memcg); > ret = -ENOMEM; > if (unlikely(!event)) > goto finish; > diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h > index 256d9d1ddea9..51b797896c87 100644 > --- a/fs/notify/fanotify/fanotify.h > +++ b/fs/notify/fanotify/fanotify.h > @@ -53,4 +53,5 @@ static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse) > } > > struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask, > - const struct path *path); > + const struct path *path, > + struct mem_cgroup *memcg); > diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c > index ef08d64c84b8..29c9b3e57a29 100644 > --- a/fs/notify/fanotify/fanotify_user.c > +++ b/fs/notify/fanotify/fanotify_user.c > @@ -16,6 +16,7 @@ > #include <linux/uaccess.h> > #include <linux/compat.h> > #include <linux/sched/signal.h> > +#include <linux/memcontrol.h> > > #include <asm/ioctls.h> > > @@ -756,8 +757,9 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) > > group->fanotify_data.user = user; > atomic_inc(&user->fanotify_listeners); > + group->memcg = get_mem_cgroup_from_mm(current->mm); > > - oevent = fanotify_alloc_event(NULL, FS_Q_OVERFLOW, NULL); > + oevent = fanotify_alloc_event(NULL, FS_Q_OVERFLOW, NULL, group->memcg); > if (unlikely(!oevent)) { > fd = -ENOMEM; > goto out_destroy_group; > @@ -951,7 +953,8 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark, > */ > static int __init fanotify_user_setup(void) > { > - fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC); > + fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, > + SLAB_PANIC|SLAB_ACCOUNT); > fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC); > if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) { > fanotify_perm_event_cachep = > diff --git a/fs/notify/group.c b/fs/notify/group.c > index b7a4b6a69efa..3e56459f4773 100644 > --- a/fs/notify/group.c > +++ b/fs/notify/group.c > @@ -22,6 +22,7 @@ > #include <linux/srcu.h> > #include <linux/rculist.h> > #include <linux/wait.h> > +#include <linux/memcontrol.h> > > #include <linux/fsnotify_backend.h> > #include "fsnotify.h" > @@ -36,6 +37,9 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group) > if (group->ops->free_group_priv) > group->ops->free_group_priv(group); > > + if (group->memcg) > + mem_cgroup_put(group->memcg); > + > kfree(group); > } > > diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c > index 8b73332735ba..ed8e7b5f3981 100644 > --- a/fs/notify/inotify/inotify_fsnotify.c > +++ b/fs/notify/inotify/inotify_fsnotify.c > @@ -98,7 +98,7 @@ int inotify_handle_event(struct fsnotify_group *group, > i_mark = container_of(inode_mark, struct inotify_inode_mark, > fsn_mark); > > - event = kmalloc(alloc_len, GFP_KERNEL); > + event = kmalloc_memcg(alloc_len, GFP_KERNEL, group->memcg); > if (unlikely(!event)) > return -ENOMEM; > > diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c > index 5c29bf16814f..e80f4656799f 100644 > --- a/fs/notify/inotify/inotify_user.c > +++ b/fs/notify/inotify/inotify_user.c > @@ -38,6 +38,7 @@ > #include <linux/uaccess.h> > #include <linux/poll.h> > #include <linux/wait.h> > +#include <linux/memcontrol.h> > > #include "inotify.h" > #include "../fdinfo.h" > @@ -618,6 +619,7 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events) > oevent->name_len = 0; > > group->max_events = max_events; > + group->memcg = get_mem_cgroup_from_mm(current->mm); > > spin_lock_init(&group->inotify_data.idr_lock); > idr_init(&group->inotify_data.idr); > @@ -785,7 +787,8 @@ static int __init inotify_user_setup(void) > > BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21); > > - inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC); > + inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, > + SLAB_PANIC|SLAB_ACCOUNT); > > inotify_max_queued_events = 16384; > init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128; > diff --git a/fs/notify/mark.c b/fs/notify/mark.c > index e9191b416434..c0014d0c3783 100644 > --- a/fs/notify/mark.c > +++ b/fs/notify/mark.c > @@ -432,7 +432,8 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) > static int fsnotify_attach_connector_to_object( > struct fsnotify_mark_connector __rcu **connp, > struct inode *inode, > - struct vfsmount *mnt) > + struct vfsmount *mnt, > + struct fsnotify_group *group) > { > struct fsnotify_mark_connector *conn; > > @@ -517,7 +518,8 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, > conn = fsnotify_grab_connector(connp); > if (!conn) { > spin_unlock(&mark->lock); > - err = fsnotify_attach_connector_to_object(connp, inode, mnt); > + err = fsnotify_attach_connector_to_object(connp, inode, mnt, > + mark->group); > if (err) > return err; > goto restart; > diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h > index 067d52e95f02..e4428e383215 100644 > --- a/include/linux/fsnotify_backend.h > +++ b/include/linux/fsnotify_backend.h > @@ -84,6 +84,8 @@ struct fsnotify_event_private_data; > struct fsnotify_fname; > struct fsnotify_iter_info; > > +struct mem_cgroup; > + > /* > * Each group much define these ops. The fsnotify infrastructure will call > * these operations for each relevant group. > @@ -129,6 +131,8 @@ struct fsnotify_event { > * everything will be cleaned up. > */ > struct fsnotify_group { > + const struct fsnotify_ops *ops; /* how this group handles things */ > + > /* > * How the refcnt is used is up to each group. When the refcnt hits 0 > * fsnotify will clean up all of the resources associated with this group. > @@ -139,8 +143,6 @@ struct fsnotify_group { > */ > refcount_t refcnt; /* things with interest in this group */ > > - const struct fsnotify_ops *ops; /* how this group handles things */ > - > /* needed to send notification to userspace */ > spinlock_t notification_lock; /* protect the notification_list */ > struct list_head notification_list; /* list of event_holder this group needs to send to userspace */ > @@ -162,6 +164,8 @@ struct fsnotify_group { > atomic_t num_marks; /* 1 for each mark and 1 for not being > * past the point of no return when freeing > * a group */ > + atomic_t user_waits; /* Number of tasks waiting for user > + * response */ > struct list_head marks_list; /* all inode marks for this group */ > > struct fasync_struct *fsn_fa; /* async notification */ > @@ -169,8 +173,8 @@ struct fsnotify_group { > struct fsnotify_event *overflow_event; /* Event we queue when the > * notification list is too > * full */ > - atomic_t user_waits; /* Number of tasks waiting for user > - * response */ > + > + struct mem_cgroup *memcg; /* memcg to charge allocations */ > > /* groups can define private fields here or use the void *private */ > union { > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h > index 9dec8a5c0ca2..ee4b6b9d6813 100644 > --- a/include/linux/memcontrol.h > +++ b/include/linux/memcontrol.h > @@ -352,6 +352,8 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ > return css ? container_of(css, struct mem_cgroup, css) : NULL; > } > > +struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm); > + > static inline void mem_cgroup_put(struct mem_cgroup *memcg) > { > css_put(&memcg->css); > @@ -809,6 +811,11 @@ static inline bool task_in_mem_cgroup(struct task_struct *task, > return true; > } > > +static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) > +{ > + return NULL; > +} > + > static inline void mem_cgroup_put(struct mem_cgroup *memcg) > { > } > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index 0dcd6ab6cc94..3a72394510a7 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -678,7 +678,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) > } > EXPORT_SYMBOL(mem_cgroup_from_task); > > -static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) > +struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) > { > struct mem_cgroup *memcg = NULL; > > -- > 2.16.1.291.g4437f3f132-goog >