On Tue, Jun 04, 2019 at 07:44:54PM -0700, Roman Gushchin wrote: > Let's reparent memcg slab memory on memcg offlining. This allows us > to release the memory cgroup without waiting for the last outstanding > kernel object (e.g. dentry used by another application). > > So instead of reparenting all accounted slab pages, let's do reparent > a relatively small amount of kmem_caches. Reparenting is performed as > a part of the deactivation process. > > Since the parent cgroup is already charged, everything we need to do > is to splice the list of kmem_caches to the parent's kmem_caches list, > swap the memcg pointer and drop the css refcounter for each kmem_cache > and adjust the parent's css refcounter. Quite simple. > > Please, note that kmem_cache->memcg_params.memcg isn't a stable > pointer anymore. It's safe to read it under rcu_read_lock() or > with slab_mutex held. > > We can race with the slab allocation and deallocation paths. It's not > a big problem: parent's charge and slab global stats are always > correct, and we don't care anymore about the child usage and global > stats. The child cgroup is already offline, so we don't use or show it > anywhere. > > Local slab stats (NR_SLAB_RECLAIMABLE and NR_SLAB_UNRECLAIMABLE) > aren't used anywhere except count_shadow_nodes(). But even there it > won't break anything: after reparenting "nodes" will be 0 on child > level (because we're already reparenting shrinker lists), and on > parent level page stats always were 0, and this patch won't change > anything. 
> > Signed-off-by: Roman Gushchin <guro@xxxxxx> > --- > include/linux/slab.h | 4 ++-- > mm/list_lru.c | 8 +++++++- > mm/memcontrol.c | 14 ++++++++------ > mm/slab.h | 23 +++++++++++++++++------ > mm/slab_common.c | 22 +++++++++++++++++++--- > 5 files changed, 53 insertions(+), 18 deletions(-) > > diff --git a/include/linux/slab.h b/include/linux/slab.h > index 1b54e5f83342..109cab2ad9b4 100644 > --- a/include/linux/slab.h > +++ b/include/linux/slab.h > @@ -152,7 +152,7 @@ void kmem_cache_destroy(struct kmem_cache *); > int kmem_cache_shrink(struct kmem_cache *); > > void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *); > -void memcg_deactivate_kmem_caches(struct mem_cgroup *); > +void memcg_deactivate_kmem_caches(struct mem_cgroup *, struct mem_cgroup *); > > /* > * Please use this macro to create slab caches. Simply specify the > @@ -638,7 +638,7 @@ struct memcg_cache_params { > bool dying; > }; > struct { > - struct mem_cgroup *memcg; > + struct mem_cgroup __rcu *memcg; > struct list_head children_node; > struct list_head kmem_caches_node; > struct percpu_ref refcnt; > diff --git a/mm/list_lru.c b/mm/list_lru.c > index 0f1f6b06b7f3..0b2319897e86 100644 > --- a/mm/list_lru.c > +++ b/mm/list_lru.c > @@ -77,11 +77,15 @@ list_lru_from_kmem(struct list_lru_node *nlru, void *ptr, > if (!nlru->memcg_lrus) > goto out; > > + rcu_read_lock(); > memcg = mem_cgroup_from_kmem(ptr); > - if (!memcg) > + if (!memcg) { > + rcu_read_unlock(); > goto out; > + } > > l = list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg)); > + rcu_read_unlock(); > out: > if (memcg_ptr) > *memcg_ptr = memcg; > @@ -131,12 +135,14 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item) > > spin_lock(&nlru->lock); > if (list_empty(item)) { > + rcu_read_lock(); > l = list_lru_from_kmem(nlru, item, &memcg); > list_add_tail(item, &l->list); > /* Set shrinker bit if the first element was added */ > if (!l->nr_items++) > memcg_set_shrinker_bit(memcg, nid, > lru_shrinker_id(lru)); 
> + rcu_read_unlock(); AFAICS we don't need rcu_read_lock() here, because holding nlru->lock guarantees that the cgroup doesn't get freed. If that's correct, I think we'd better remove the __rcu annotation and use READ_ONCE() for accessing memcg_params.memcg, thus making it the caller's responsibility to ensure the cgroup's lifetime. > nlru->nr_items++; > spin_unlock(&nlru->lock); > return true; > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index c097b1fc74ec..0f64a2c06803 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -3209,15 +3209,15 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) > */ > memcg->kmem_state = KMEM_ALLOCATED; > > - memcg_deactivate_kmem_caches(memcg); > - > - kmemcg_id = memcg->kmemcg_id; > - BUG_ON(kmemcg_id < 0); > - > parent = parent_mem_cgroup(memcg); > if (!parent) > parent = root_mem_cgroup; > > + memcg_deactivate_kmem_caches(memcg, parent); > + > + kmemcg_id = memcg->kmemcg_id; > + BUG_ON(kmemcg_id < 0); > + > /* > * Change kmemcg_id of this cgroup and all its descendants to the > * parent's id, and then move all entries from this cgroup's list_lrus > @@ -3250,7 +3250,6 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) > if (memcg->kmem_state == KMEM_ALLOCATED) { > WARN_ON(!list_empty(&memcg->kmem_caches)); > static_branch_dec(&memcg_kmem_enabled_key); > - WARN_ON(page_counter_read(&memcg->kmem)); > } > } > #else > @@ -4675,6 +4674,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) > > /* The following stuff does not apply to the root */ > if (!parent) { > +#ifdef CONFIG_MEMCG_KMEM > + INIT_LIST_HEAD(&memcg->kmem_caches); > +#endif > root_mem_cgroup = memcg; > return &memcg->css; > } > diff --git a/mm/slab.h b/mm/slab.h > index 7ead47cb9338..34bf92382ecd 100644 > --- a/mm/slab.h > +++ b/mm/slab.h > @@ -268,7 +268,7 @@ static inline struct mem_cgroup *memcg_from_slab_page(struct page *page) > > s = READ_ONCE(page->slab_cache); > if (s && !is_root_cache(s)) > - return s->memcg_params.memcg; > + return 
rcu_dereference(s->memcg_params.memcg); I guess it's worth updating the comment with a few words about the cgroup's lifetime.