On Mon, Jan 26, 2015 at 03:55:29PM +0300, Vladimir Davydov wrote:
> To speed up further allocations, SLUB may store empty slabs on per-cpu/node
> partial lists instead of freeing them immediately. This prevents per-memcg
> caches from being destroyed, because kmem caches created for a memory
> cgroup are only destroyed after the last page charged to the cgroup is
> freed.
> 
> To fix this issue, this patch resurrects the approach first proposed in [1].
> It forbids SLUB from caching empty slabs after the memory cgroup that the
> cache belongs to has been destroyed. This is achieved by setting the
> kmem_cache's cpu_partial and min_partial constants to 0 and tuning
> put_cpu_partial() so that it drops frozen empty slabs immediately when
> cpu_partial == 0.
> 
> The runtime overhead is minimal. Of all the hot functions, we only touch
> the relatively cold put_cpu_partial(): we make it call unfreeze_partials()
> after freezing a slab that belongs to an offline memory cgroup. Since slab
> freezing exists to avoid moving slabs from/to a partial list on free/alloc,
> and there can't be allocations from dead caches, it shouldn't cause any
> overhead. We do have to disable preemption in put_cpu_partial() to achieve
> that, though.
> 
> The original patch was received well and was even merged into the mm tree.
> However, I decided to withdraw it due to changes happening in the memcg
> core at that time. I had an idea of introducing per-memcg shrinkers for
> kmem caches, but now, as memcg has finally settled down, I no longer see it
> as an option, because a SLUB shrinker would be too costly to call since
> SLUB does not keep free slabs on a separate list. Besides, we currently
> do not even call per-memcg shrinkers for offline memcgs. Overall, it
> would introduce much more complexity to both SLUB and memcg than this
> small patch.
> 
> As for SLAB, there is no problem with it, because it shrinks its
> per-cpu/node caches periodically. Thanks to list_lru reparenting, we no
> longer keep entries for offline cgroups in per-memcg arrays (such as
> memcg_cache_params->memcg_caches), so we do not have to worry if a
> per-memcg cache is shrunk a bit later than it could be.
> 
> [1] http://thread.gmane.org/gmane.linux.kernel.mm/118649/focus=118650
> 
> Signed-off-by: Vladimir Davydov <vdavydov@xxxxxxxxxxxxx>
> ---
>  mm/slab.c        |  2 +-
>  mm/slab.h        |  2 +-
>  mm/slab_common.c | 15 +++++++++++++--
>  mm/slob.c        |  2 +-
>  mm/slub.c        | 25 ++++++++++++++++++++-----
>  5 files changed, 36 insertions(+), 10 deletions(-)
> 
> diff --git a/mm/slab.c b/mm/slab.c
> index 279c44d6d8e1..f0514df07b85 100644
> --- a/mm/slab.c
> +++ b/mm/slab.c
> @@ -2400,7 +2400,7 @@ static int __cache_shrink(struct kmem_cache *cachep)
>  	return (ret ? 1 : 0);
>  }
>  
> -void __kmem_cache_shrink(struct kmem_cache *cachep)
> +void __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate)
>  {
>  	__cache_shrink(cachep);
>  }
> diff --git a/mm/slab.h b/mm/slab.h
> index c036e520d2cf..041260197984 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -138,7 +138,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
>  #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
>  
>  int __kmem_cache_shutdown(struct kmem_cache *);
> -void __kmem_cache_shrink(struct kmem_cache *);
> +void __kmem_cache_shrink(struct kmem_cache *, bool);
>  void slab_kmem_cache_release(struct kmem_cache *);
>  
>  struct seq_file;
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index 6803639fdff0..472ab7fcffd4 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -549,10 +549,13 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
>  {
>  	int idx;
>  	struct memcg_cache_array *arr;
> -	struct kmem_cache *s;
> +	struct kmem_cache *s, *c;
>  
>  	idx = memcg_cache_id(memcg);
>  
> +	get_online_cpus();
> +	get_online_mems();
> +
>  	mutex_lock(&slab_mutex);
>  	list_for_each_entry(s, &slab_caches, list) {
>  		if (!is_root_cache(s))
> @@ -560,9 +563,17 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
>  
>  		arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
>  						lockdep_is_held(&slab_mutex));
> +		c = arr->entries[idx];
> +		if (!c)
> +			continue;
> +
> +		__kmem_cache_shrink(c, true);
>  		arr->entries[idx] = NULL;
>  	}
>  	mutex_unlock(&slab_mutex);
> +
> +	put_online_mems();
> +	put_online_cpus();
>  }
>  
>  void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
> @@ -646,7 +657,7 @@ void kmem_cache_shrink(struct kmem_cache *cachep)
>  {
>  	get_online_cpus();
>  	get_online_mems();
> -	__kmem_cache_shrink(cachep);
> +	__kmem_cache_shrink(cachep, false);
>  	put_online_mems();
>  	put_online_cpus();
>  }
> diff --git a/mm/slob.c b/mm/slob.c
> index 043a14b6ccbe..e63ff9d926dc 100644
> --- a/mm/slob.c
> +++ b/mm/slob.c
> @@ -618,7 +618,7 @@ int __kmem_cache_shutdown(struct kmem_cache *c)
>  	return 0;
>  }
>  
> -void __kmem_cache_shrink(struct kmem_cache *c)
> +void __kmem_cache_shrink(struct kmem_cache *c, bool deactivate)
>  {
>  }
>  
> diff --git a/mm/slub.c b/mm/slub.c
> index c09d93dde40e..6f57824af019 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -2007,6 +2007,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
>  	int pages;
>  	int pobjects;
>  
> +	preempt_disable();
>  	do {
>  		pages = 0;
>  		pobjects = 0;
> @@ -2040,6 +2041,14 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
>  
>  	} while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
>  								!= oldpage);
> +	if (unlikely(!s->cpu_partial)) {
> +		unsigned long flags;
> +
> +		local_irq_save(flags);
> +		unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
> +		local_irq_restore(flags);
> +	}
> +	preempt_enable();
>  #endif
>  }
>  
> @@ -3368,7 +3377,7 @@ EXPORT_SYMBOL(kfree);
>   * being allocated from last increasing the chance that the last objects
>   * are freed in them.
>   */
> -void __kmem_cache_shrink(struct kmem_cache *s)
> +void __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
>  {
>  	int node;
>  	int i;
> @@ -3381,6 +3390,15 @@ void __kmem_cache_shrink(struct kmem_cache *s)
>  		kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL);
>  	unsigned long flags;
>  
> +	if (deactivate) {
> +		/*
> +		 * Disable empty slabs caching. Used to avoid pinning offline
> +		 * memory cgroups by freeable kmem pages.
> +		 */
> +		s->cpu_partial = 0;
> +		s->min_partial = 0;
> +	}
> +

Hello,

Maybe kick_all_cpus_sync() is needed here, since objects can be freed
asynchronously on other CPUs, and those CPUs may not yet see this updated
value.

Thanks.
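To make the suggestion concrete, here is a rough sketch of where such a
synchronization point might go, written as an incremental change on top of
the hunk quoted above. The placement and the comment are assumptions for
illustration only, not part of the posted patch; kick_all_cpus_sync() is the
existing helper from kernel/smp.c that sends a do-nothing IPI to all CPUs
after a full memory barrier.

--- a/mm/slub.c
+++ b/mm/slub.c
@@ __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
 	if (deactivate) {
 		/*
 		 * Disable empty slabs caching. Used to avoid pinning offline
 		 * memory cgroups by freeable kmem pages.
 		 */
 		s->cpu_partial = 0;
 		s->min_partial = 0;
+
+		/*
+		 * CPUs concurrently running put_cpu_partial() may have read
+		 * the old, non-zero cpu_partial and could still stash an
+		 * empty slab on their per-cpu partial list.  Kicking all
+		 * CPUs through an IPI should ensure that frees which start
+		 * after this point observe cpu_partial == 0 and drop empty
+		 * slabs immediately.
+		 */
+		kick_all_cpus_sync();
 	}

Whether such a call is actually required, or whether the
preempt_disable()/unfreeze_partials() path added to put_cpu_partial() already
closes the window, is exactly the question being raised here.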