To speed up further allocations, SLUB may keep some empty slabs on per
cpu/node partial lists. If the cache is dead, i.e. belongs to a memcg
that was turned offline, there is no need for that, because dead caches
are never allocated from. Worse, keeping empty slabs on the list
prevents the dead cache from self-destruction, because each allocated
slab holds a reference to the cache, and the cache can only be
destroyed once its reference counter drops to zero.

To make a SLUB cache discard slabs as soon as they become empty, we
have to:

 (1) drain the per cpu/node caches;

 (2) disable caching of empty slabs on the per node partial lists;

 (3) disable per cpu partial lists completely, because a slab that
     becomes empty on such a list is only freed when unfreeze_partials()
     is called, which may never happen for a dead cache.

(1) is already done by kmem_cache_shrink(), which is called from
memcg_unregister_all_caches().

(2) is easy: it is enough to set kmem_cache->min_partial to 0 before
shrinking the cache. Since min_partial is only accessed under
kmem_cache_node->lock, and we take that lock while shrinking the cache,
this guarantees that no empty slabs are added to the list after
kmem_cache_shrink() has been called.

(3) is a bit more difficult, because slabs are added to per cpu partial
lists locklessly. Fortunately, we only have to handle the __slab_free()
case: since no allocation requests should be dispatched to a dead memcg
cache, get_partial_node() is never called for it. __slab_free() uses
cmpxchg to modify kmem_cache_cpu->partial (see put_cpu_partial()), so
setting ->partial to a special value that makes put_cpu_partial() bail
out does the trick.

Note that this should not affect performance: keeping empty slabs on
per node lists and using per cpu partials are only worthwhile if the
cache is actually used for allocations, which is not the case for dead
caches.

Signed-off-by: Vladimir Davydov <vdavydov@xxxxxxxxxxxxx>
---
 mm/slub.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 95 insertions(+), 29 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index ac39cc9b6849..d5a54b03d558 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -133,6 +133,21 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
 }
 
 /*
+ * When the owner memcg of a cache is turned offline, its per cpu partial lists
+ * must be disabled, because they can contain empty slabs and hence prevent the
+ * cache from self-destruction.
+ *
+ * To zap per cpu partials, we set each cpu's cpu_slab->partial to a special
+ * value, CPU_SLAB_PARTIAL_DEAD. Whenever we see this value while trying to put
+ * a frozen slab on a per cpu partial list, we unfreeze the slab and put it
+ * back on its node's list.
+ *
+ * Actually, we only need to handle this on the free path, because no allocation
+ * requests can be dispatched to a dead memcg cache.
+ */
+#define CPU_SLAB_PARTIAL_DEAD	((struct page *)~0UL)
+
+/*
  * Issues still to be resolved:
  *
  * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
@@ -1643,6 +1658,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
 			stat(s, ALLOC_FROM_PARTIAL);
 			object = t;
 		} else {
+			VM_BUG_ON(c->partial == CPU_SLAB_PARTIAL_DEAD);
 			prepare_cpu_partial(page, c->partial);
 			c->partial = page;
 			stat(s, CPU_PARTIAL_NODE);
@@ -1948,6 +1964,56 @@ redo:
 	}
 }
 
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+static void __unfreeze_partial(struct kmem_cache *s, struct kmem_cache_node *n,
+			       struct page *page, struct page **discard_page)
+{
+	struct page new, old;
+
+	do {
+
+		old.freelist = page->freelist;
+		old.counters = page->counters;
+		VM_BUG_ON(!old.frozen);
+
+		new.counters = old.counters;
+		new.freelist = old.freelist;
+
+		new.frozen = 0;
+
+	} while (!__cmpxchg_double_slab(s, page,
+			old.freelist, old.counters,
+			new.freelist, new.counters,
+			"unfreezing slab"));
+
+	if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) {
+		page->next = *discard_page;
+		*discard_page = page;
+	} else {
+		add_partial(n, page, DEACTIVATE_TO_TAIL);
+		stat(s, FREE_ADD_PARTIAL);
+	}
+}
+
+static void cancel_frozen(struct kmem_cache *s, struct page *page)
+{
+	struct page *discard_page = NULL;
+	struct kmem_cache_node *n;
+	unsigned long flags;
+
+	n = get_node(s, page_to_nid(page));
+	spin_lock_irqsave(&n->list_lock, flags);
+	__unfreeze_partial(s, n, page, &discard_page);
+	spin_unlock_irqrestore(&n->list_lock, flags);
+
+	if (discard_page) {
+		stat(s, DEACTIVATE_EMPTY);
+		discard_slab(s, discard_page);
+		stat(s, FREE_SLAB);
+	}
+}
+#endif
+
 /*
  * Unfreeze all the cpu partial slabs.
  *
@@ -1962,10 +2028,10 @@ static void unfreeze_partials(struct kmem_cache *s,
 	struct kmem_cache_node *n = NULL, *n2 = NULL;
 	struct page *page, *discard_page = NULL;
 
-	while ((page = c->partial)) {
-		struct page new;
-		struct page old;
+	if (c->partial == CPU_SLAB_PARTIAL_DEAD)
+		return;
 
+	while ((page = c->partial)) {
 		c->partial = page->next;
 
 		n2 = get_node(s, page_to_nid(page));
@@ -1977,29 +2043,7 @@ static void unfreeze_partials(struct kmem_cache *s,
 			spin_lock(&n->list_lock);
 		}
 
-		do {
-
-			old.freelist = page->freelist;
-			old.counters = page->counters;
-			VM_BUG_ON(!old.frozen);
-
-			new.counters = old.counters;
-			new.freelist = old.freelist;
-
-			new.frozen = 0;
-
-		} while (!__cmpxchg_double_slab(s, page,
-				old.freelist, old.counters,
-				new.freelist, new.counters,
-				"unfreezing slab"));
-
-		if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) {
-			page->next = discard_page;
-			discard_page = page;
-		} else {
-			add_partial(n, page, DEACTIVATE_TO_TAIL);
-			stat(s, FREE_ADD_PARTIAL);
-		}
+		__unfreeze_partial(s, n, page, &discard_page);
 	}
 
 	if (n)
@@ -2053,6 +2097,15 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page)
 	do {
 		oldpage = this_cpu_read(s->cpu_slab->partial);
 
+		/*
+		 * Per cpu partials are disabled. Unfreeze the slab and put it
+		 * to the node partial list.
+		 */
+		if (oldpage == CPU_SLAB_PARTIAL_DEAD) {
+			cancel_frozen(s, page);
+			break;
+		}
+
 		if (oldpage) {
 			if (oldpage->pobjects > s->cpu_partial) {
 				unsigned long flags;
@@ -2099,6 +2152,13 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
 			flush_slab(s, c);
 
 		unfreeze_partials(s, c);
+
+		/*
+		 * Disable per cpu partials if the cache is dead so that it
+		 * won't be pinned by empty slabs that can be cached there.
+		 */
+		if (memcg_cache_dead(s))
+			c->partial = CPU_SLAB_PARTIAL_DEAD;
 	}
 }
 
@@ -2368,6 +2428,7 @@ new_slab:
 
 	if (c->partial) {
 		page = c->page = c->partial;
+		VM_BUG_ON(page == CPU_SLAB_PARTIAL_DEAD);
 		c->partial = page->next;
 		stat(s, CPU_PARTIAL_ALLOC);
 		c->freelist = NULL;
@@ -3416,6 +3477,10 @@ void __kmem_cache_shrink(struct kmem_cache *s)
 	struct list_head *slabs_by_inuse;
 	unsigned long flags;
 
+	/* Make dead caches discard empty slabs immediately. */
+	if (memcg_cache_dead(s))
+		s->min_partial = 0;
+
 	slabs_by_inuse = kcalloc(objects - 1, sizeof(struct list_head),
 				 GFP_KERNEL);
 	if (slabs_by_inuse) {
@@ -4329,7 +4394,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
 			nodes[node] += x;
 
 			page = ACCESS_ONCE(c->partial);
-			if (page) {
+			if (page && page != CPU_SLAB_PARTIAL_DEAD) {
 				node = page_to_nid(page);
 				if (flags & SO_TOTAL)
 					WARN_ON_ONCE(1);
@@ -4561,7 +4626,7 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
 	for_each_online_cpu(cpu) {
 		struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial;
 
-		if (page) {
+		if (page && page != CPU_SLAB_PARTIAL_DEAD) {
 			pages += page->pages;
 			objects += page->pobjects;
 		}
@@ -4573,7 +4638,8 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
 	for_each_online_cpu(cpu) {
 		struct page *page = per_cpu_ptr(s->cpu_slab, cpu) ->partial;
 
-		if (page && len < PAGE_SIZE - 20)
+		if (page && page != CPU_SLAB_PARTIAL_DEAD &&
+		    len < PAGE_SIZE - 20)
 			len += sprintf(buf + len, " C%d=%d(%d)", cpu,
 				page->pobjects, page->pages);
 	}
-- 
1.7.10.4
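
P.S. For readers unfamiliar with the trick used in put_cpu_partial()/cancel_frozen()
above: the patch poisons a lock-free list head with a sentinel pointer so that
concurrent producers stop caching items on it. The following stand-alone user-space
sketch (my own illustration, not kernel code; list_put(), list_kill() and LIST_DEAD
are made-up names) shows the same pattern with C11 atomics: the cmpxchg loop in
list_put() plays the role of put_cpu_partial(), and bailing out on the sentinel
corresponds to calling cancel_frozen(). Build with gcc -std=c11.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct node {
	struct node *next;
	int val;
};

/* Sentinel: "this list is dead, do not cache anything on it". */
#define LIST_DEAD ((struct node *)~0UL)

static _Atomic(struct node *) list_head;

/*
 * Try to push a node onto the lock-free list with a cmpxchg retry loop.
 * Returns false if the list has been poisoned with LIST_DEAD, in which
 * case the caller must dispose of the node itself (the kernel patch
 * unfreezes the slab and puts it back on the node list instead).
 */
static bool list_put(struct node *n)
{
	struct node *old;

	do {
		old = atomic_load(&list_head);
		if (old == LIST_DEAD)
			return false;	/* caching disabled, bail out */
		n->next = old;
	} while (!atomic_compare_exchange_weak(&list_head, &old, n));

	return true;
}

/*
 * Poison the list. Whatever was cached on it is handed back to the
 * caller, which mirrors unfreeze_partials() draining the old entries
 * before the per cpu partial list is marked dead.
 */
static struct node *list_kill(void)
{
	return atomic_exchange(&list_head, LIST_DEAD);
}

int main(void)
{
	struct node a = { .val = 1 };
	struct node b = { .val = 2 };

	printf("put a: %d\n", list_put(&a));	/* 1: cached on the list */
	list_kill();				/* disable caching */
	printf("put b: %d\n", list_put(&b));	/* 0: caller must free b */
	return 0;
}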