On Tue 14-10-14 12:20:36, Johannes Weiner wrote:
> On cgroup deletion, outstanding page cache charges are moved to the
> parent group so that they're not lost and can be reclaimed during
> pressure on/inside said parent. But this reparenting is fairly tricky
> and its synchronous nature has led to several lock-ups in the past.
> 
> Since css iterators now also include offlined css, memcg iterators can
> be changed to include offlined children during reclaim of a group, and
> leftover cache can just stay put.

I think it would be nice to mention c2931b70a32c (cgroup: iterate
cgroup_subsys_states directly) here to give the full context for the
css_tryget vs. css_tryget_online distinction.
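For readers coming to this without the cgroup background, the
difference is roughly the following (a simplified sketch of the two
helpers, not the exact code; the real include/linux/cgroup.h versions
also special-case the CSS_NO_REF flag):

	static inline bool css_tryget(struct cgroup_subsys_state *css)
	{
		/* succeeds as long as the css has not been released */
		return percpu_ref_tryget(&css->refcnt);
	}

	static inline bool css_tryget_online(struct cgroup_subsys_state *css)
	{
		/* additionally fails once the css has been taken offline */
		return percpu_ref_tryget_live(&css->refcnt);
	}

Switching mem_cgroup_iter() to css_tryget() below is what lets the
reclaim walk keep visiting offlined children that still hold pages.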
> There is a slight change of behavior in that charges of deleted groups
> no longer show up as local charges in the parent. But they are still
> included in the parent's hierarchical statistics.

Thank you for pulling the drain_stock cleanup out. It made the patch
much easier to review.

> Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>

Acked-by: Michal Hocko <mhocko@xxxxxxx>

> ---
>  mm/memcontrol.c | 218 +-------------------------------------------------------
>  1 file changed, 1 insertion(+), 217 deletions(-)
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 7551e12f8ff7..ce3ed7cc5c30 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1132,7 +1132,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
>  		if (css == &root->css)
>  			break;
> 
> -		if (css_tryget_online(css)) {
> +		if (css_tryget(css)) {
>  			/*
>  			 * Make sure the memcg is initialized:
>  			 * mem_cgroup_css_online() orders the the
> @@ -3299,79 +3299,6 @@ out:
>  	return ret;
>  }
> 
> -/**
> - * mem_cgroup_move_parent - moves page to the parent group
> - * @page: the page to move
> - * @pc: page_cgroup of the page
> - * @child: page's cgroup
> - *
> - * move charges to its parent or the root cgroup if the group has no
> - * parent (aka use_hierarchy==0).
> - * Although this might fail (get_page_unless_zero, isolate_lru_page or
> - * mem_cgroup_move_account fails) the failure is always temporary and
> - * it signals a race with a page removal/uncharge or migration. In the
> - * first case the page is on the way out and it will vanish from the LRU
> - * on the next attempt and the call should be retried later.
> - * Isolation from the LRU fails only if page has been isolated from
> - * the LRU since we looked at it and that usually means either global
> - * reclaim or migration going on. The page will either get back to the
> - * LRU or vanish.
> - * Finaly mem_cgroup_move_account fails only if the page got uncharged
> - * (!PageCgroupUsed) or moved to a different group. The page will
> - * disappear in the next attempt.
> - */
> -static int mem_cgroup_move_parent(struct page *page,
> -				  struct page_cgroup *pc,
> -				  struct mem_cgroup *child)
> -{
> -	struct mem_cgroup *parent;
> -	unsigned int nr_pages;
> -	unsigned long uninitialized_var(flags);
> -	int ret;
> -
> -	VM_BUG_ON(mem_cgroup_is_root(child));
> -
> -	ret = -EBUSY;
> -	if (!get_page_unless_zero(page))
> -		goto out;
> -	if (isolate_lru_page(page))
> -		goto put;
> -
> -	nr_pages = hpage_nr_pages(page);
> -
> -	parent = parent_mem_cgroup(child);
> -	/*
> -	 * If no parent, move charges to root cgroup.
> -	 */
> -	if (!parent)
> -		parent = root_mem_cgroup;
> -
> -	if (nr_pages > 1) {
> -		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
> -		flags = compound_lock_irqsave(page);
> -	}
> -
> -	ret = mem_cgroup_move_account(page, nr_pages,
> -				      pc, child, parent);
> -	if (!ret) {
> -		if (!mem_cgroup_is_root(parent))
> -			css_get_many(&parent->css, nr_pages);
> -		/* Take charge off the local counters */
> -		page_counter_cancel(&child->memory, nr_pages);
> -		if (do_swap_account)
> -			page_counter_cancel(&child->memsw, nr_pages);
> -		css_put_many(&child->css, nr_pages);
> -	}
> -
> -	if (nr_pages > 1)
> -		compound_unlock_irqrestore(page, flags);
> -	putback_lru_page(page);
> -put:
> -	put_page(page);
> -out:
> -	return ret;
> -}
> -
>  #ifdef CONFIG_MEMCG_SWAP
>  static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
>  					 bool charge)
> @@ -3665,105 +3592,6 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
>  	return nr_reclaimed;
>  }
> 
> -/**
> - * mem_cgroup_force_empty_list - clears LRU of a group
> - * @memcg: group to clear
> - * @node: NUMA node
> - * @zid: zone id
> - * @lru: lru to to clear
> - *
> - * Traverse a specified page_cgroup list and try to drop them all. This doesn't
> - * reclaim the pages page themselves - pages are moved to the parent (or root)
> - * group.
> - */
> -static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
> -				int node, int zid, enum lru_list lru)
> -{
> -	struct lruvec *lruvec;
> -	unsigned long flags;
> -	struct list_head *list;
> -	struct page *busy;
> -	struct zone *zone;
> -
> -	zone = &NODE_DATA(node)->node_zones[zid];
> -	lruvec = mem_cgroup_zone_lruvec(zone, memcg);
> -	list = &lruvec->lists[lru];
> -
> -	busy = NULL;
> -	do {
> -		struct page_cgroup *pc;
> -		struct page *page;
> -
> -		spin_lock_irqsave(&zone->lru_lock, flags);
> -		if (list_empty(list)) {
> -			spin_unlock_irqrestore(&zone->lru_lock, flags);
> -			break;
> -		}
> -		page = list_entry(list->prev, struct page, lru);
> -		if (busy == page) {
> -			list_move(&page->lru, list);
> -			busy = NULL;
> -			spin_unlock_irqrestore(&zone->lru_lock, flags);
> -			continue;
> -		}
> -		spin_unlock_irqrestore(&zone->lru_lock, flags);
> -
> -		pc = lookup_page_cgroup(page);
> -
> -		if (mem_cgroup_move_parent(page, pc, memcg)) {
> -			/* found lock contention or "pc" is obsolete. */
> -			busy = page;
> -		} else
> -			busy = NULL;
> -		cond_resched();
> -	} while (!list_empty(list));
> -}
> -
> -/*
> - * make mem_cgroup's charge to be 0 if there is no task by moving
> - * all the charges and pages to the parent.
> - * This enables deleting this mem_cgroup.
> - *
> - * Caller is responsible for holding css reference on the memcg.
> - */
> -static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
> -{
> -	int node, zid;
> -
> -	do {
> -		/* This is for making all *used* pages to be on LRU. */
> -		lru_add_drain_all();
> -		drain_all_stock_sync(memcg);
> -		mem_cgroup_start_move(memcg);
> -		for_each_node_state(node, N_MEMORY) {
> -			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
> -				enum lru_list lru;
> -				for_each_lru(lru) {
> -					mem_cgroup_force_empty_list(memcg,
> -							node, zid, lru);
> -				}
> -			}
> -		}
> -		mem_cgroup_end_move(memcg);
> -		memcg_oom_recover(memcg);
> -		cond_resched();
> -
> -		/*
> -		 * Kernel memory may not necessarily be trackable to a specific
> -		 * process. So they are not migrated, and therefore we can't
> -		 * expect their value to drop to 0 here.
> -		 * Having res filled up with kmem only is enough.
> -		 *
> -		 * This is a safety check because mem_cgroup_force_empty_list
> -		 * could have raced with mem_cgroup_replace_page_cache callers
> -		 * so the lru seemed empty but the page could have been added
> -		 * right after the check. RES_USAGE should be safe as we always
> -		 * charge before adding to the LRU.
> -		 */
> -	} while (page_counter_read(&memcg->memory) -
> -		 page_counter_read(&memcg->kmem) > 0);
> -}
> -
>  /*
>   * Test whether @memcg has children, dead or alive. Note that this
>   * function doesn't care whether @memcg has use_hierarchy enabled and
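Also, to make the "leftover cache can just stay put" part explicit:
with the css_tryget() change above, the reclaim walk keeps returning
offlined descendants, so pressure on/inside the parent scans their
LRUs as usual. Roughly (a condensed paraphrase of the shrink_zone()
walk in mm/vmscan.c, reusing its locals and with the scan-control
bookkeeping elided):

	memcg = mem_cgroup_iter(root, NULL, &reclaim);
	do {
		struct lruvec *lruvec;
		int swappiness;

		lruvec = mem_cgroup_zone_lruvec(zone, memcg);
		swappiness = mem_cgroup_swappiness(memcg);

		/* an offlined child's lruvec still holds its pages */
		shrink_lruvec(lruvec, swappiness, sc);

		memcg = mem_cgroup_iter(root, memcg, &reclaim);
	} while (memcg);

So nothing is lost; the only user-visible difference is the local vs.
hierarchical counters change the changelog already calls out.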
> @@ -5306,7 +5134,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
>  {
>  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
>  	struct mem_cgroup_event *event, *tmp;
> -	struct cgroup_subsys_state *iter;
> 
>  	/*
>  	 * Unregister events and notify userspace.
> @@ -5320,13 +5147,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
>  	}
>  	spin_unlock(&memcg->event_list_lock);
> 
> -	/*
> -	 * This requires that offlining is serialized. Right now that is
> -	 * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
> -	 */
> -	css_for_each_descendant_post(iter, css)
> -		mem_cgroup_reparent_charges(mem_cgroup_from_css(iter));
> -
>  	memcg_unregister_all_caches(memcg);
>  	vmpressure_cleanup(&memcg->vmpressure);
>  }
> @@ -5334,42 +5154,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
>  static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
>  {
>  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
> -	/*
> -	 * XXX: css_offline() would be where we should reparent all
> -	 * memory to prepare the cgroup for destruction. However,
> -	 * memcg does not do css_tryget_online() and page_counter charging
> -	 * under the same RCU lock region, which means that charging
> -	 * could race with offlining. Offlining only happens to
> -	 * cgroups with no tasks in them but charges can show up
> -	 * without any tasks from the swapin path when the target
> -	 * memcg is looked up from the swapout record and not from the
> -	 * current task as it usually is. A race like this can leak
> -	 * charges and put pages with stale cgroup pointers into
> -	 * circulation:
> -	 *
> -	 * #0                        #1
> -	 *                           lookup_swap_cgroup_id()
> -	 *                           rcu_read_lock()
> -	 *                           mem_cgroup_lookup()
> -	 *                           css_tryget_online()
> -	 *                           rcu_read_unlock()
> -	 * disable css_tryget_online()
> -	 * call_rcu()
> -	 *   offline_css()
> -	 *     reparent_charges()
> -	 *                           page_counter_try_charge()
> -	 *                           css_put()
> -	 *                             css_free()
> -	 *                           pc->mem_cgroup = dead memcg
> -	 *                           add page to lru
> -	 *
> -	 * The bulk of the charges are still moved in offline_css() to
> -	 * avoid pinning a lot of pages in case a long-term reference
> -	 * like a swapout record is deferring the css_free() to long
> -	 * after offlining. But this makes sure we catch any charges
> -	 * made after offlining:
> -	 */
> -	mem_cgroup_reparent_charges(memcg);
> 
>  	memcg_destroy_kmem(memcg);
>  	__mem_cgroup_free(memcg);
> -- 
> 2.1.2
> 

-- 
Michal Hocko
SUSE Labs