On Fri 07-02-14 11:43:21, Johannes Weiner wrote:
[...]
> Long-term, I think we may want to get rid of the reparenting in
> css_offline() entirely and only do it in the naturally ordered
> css_free() callback. We only reparent in css_offline() because
> swapout records pin the css and we don't want to hang on to
> potentially gigabytes of unreclaimable (css_tryget() disabled) cache
> indefinitely until the last swapout records disappear.
> 
> So I'm currently mucking around with the following patch, which drops
> the css pin from swapout records and reparents them in css_free().
> It's lightly tested and there might well be dragons but I don't see
> any fundamental problems with it.
> 
> What do you think?

Hugh posted something like this back in December
(http://marc.info/?l=linux-mm&m=138718299304670&w=2).

I am not entirely happy about scanning a potentially huge number of
swap records. A trivial optimization would be to check the memsw
counter and break out early, but that would still leave a full scan
possible in the worst case.
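Just to show what I mean, a completely untested sketch. The expected
parameter is made up here; it would have to be derived from the group's
MEM_CGROUP_STAT_SWAP counter (or the memsw vs. res usage difference)
before the call:

/*
 * Same scan as swap_cgroup_migrate() in the patch below, but the
 * caller passes in how many records still point to the dying group
 * so that we can stop as soon as all of them have been migrated.
 */
unsigned long swap_cgroup_migrate(unsigned short old, unsigned short new,
				  unsigned long expected)
{
	unsigned long migrated = 0;
	unsigned int type;

	for (type = 0; type < MAX_SWAPFILES && migrated < expected; type++) {
		struct swap_cgroup_ctrl *ctrl;
		unsigned long flags;
		unsigned int page;

		ctrl = &swap_cgroup_ctrl[type];
		spin_lock_irqsave(&ctrl->lock, flags);
		for (page = 0; page < ctrl->length; page++) {
			struct swap_cgroup *base;
			pgoff_t offset;

			base = page_address(ctrl->map[page]);
			for (offset = 0; offset < SC_PER_PAGE; offset++) {
				if (base[offset].id == old) {
					base[offset].id = new;
					migrated++;
				}
			}
			/* All known records migrated, skip the rest. */
			if (migrated >= expected)
				break;
		}
		spin_unlock_irqrestore(&ctrl->lock, flags);
	}
	return migrated;
}

If the last record sits at the very end of the last swap area we still
end up scanning everything, so this only helps the average case.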
That said, I guess we shouldn't care much about when css_free is
called. I think we should split the reparenting into two parts: an LRU
reparent without any hard guarantees, and a charge reparent which makes
sure that nothing is left behind. The first one would be called from
css_offline and the second one from css_free. Btw. kmem accounting pins
the memcg as well, so taking care of swap alone is not sufficient to
guarantee an early css_free.
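Roughly like this, reusing the helpers from your patch (not even
compile tested, just to show where the two passes would live):

static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	kmem_cgroup_css_offline(memcg);
	mem_cgroup_invalidate_reclaim_iterators(memcg);
	/*
	 * Opportunistic pass: push what we can to the parent right
	 * away so that we do not sit on a lot of unreclaimable
	 * memory, but without any completeness guarantee, because
	 * charges can still race with the offlining.
	 */
	mem_cgroup_reparent_charges(memcg);
	mem_cgroup_destroy_all_caches(memcg);
	vmpressure_cleanup(&memcg->vmpressure);
}

static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	/*
	 * Guaranteed pass: the last reference is gone, so nothing can
	 * race with us anymore. Whatever is left behind, whether it
	 * was pinned by swapout records or by kmem, has to be
	 * reparented here for good.
	 */
	mem_cgroup_reparent_charges(memcg);
	memcg_destroy_kmem(memcg);
	/* ... and the rest of the teardown as we have it today ... */
}

The offline pass only bounds the memory footprint of a dying group;
the free pass is the one that has to be correct.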
> ---
>  include/linux/page_cgroup.h |   8 ++++
>  mm/memcontrol.c             | 101 +++++++++++++-------------------------------
>  mm/page_cgroup.c            |  41 ++++++++++++++++++
>  3 files changed, 78 insertions(+), 72 deletions(-)
> 
> diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
> index 777a524716db..3ededda8934f 100644
> --- a/include/linux/page_cgroup.h
> +++ b/include/linux/page_cgroup.h
> @@ -111,6 +111,8 @@ extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
>  					unsigned short old, unsigned short new);
>  extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id);
>  extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent);
> +extern unsigned long swap_cgroup_migrate(unsigned short old,
> +					 unsigned short new);
>  extern int swap_cgroup_swapon(int type, unsigned long max_pages);
>  extern void swap_cgroup_swapoff(int type);
>  #else
> @@ -127,6 +129,12 @@ unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
>  	return 0;
>  }
>  
> +static inline unsigned long swap_cgroup_migrate(unsigned short old,
> +						unsigned short new)
> +{
> +	return 0;
> +}
> +
>  static inline int
>  swap_cgroup_swapon(int type, unsigned long max_pages)
>  {
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 53385cd4e6f0..e2a0d3986c74 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -892,11 +892,9 @@ static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
>  	return val;
>  }
>  
> -static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
> -					 bool charge)
> +static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, long nr_pages)
>  {
> -	int val = (charge) ? 1 : -1;
> -	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
> +	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], nr_pages);
>  }
>  
>  static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
> @@ -4234,15 +4232,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
>  	 */
>  
>  	unlock_page_cgroup(pc);
> -	/*
> -	 * even after unlock, we have memcg->res.usage here and this memcg
> -	 * will never be freed, so it's safe to call css_get().
> -	 */
> +
>  	memcg_check_events(memcg, page);
> -	if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
> -		mem_cgroup_swap_statistics(memcg, true);
> -		css_get(&memcg->css);
> -	}
> +
> +	if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
> +		mem_cgroup_swap_statistics(memcg, 1);
> +
>  	/*
>  	 * Migration does not charge the res_counter for the
>  	 * replacement page, so leave it alone when phasing out the
> @@ -4351,10 +4346,6 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
>  
>  	memcg = __mem_cgroup_uncharge_common(page, ctype, false);
>  
> -	/*
> -	 * record memcg information, if swapout && memcg != NULL,
> -	 * css_get() was called in uncharge().
> -	 */
>  	if (do_swap_account && swapout && memcg)
>  		swap_cgroup_record(ent, mem_cgroup_id(memcg));
>  }
> @@ -4383,8 +4374,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
>  		 */
>  		if (!mem_cgroup_is_root(memcg))
>  			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
> -		mem_cgroup_swap_statistics(memcg, false);
> -		css_put(&memcg->css);
> +		mem_cgroup_swap_statistics(memcg, -1);
>  	}
>  	rcu_read_unlock();
>  }
> @@ -4412,20 +4402,8 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
>  	new_id = mem_cgroup_id(to);
>  
>  	if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
> -		mem_cgroup_swap_statistics(from, false);
> -		mem_cgroup_swap_statistics(to, true);
> -		/*
> -		 * This function is only called from task migration context now.
> -		 * It postpones res_counter and refcount handling till the end
> -		 * of task migration(mem_cgroup_clear_mc()) for performance
> -		 * improvement. But we cannot postpone css_get(to) because if
> -		 * the process that has been moved to @to does swap-in, the
> -		 * refcount of @to might be decreased to 0.
> -		 *
> -		 * We are in attach() phase, so the cgroup is guaranteed to be
> -		 * alive, so we can just call css_get().
> -		 */
> -		css_get(&to->css);
> +		mem_cgroup_swap_statistics(from, -1);
> +		mem_cgroup_swap_statistics(to, 1);
>  		return 0;
>  	}
>  	return -EINVAL;
> @@ -6611,7 +6589,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
>  	kmem_cgroup_css_offline(memcg);
>  
>  	mem_cgroup_invalidate_reclaim_iterators(memcg);
> -	mem_cgroup_reparent_charges(memcg);
>  	mem_cgroup_destroy_all_caches(memcg);
>  	vmpressure_cleanup(&memcg->vmpressure);
>  }
> @@ -6619,41 +6596,26 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
>  static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
>  {
>  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
> +	unsigned long swaps_moved;
> +	struct mem_cgroup *parent;
> +
>  	/*
> -	 * XXX: css_offline() would be where we should reparent all
> -	 * memory to prepare the cgroup for destruction. However,
> -	 * memcg does not do css_tryget() and res_counter charging
> -	 * under the same RCU lock region, which means that charging
> -	 * could race with offlining. Offlining only happens to
> -	 * cgroups with no tasks in them but charges can show up
> -	 * without any tasks from the swapin path when the target
> -	 * memcg is looked up from the swapout record and not from the
> -	 * current task as it usually is. A race like this can leak
> -	 * charges and put pages with stale cgroup pointers into
> -	 * circulation:
> -	 *
> -	 * #0                        #1
> -	 *                           lookup_swap_cgroup_id()
> -	 *                           rcu_read_lock()
> -	 *                           mem_cgroup_lookup()
> -	 *                           css_tryget()
> -	 *                           rcu_read_unlock()
> -	 * disable css_tryget()
> -	 * call_rcu()
> -	 *   offline_css()
> -	 *     reparent_charges()
> -	 *                           res_counter_charge()
> -	 *                           css_put()
> -	 *                             css_free()
> -	 *                           pc->mem_cgroup = dead memcg
> -	 *                           add page to lru
> -	 *
> -	 * The bulk of the charges are still moved in offline_css() to
> -	 * avoid pinning a lot of pages in case a long-term reference
> -	 * like a swapout record is deferring the css_free() to long
> -	 * after offlining. But this makes sure we catch any charges
> -	 * made after offlining:
> +	 * Migrate all swap entries to the parent. There are no more
> +	 * references to the css, so no new swap out records can show
> +	 * up. Any concurrently faulting pages will either get this
> +	 * offline cgroup and charge the faulting task, or get the
> +	 * migrated parent id and charge the parent for the in-memory
> +	 * page. uncharge_swap() will balance the res_counter in the
> +	 * parent either way, whether it still fixes this group's
> +	 * res_counter is irrelevant at this point.
>  	 */
> +	parent = parent_mem_cgroup(memcg);
> +	if (!parent)
> +		parent = root_mem_cgroup;
> +	swaps_moved = swap_cgroup_migrate(mem_cgroup_id(memcg),
> +					  mem_cgroup_id(parent));
> +	mem_cgroup_swap_statistics(parent, swaps_moved);
> +
>  	mem_cgroup_reparent_charges(memcg);
>  
>  	memcg_destroy_kmem(memcg);
> @@ -6966,7 +6928,6 @@ static void __mem_cgroup_clear_mc(void)
>  {
>  	struct mem_cgroup *from = mc.from;
>  	struct mem_cgroup *to = mc.to;
> -	int i;
>  
>  	/* we must uncharge all the leftover precharges from mc.to */
>  	if (mc.precharge) {
> @@ -6981,16 +6942,13 @@ static void __mem_cgroup_clear_mc(void)
>  		__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
>  		mc.moved_charge = 0;
>  	}
> -	/* we must fixup refcnts and charges */
> +	/* we must fixup charges */
>  	if (mc.moved_swap) {
>  		/* uncharge swap account from the old cgroup */
>  		if (!mem_cgroup_is_root(mc.from))
>  			res_counter_uncharge(&mc.from->memsw,
>  					     PAGE_SIZE * mc.moved_swap);
>  
> -		for (i = 0; i < mc.moved_swap; i++)
> -			css_put(&mc.from->css);
> -
>  		if (!mem_cgroup_is_root(mc.to)) {
>  			/*
>  			 * we charged both to->res and to->memsw, so we should
> @@ -6999,7 +6957,6 @@
>  			res_counter_uncharge(&mc.to->res,
>  					     PAGE_SIZE * mc.moved_swap);
>  		}
> -		/* we've already done css_get(mc.to) */
>  		mc.moved_swap = 0;
>  	}
>  	memcg_oom_recover(from);
> diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
> index cfd162882c00..ca04feaae7fe 100644
> --- a/mm/page_cgroup.c
> +++ b/mm/page_cgroup.c
> @@ -459,6 +459,47 @@ unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
>  	return lookup_swap_cgroup(ent, NULL)->id;
>  }
>  
> +/**
> + * swap_cgroup_migrate - migrate all entries of one id to another
> + * @old: old id
> + * @new: new id
> + *
> + * Caller has to be able to deal with swapon/swapoff racing.
> + *
> + * Returns number of migrated entries.
> + */
> +unsigned long swap_cgroup_migrate(unsigned short old, unsigned short new)
> +{
> +	unsigned long migrated = 0;
> +	unsigned int type;
> +
> +	for (type = 0; type < MAX_SWAPFILES; type++) {
> +		struct swap_cgroup_ctrl *ctrl;
> +		unsigned long flags;
> +		unsigned int page;
> +
> +		ctrl = &swap_cgroup_ctrl[type];
> +		spin_lock_irqsave(&ctrl->lock, flags);
> +		for (page = 0; page < ctrl->length; page++) {
> +			struct swap_cgroup *base;
> +			pgoff_t offset;
> +
> +			base = page_address(ctrl->map[page]);
> +			for (offset = 0; offset < SC_PER_PAGE; offset++) {
> +				struct swap_cgroup *sc;
> +
> +				sc = base + offset;
> +				if (sc->id == old) {
> +					sc->id = new;
> +					migrated++;
> +				}
> +			}
> +		}
> +		spin_unlock_irqrestore(&ctrl->lock, flags);
> +	}
> +	return migrated;
> +}
> +
>  int swap_cgroup_swapon(int type, unsigned long max_pages)
>  {
>  	void *array;
> -- 
> 1.8.5.3

-- 
Michal Hocko
SUSE Labs