Re: [PATCH -mm] slab: fix cpuset check in fallback_alloc

Li Zefan <lizefan@xxxxxxxxxx> · Fri, 15 Aug 2014 11:13:05 +0800

On 2014/8/12 5:05, David Rientjes wrote:
> On Mon, 11 Aug 2014, Vladimir Davydov wrote:
> 
>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>> --- a/mm/page_alloc.c
>>> +++ b/mm/page_alloc.c
>>> @@ -1963,7 +1963,7 @@ zonelist_scan:
>>>  
>>>  	/*
>>>  	 * Scan zonelist, looking for a zone with enough free.
>>> -	 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
>>> +	 * See __cpuset_node_allowed() comment in kernel/cpuset.c.
>>>  	 */
>>>  	for_each_zone_zonelist_nodemask(zone, z, zonelist,
>>>  						high_zoneidx, nodemask) {
>>> @@ -1974,7 +1974,7 @@ zonelist_scan:
>>>  				continue;
>>>  		if (cpusets_enabled() &&
>>>  			(alloc_flags & ALLOC_CPUSET) &&
>>> -			!cpuset_zone_allowed_softwall(zone, gfp_mask))
>>> +			!cpuset_zone_allowed(zone, gfp_mask))
>>>  				continue;
>>
>> So, this is get_page_from_freelist. It's called from
>> __alloc_pages_nodemask with alloc_flags always having ALLOC_CPUSET bit
>> set and from __alloc_pages_slowpath with alloc_flags having ALLOC_CPUSET
>> bit set only for __GFP_WAIT allocations. That said, w/o your patch we
>> try to respect cpusets for all allocations, including atomic, and only
>> ignore cpusets if tight on memory (freelist's empty) for !__GFP_WAIT
>> allocations, while with your patch we always ignore cpusets for
>> !__GFP_WAIT allocations. Not sure if it really matters though, because
>> usually one uses cpuset.mems in conjunction with cpuset.cpus and it
>> won't make any difference then. It also doesn't conflict with any cpuset
>> documentation.
>>
> 
> Yeah, that's why I'm asking Li, the cpuset maintainer, if we can do this.  

I'm not quite sure. That code has been there before I got involved in cpuset.

> The only thing that we get by falling back to the page allocator slowpath 
> is that kswapd gets woken up before the allocation is attempted without 
> ALLOC_CPUSET.  It seems pointless to wakeup kswapd when the allocation can 
> succeed on any node.  Even with the patch, if the allocation fails because 
> all nodes are below their min watermark, then we still fallback to the 
> slowpath and wake up kswapd but there's nothing much else we can do 
> because it's !__GFP_WAIT.
> .

But I tend to agree with you. But if we want to do this, we should split this
change from the cleanup.

Regarding to the cleanup, I found there used to be a single cpuset_node_allowed(),
and your cleanup is exactly a revert of that ancient commit:

commit 02a0e53d8227aff5e62e0433f82c12c1c2805fd6
Author: Paul Jackson <pj@xxxxxxx>
Date:   Wed Dec 13 00:34:25 2006 -0800

    [PATCH] cpuset: rework cpuset_zone_allowed api

Seems the major intention was to avoid accident sleep-in-atomic bugs, because
callback_mutex might be held.

I don't see there's any reason callback_mutex can't be a spinlock. I thought
about this when Gu Zhen fixed the bug that callback_mutex is nested inside
rcu_read_lock().

--
 kernel/cpuset.c | 81 ++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 49 insertions(+), 32 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index baa155c..9d9e239 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -284,7 +284,7 @@ static struct cpuset top_cpuset = {
  */
 
 static DEFINE_MUTEX(cpuset_mutex);
-static DEFINE_MUTEX(callback_mutex);
+static DEFINE_SPINLOCK(callback_lock);
 
 /*
  * CPU / memory hotplug is handled asynchronously.
@@ -848,6 +848,7 @@ static void update_tasks_cpumask(struct cpuset *cs)
  */
 static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 {
+	unsigned long flags;
 	struct cpuset *cp;
 	struct cgroup_subsys_state *pos_css;
 	bool need_rebuild_sched_domains = false;
@@ -875,9 +876,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 			continue;
 		rcu_read_unlock();
 
-		mutex_lock(&callback_mutex);
+		spin_lock_irqsave(&callback_lock, flags);
 		cpumask_copy(cp->effective_cpus, new_cpus);
-		mutex_unlock(&callback_mutex);
+		spin_unlock_irqrestore(&callback_lock, flags);
 
 		WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
 			!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
@@ -910,6 +911,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 			  const char *buf)
 {
+	unsigned long flags;
 	int retval;
 
 	/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
@@ -942,9 +944,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	if (retval < 0)
 		return retval;
 
-	mutex_lock(&callback_mutex);
+	spin_lock_irqsave(&callback_lock, flags);
 	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irqrestore(&callback_lock, flags);
 
 	/* use trialcs->cpus_allowed as a temp variable */
 	update_cpumasks_hier(cs, trialcs->cpus_allowed);
@@ -1105,6 +1107,7 @@ static void update_tasks_nodemask(struct cpuset *cs)
  */
 static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
 {
+	unsigned long flags;
 	struct cpuset *cp;
 	struct cgroup_subsys_state *pos_css;
 
@@ -1132,8 +1135,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
 		rcu_read_unlock();
 
 		mutex_lock(&callback_mutex);
+		spin_lock_irqsave(&callback_lock, flags);
 		cp->effective_mems = *new_mems;
-		mutex_unlock(&callback_mutex);
+		spin_unlock_irqrestore(&callback_lock, flags);
 
 		WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
 			!nodes_equal(cp->mems_allowed, cp->effective_mems));
@@ -1162,6 +1166,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
 static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
 			   const char *buf)
 {
+	unsigned long flags;
 	int retval;
 
 	/*
@@ -1201,9 +1206,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
 	if (retval < 0)
 		goto done;
 
-	mutex_lock(&callback_mutex);
+	spin_lock_irqsave(&callback_lock, flags);
 	cs->mems_allowed = trialcs->mems_allowed;
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irqrestore(&callback_lock, flags);
 
 	/* use trialcs->mems_allowed as a temp variable */
 	update_nodemasks_hier(cs, &cs->mems_allowed);
@@ -1264,6 +1269,7 @@ static void update_tasks_flags(struct cpuset *cs)
 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 		       int turning_on)
 {
+	unsigned long flags;
 	struct cpuset *trialcs;
 	int balance_flag_changed;
 	int spread_flag_changed;
@@ -1288,9 +1294,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 	spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
 			|| (is_spread_page(cs) != is_spread_page(trialcs)));
 
-	mutex_lock(&callback_mutex);
+	spin_lock_irqsave(&callback_lock, flags);
 	cs->flags = trialcs->flags;
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irqrestore(&callback_lock, flags);
 
 	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
 		rebuild_sched_domains_locked();
@@ -1690,11 +1696,12 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
 	ssize_t count;
 	char *buf, *s;
 	int ret = 0;
+	unsigned long flags;
 
 	count = seq_get_buf(sf, &buf);
 	s = buf;
 
-	mutex_lock(&callback_mutex);
+	spin_lock_irqsave(&callback_mutex, flags);
 
 	switch (type) {
 	case FILE_CPULIST:
@@ -1721,7 +1728,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
 		seq_commit(sf, -1);
 	}
 out_unlock:
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irqstore(&callback_lock, flags);
 	return ret;
 }
 
@@ -1924,6 +1931,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 	struct cpuset *parent = parent_cs(cs);
 	struct cpuset *tmp_cs;
 	struct cgroup_subsys_state *pos_css;
+	unsigned long flags;
 
 	if (!parent)
 		return 0;
@@ -1938,12 +1946,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 
 	cpuset_inc();
 
-	mutex_lock(&callback_mutex);
+	spin_lock_irqsave(&callback_lock, flags);
 	if (cgroup_on_dfl(cs->css.cgroup)) {
 		cpumask_copy(cs->effective_cpus, parent->effective_cpus);
 		cs->effective_mems = parent->effective_mems;
 	}
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irqrestore(&callback_lock, flags);
 
 	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
 		goto out_unlock;
@@ -1970,10 +1978,10 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 	}
 	rcu_read_unlock();
 
-	mutex_lock(&callback_mutex);
+	spin_lock_irqsave(&callback_lock, flags);
 	cs->mems_allowed = parent->mems_allowed;
 	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
-	mutex_unlock(&callback_mutex);
+	spin_lock_irqrestore(&callback_lock, flags);
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
 	return 0;
@@ -2011,8 +2019,10 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
 
 static void cpuset_bind(struct cgroup_subsys_state *root_css)
 {
+	unsigned long flags;
+
 	mutex_lock(&cpuset_mutex);
-	mutex_lock(&callback_mutex);
+	spin_lock_irqsave(&callback_lock, flags);
 
 	if (cgroup_on_dfl(root_css->cgroup)) {
 		cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
@@ -2023,7 +2033,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
 		top_cpuset.mems_allowed = top_cpuset.effective_mems;
 	}
 
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irqrestore(&callback_lock, flags);
 	mutex_unlock(&cpuset_mutex);
 }
 
@@ -2107,13 +2117,14 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
 			    bool cpus_updated, bool mems_updated)
 {
 	bool is_empty;
+	unsigned long flags;
 
-	mutex_lock(&callback_mutex);
+	spin_lock_irqsave(&callback_lock, flags);
 	cpumask_copy(cs->cpus_allowed, new_cpus);
 	cpumask_copy(cs->effective_cpus, new_cpus);
 	cs->mems_allowed = *new_mems;
 	cs->effective_mems = *new_mems;
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irqrestore(&callback_lock, flags);
 
 	/*
 	 * Don't call update_tasks_cpumask() if the cpuset becomes empty,
@@ -2145,15 +2156,17 @@ hotplug_update_tasks(struct cpuset *cs,
 		     struct cpumask *new_cpus, nodemask_t *new_mems,
 		     bool cpus_updated, bool mems_updated)
 {
+	unsigned long flags;
+
 	if (cpumask_empty(new_cpus))
 		cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
 	if (nodes_empty(*new_mems))
 		*new_mems = parent_cs(cs)->effective_mems;
 
-	mutex_lock(&callback_mutex);
+	spin_lock_irqsave(&callback_lock, flags);
 	cpumask_copy(cs->effective_cpus, new_cpus);
 	cs->effective_mems = *new_mems;
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irqrestore(&callback_lock, flags);
 
 	if (cpus_updated)
 		update_tasks_cpumask(cs);
@@ -2227,6 +2240,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 	static nodemask_t new_mems;
 	bool cpus_updated, mems_updated;
 	bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup);
+	unsigned long flags;
 
 	mutex_lock(&cpuset_mutex);
 
@@ -2239,21 +2253,21 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 
 	/* synchronize cpus_allowed to cpu_active_mask */
 	if (cpus_updated) {
-		mutex_lock(&callback_mutex);
+		spin_lock_irqsave(&callback_lock, flags);
 		if (!on_dfl)
 			cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
 		cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
-		mutex_unlock(&callback_mutex);
+		spin_unlock_irqrestore(&callback_lock, flags);
 		/* we don't mess with cpumasks of tasks in top_cpuset */
 	}
 
 	/* synchronize mems_allowed to N_MEMORY */
 	if (mems_updated) {
-		mutex_lock(&callback_mutex);
+		spin_lock_irqsave(&callback_lock, flags);
 		if (!on_dfl)
 			top_cpuset.mems_allowed = new_mems;
 		top_cpuset.effective_mems = new_mems;
-		mutex_unlock(&callback_mutex);
+		spin_unlock_irqrestore(&callback_lock, flags);
 		update_tasks_nodemask(&top_cpuset);
 	}
 
@@ -2346,11 +2360,13 @@ void __init cpuset_init_smp(void)
 
 void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
 {
-	mutex_lock(&callback_mutex);
+	unsigned long flags;
+
+	spin_lock_irqsave(&callback_lock, flags);
 	rcu_read_lock();
 	guarantee_online_cpus(task_cs(tsk), pmask);
 	rcu_read_unlock();
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irqrestore(&callback_lock, flags);
 }
 
 void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
@@ -2396,12 +2412,13 @@ void cpuset_init_current_mems_allowed(void)
 nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
 {
 	nodemask_t mask;
+	unsigned long flags;
 
-	mutex_lock(&callback_mutex);
+	spin_lock_irqsave(&callback_lock, flags);
 	rcu_read_lock();
 	guarantee_online_mems(task_cs(tsk), &mask);
 	rcu_read_unlock();
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irqrestore(&callback_lock, flags);
 
 	return mask;
 }
@@ -2514,14 +2531,14 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
 		return 1;
 
 	/* Not hardwall and node outside mems_allowed: scan up cpusets */
-	mutex_lock(&callback_mutex);
+	spin_lock_irqsave(&callback_mutex, flags);
 
 	rcu_read_lock();
 	cs = nearest_hardwall_ancestor(task_cs(current));
 	allowed = node_isset(node, cs->mems_allowed);
 	rcu_read_unlock();
 
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irqrestore(&callback_lock, flags);
 	return allowed;
 }
 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>