To achieve this:

- We call update_tasks_cpumask/nodemask() for empty cpusets when
  hotplug happens, instead of moving tasks out of them.

- When a cpuset's masks are changed by writing cpuset.cpus/mems,
  we also update tasks in child cpusets which are empty.

Signed-off-by: Li Zefan <lizefan@xxxxxxxxxx>
---
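(A note for reviewers, not meant for the changelog: under sane_behavior,
the tasks of an empty cpuset are expected to run with the masks of the
nearest non-empty ancestor. Conceptually that is the lookup sketched
below; the function name is illustrative and it assumes the existing
parent_cs() helper. The patch achieves the same effect the other way
around, by pushing mask updates down to empty descendants whenever a
mask changes.)

	/* Illustrative only: find the nearest ancestor with CPUs. */
	static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs)
	{
		while (cpumask_empty(cs->cpus_allowed))
			cs = parent_cs(cs);	/* top_cpuset is never empty */
		return cs;
	}

The loop always terminates because top_cpuset retains all online CPUs.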
 kernel/cpuset.c | 171 +++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 146 insertions(+), 25 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 741e652..95e9394 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -99,6 +99,9 @@ struct cpuset {
 	 */
 	nodemask_t old_mems_allowed;
 
+	/* used in cpuset_update_nodemask_workfn() */
+	struct ptr_heap *heap;
+
 	struct fmeter fmeter;		/* memory_pressure filter */
 
 	/*
@@ -114,6 +117,7 @@ struct cpuset {
 	int relax_domain_level;
 
 	struct work_struct hotplug_work;
+	struct work_struct update_nodemask_work;
 };
 
 /* Retrieve the cpuset for a cgroup */
@@ -276,6 +280,8 @@ static struct cpuset top_cpuset = {
 static DEFINE_MUTEX(cpuset_mutex);
 static DEFINE_MUTEX(callback_mutex);
 
+static struct workqueue_struct *cpuset_update_nodemask_wq;
+
 /*
  * CPU / memory hotplug is handled asynchronously.
  */
@@ -877,6 +883,39 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
 	cgroup_scan_tasks(&scan);
 }
 
+/*
+ * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
+ * @root_cs: the root cpuset of the hierarchy
+ * @update_root: update the root cpuset or not?
+ * @heap: the heap used by cgroup_scan_tasks()
+ *
+ * This will update cpumasks of tasks in @root_cs and all other empty cpusets
+ * which take on the cpumask of @root_cs.
+ *
+ * Called with cpuset_mutex held.
+ */
+static void update_tasks_cpumask_hier(struct cpuset *root_cs,
+				      bool update_root, struct ptr_heap *heap)
+{
+	struct cpuset *cp;
+	struct cgroup *pos_cgrp;
+
+	if (update_root)
+		update_tasks_cpumask(root_cs, heap);
+
+	rcu_read_lock();
+	cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
+		/* skip the whole subtree if @cp has some CPUs */
+		if (!cpumask_empty(cp->cpus_allowed)) {
+			pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
+			continue;
+		}
+
+		update_tasks_cpumask(cp, heap);
+	}
+	rcu_read_unlock();
+}
+
 /**
  * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
  * @cs: the cpuset to consider
@@ -928,11 +967,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
 	mutex_unlock(&callback_mutex);
 
-	/*
-	 * Scan tasks in the cpuset, and update the cpumasks of any
-	 * that need an update.
-	 */
-	update_tasks_cpumask(cs, &heap);
+	update_tasks_cpumask_hier(cs, true, &heap);
 
 	heap_free(&heap);
 
@@ -1099,6 +1134,78 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
 	cpuset_being_rebound = NULL;
 }
 
+static void cpuset_update_nodemask_workfn(struct work_struct *work)
+{
+	struct cpuset *cs = container_of(work, struct cpuset,
+					 update_nodemask_work);
+
+	update_tasks_nodemask(cs, cs->heap);
+	css_put(&cs->css);
+}
+
+static void schedule_update_tasks_nodemask(struct cpuset *cs,
+					   struct ptr_heap *heap)
+{
+	bool queued;
+
+	/* Will be released when the work item finishes executing. */
+	if (!css_tryget(&cs->css))
+		return;
+
+	/*
+	 * The caller will flush the workqueue with cpuset_mutex held,
+	 * so it's not possible that a work item is already queued, and
+	 * we're sure cs->heap is valid.
+	 */
+	cs->heap = heap;
+	queued = queue_work(cpuset_update_nodemask_wq,
+			    &cs->update_nodemask_work);
+	if (!queued) {
+		WARN_ON(1);
+		css_put(&cs->css);
+	}
+}
+
+/*
+ * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
+ * @root_cs: the root cpuset of the hierarchy
+ * @update_root: update the root cpuset or not?
+ * @heap: the heap used by cgroup_scan_tasks()
+ *
+ * This will update nodemasks of tasks in @root_cs and all other empty cpusets
+ * which take on the nodemask of @root_cs.
+ *
+ * Called with cpuset_mutex held.
+ */
+static void update_tasks_nodemask_hier(struct cpuset *root_cs,
+				       bool update_root, struct ptr_heap *heap)
+{
+	struct cpuset *cp;
+	struct cgroup *pos_cgrp;
+
+	if (update_root)
+		update_tasks_nodemask(root_cs, heap);
+
+	rcu_read_lock();
+	cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
+		/* skip the whole subtree if @cp has some memory nodes */
+		if (!nodes_empty(cp->mems_allowed)) {
+			pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
+			continue;
+		}
+
+		schedule_update_tasks_nodemask(cp, heap);
+	}
+	rcu_read_unlock();
+
+	/*
+	 * The only reason we use a workqueue is that update_tasks_nodemask()
+	 * can't be called under rcu_read_lock(). Flush the workqueue to make
+	 * sure all the updates are done before we return.
+	 */
+	flush_workqueue(cpuset_update_nodemask_wq);
+}
+
 /*
  * Handle user request to change the 'mems' memory placement
  * of a cpuset. Needs to validate the request, update the
@@ -1163,7 +1270,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
 	cs->mems_allowed = trialcs->mems_allowed;
 	mutex_unlock(&callback_mutex);
 
-	update_tasks_nodemask(cs, &heap);
+	update_tasks_nodemask_hier(cs, true, &heap);
 
 	heap_free(&heap);
 done:
@@ -1888,6 +1995,7 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
 	nodes_clear(cs->mems_allowed);
 	fmeter_init(&cs->fmeter);
 	INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn);
+	INIT_WORK(&cs->update_nodemask_work, cpuset_update_nodemask_workfn);
 	cs->relax_domain_level = -1;
 
 	return &cs->css;
@@ -2063,31 +2171,36 @@ static void cpuset_propagate_hotplug_workfn(struct work_struct *work)
 	static nodemask_t off_mems;
 	struct cpuset *cs = container_of(work, struct cpuset, hotplug_work);
 	bool is_empty;
+	bool sane = cgroup_sane_behavior(cs->css.cgroup);
 
 	mutex_lock(&cpuset_mutex);
 
 	cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
 	nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
 
-	/* remove offline cpus from @cs */
-	if (!cpumask_empty(&off_cpus)) {
-		mutex_lock(&callback_mutex);
-		cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
-		mutex_unlock(&callback_mutex);
+	mutex_lock(&callback_mutex);
+	cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
+	mutex_unlock(&callback_mutex);
 
-		if (!cpumask_empty(cs->cpus_allowed))
-			update_tasks_cpumask(cs, NULL);
-	}
+	/*
+	 * If the sane_behavior flag is set, we also need to update tasks'
+	 * cpumask for an empty cpuset to take on its ancestor's cpumask.
+	 */
+	if ((sane && cpumask_empty(cs->cpus_allowed)) ||
+	    (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
+		update_tasks_cpumask(cs, NULL);
 
-	/* remove offline mems from @cs */
-	if (!nodes_empty(off_mems)) {
-		mutex_lock(&callback_mutex);
-		nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
-		mutex_unlock(&callback_mutex);
+	mutex_lock(&callback_mutex);
+	nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
+	mutex_unlock(&callback_mutex);
 
-		if (!nodes_empty(cs->mems_allowed))
-			update_tasks_nodemask(cs, NULL);
-	}
+	/*
+	 * If the sane_behavior flag is set, we also need to update tasks'
+	 * nodemask for an empty cpuset to take on its ancestor's nodemask.
+	 */
+	if ((sane && nodes_empty(cs->mems_allowed)) ||
+	    (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed)))
+		update_tasks_nodemask(cs, NULL);
 
 	is_empty = cpumask_empty(cs->cpus_allowed) ||
 		nodes_empty(cs->mems_allowed);
@@ -2095,11 +2208,13 @@ static void cpuset_propagate_hotplug_workfn(struct work_struct *work)
 	mutex_unlock(&cpuset_mutex);
 
 	/*
-	 * If @cs became empty, move tasks to the nearest ancestor with
-	 * execution resources. This is full cgroup operation which will
+	 * If the sane_behavior flag is set, we'll keep tasks in empty cpusets.
+	 *
+	 * Otherwise move tasks to the nearest ancestor with execution
+	 * resources. This is a full cgroup operation which will
 	 * also call back into cpuset. Should be done outside any lock.
 	 */
-	if (is_empty)
+	if (!sane && is_empty)
 		remove_tasks_in_empty_cpuset(cs);
 
 	/* the following may free @cs, should be the last operation */
@@ -2174,6 +2289,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 		cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
 		mutex_unlock(&callback_mutex);
 		/* we don't mess with cpumasks of tasks in top_cpuset */
+		update_tasks_cpumask_hier(&top_cpuset, false, NULL);
 	}
 
 	/* synchronize mems_allowed to N_MEMORY */
@@ -2182,6 +2298,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 		top_cpuset.mems_allowed = new_mems;
 		mutex_unlock(&callback_mutex);
 		update_tasks_nodemask(&top_cpuset, NULL);
+		update_tasks_nodemask_hier(&top_cpuset, false, NULL);
 	}
 
 	/* if cpus or mems went down, we need to propagate to descendants */
@@ -2261,6 +2378,10 @@ void __init cpuset_init_smp(void)
 	cpuset_propagate_hotplug_wq =
 		alloc_ordered_workqueue("cpuset_hotplug", 0);
 	BUG_ON(!cpuset_propagate_hotplug_wq);
+
+	cpuset_update_nodemask_wq =
+		create_workqueue("cpuset_update_nodemask");
+	BUG_ON(!cpuset_update_nodemask_wq);
 }
 
 /**
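(Another reviewer note, outside the patch: the two halves of the
nodemask-update handoff, sketched side by side with illustrative names.
The css reference taken before queue_work() is owned by the work item
and dropped by the work function; only on a failed queue_work() does
the scheduling side drop it itself, and the flush under cpuset_mutex
is what rules that failure out in practice.)

	static void nodemask_workfn_sketch(struct work_struct *work)
	{
		struct cpuset *cs = container_of(work, struct cpuset,
						 update_nodemask_work);

		update_tasks_nodemask(cs, cs->heap);	/* may sleep */
		css_put(&cs->css);	/* ref taken before queueing */
	}

	static void schedule_sketch(struct cpuset *cs, struct ptr_heap *heap)
	{
		if (!css_tryget(&cs->css))	/* cpuset being destroyed */
			return;

		cs->heap = heap;	/* stable until the caller's flush */
		if (!queue_work(cpuset_update_nodemask_wq,
				&cs->update_nodemask_work)) {
			WARN_ON(1);		/* can't be queued twice */
			css_put(&cs->css);	/* keep the ref balanced */
		}
	}

The workqueue exists only because update_tasks_nodemask() may sleep and
therefore can't run under rcu_read_lock().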
-- 
1.8.0.2

_______________________________________________
Containers mailing list
Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx
https://lists.linuxfoundation.org/mailman/listinfo/containers