It wasn't explicitly documented but, when a process is being migrated, cpuset and memcg depend on cgroup_taskset_first() returning the threadgroup leader; however, this implicit ordering dependency is fragile and would no longer work for the planned multi-process migration. This patch introduces an explicit cgroup_taskset_for_each_leader() which iterates over only the threadgroup leaders and replaces cgroup_taskset_first() usages for accessing the leader with it. This prepares both memcg and cpuset for multi-process migration. This patch also updates the documentation for cgroup_taskset_for_each() to clarify the iteration rules and removes comments mentioning task ordering in tasksets. v2: A previous patch which added threadgroup leader test was dropped. Patch updated accordingly. Signed-off-by: Tejun Heo <tj@xxxxxxxxxx> Acked-by: Zefan Li <lizefan@xxxxxxxxxx> Cc: Johannes Weiner <hannes@xxxxxxxxxxx> Cc: Michal Hocko <mhocko@xxxxxxx> --- include/linux/cgroup.h | 22 ++++++++++++++++++++++ kernel/cgroup.c | 11 ----------- kernel/cpuset.c | 9 ++++----- mm/memcontrol.c | 17 +++++++++++++++-- 4 files changed, 41 insertions(+), 18 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index eb7ca55..916a1e0 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -211,11 +211,33 @@ void css_task_iter_end(struct css_task_iter *it); * cgroup_taskset_for_each - iterate cgroup_taskset * @task: the loop cursor * @tset: taskset to iterate + * + * @tset may contain multiple tasks and they may belong to multiple + * processes. When there are multiple tasks in @tset, if a task of a + * process is in @tset, all tasks of the process are in @tset. Also, all + * are guaranteed to share the same source and destination csses. + * + * Iteration is not in any specific order. 
*/ #define cgroup_taskset_for_each(task, tset) \ for ((task) = cgroup_taskset_first((tset)); (task); \ (task) = cgroup_taskset_next((tset))) +/** + * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset + * @leader: the loop cursor + * @tset: taskset to iterate + * + * Iterate threadgroup leaders of @tset. For single-task migrations, @tset + * may not contain any. + */ +#define cgroup_taskset_for_each_leader(leader, tset) \ + for ((leader) = cgroup_taskset_first((tset)); (leader); \ + (leader) = cgroup_taskset_next((tset))) \ + if ((leader) != (leader)->group_leader) \ + ; \ + else + /* * Inline functions. */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2cf0f79..0b732dd 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2083,13 +2083,6 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, get_css_set(new_cset); rcu_assign_pointer(tsk->cgroups, new_cset); - - /* - * Use move_tail so that cgroup_taskset_first() still returns the - * leader after migration. This works because cgroup_migrate() - * ensures that the dst_cset of the leader is the first on the - * tset's dst_csets list. - */ list_move_tail(&tsk->cg_list, &new_cset->mg_tasks); /* @@ -2285,10 +2278,6 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, if (!cset->mg_src_cgrp) goto next; - /* - * cgroup_taskset_first() must always return the leader. - * Take care to avoid disturbing the ordering. 
- */ list_move_tail(&task->cg_list, &cset->mg_tasks); if (list_empty(&cset->mg_node)) list_add_tail(&cset->mg_node, &tset.src_csets); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 09393f6..e7afde6 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1485,7 +1485,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css, /* static buf protected by cpuset_mutex */ static nodemask_t cpuset_attach_nodemask_to; struct task_struct *task; - struct task_struct *leader = cgroup_taskset_first(tset); + struct task_struct *leader; struct cpuset *cs = css_cs(css); struct cpuset *oldcs = cpuset_attach_old_cs; @@ -1511,12 +1511,11 @@ static void cpuset_attach(struct cgroup_subsys_state *css, } /* - * Change mm, possibly for multiple threads in a threadgroup. This - * is expensive and may sleep and should be moved outside migration - * path proper. + * Change mm for all threadgroup leaders. This is expensive and may + * sleep and should be moved outside migration path proper. */ cpuset_attach_nodemask_to = cs->effective_mems; - if (thread_group_leader(leader)) { + cgroup_taskset_for_each_leader(leader, tset) { struct mm_struct *mm = get_task_mm(leader); if (mm) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6ddaeba..32b6bfd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4829,7 +4829,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, { struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup *from; - struct task_struct *p; + struct task_struct *leader, *p; struct mm_struct *mm; unsigned long move_flags; int ret = 0; @@ -4843,7 +4843,20 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, if (!move_flags) return 0; - p = cgroup_taskset_first(tset); + /* + * Multi-process migrations only happen on the default hierarchy + * where charge immigration is not used. Perform charge + * immigration if @tset contains a leader and whine if there are + * multiple. 
+ */ + p = NULL; + cgroup_taskset_for_each_leader(leader, tset) { + WARN_ON_ONCE(p); + p = leader; + } + if (!p) + return 0; + from = mem_cgroup_from_task(p); VM_BUG_ON(from == memcg); -- 2.4.3 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href="mailto:dont@xxxxxxxxx">email@xxxxxxxxx</a>