>From f9f9e7b776142fb1c0782cade004cc8e0147a199 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@xxxxxxxxxx> Date: Wed, 16 Sep 2015 11:51:12 -0400 This reverts commit b5ba75b5fc0e8404e2c50cb68f39bb6a53fc916f. d59cfc09c32a ("sched, cgroup: replace signal_struct->group_rwsem with a global percpu_rwsem") and b5ba75b5fc0e ("cgroup: simplify threadgroup locking") changed how cgroup synchronizes against task fork and exits so that it uses global percpu_rwsem instead of per-process rwsem; unfortunately, the write [un]lock paths of percpu_rwsem always involve synchronize_rcu_expedited() which turned out to be too expensive. Improvements for percpu_rwsem are scheduled to be merged in the coming v4.4-rc1 merge window which alleviates this issue. For now, revert the two commits to restore per-process rwsem. They will be re-applied for the v4.4-rc1 merge window. Signed-off-by: Tejun Heo <tj@xxxxxxxxxx> Link: http://lkml.kernel.org/g/55F8097A.7000206@xxxxxxxxxx Reported-by: Christian Borntraeger <borntraeger@xxxxxxxxxx> Cc: Oleg Nesterov <oleg@xxxxxxxxxx> Cc: "Paul E. McKenney" <paulmck@xxxxxxxxxxxxxxxxxx> Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx> Cc: Paolo Bonzini <pbonzini@xxxxxxxxxx> Cc: stable@xxxxxxxxxxxxxxx # v4.2+ --- Hello, These are the two reverts that I'm pushing through cgroup/for-4.3-fixes. I'll re-apply the reverted patches on the for-4.4 branch so that they can land together with percpu_rwsem updates during the next merge window. Thanks. kernel/cgroup.c | 45 +++++++++++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2cf0f79..115091e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2460,13 +2460,14 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, if (!cgrp) return -ENODEV; - percpu_down_write(&cgroup_threadgroup_rwsem); +retry_find_task: rcu_read_lock(); if (pid) { tsk = find_task_by_vpid(pid); if (!tsk) { + rcu_read_unlock(); ret = -ESRCH; - goto out_unlock_rcu; + goto out_unlock_cgroup; } } else { tsk = current; @@ -2482,23 +2483,37 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, */ if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) { ret = -EINVAL; - goto out_unlock_rcu; + rcu_read_unlock(); + goto out_unlock_cgroup; } get_task_struct(tsk); rcu_read_unlock(); + percpu_down_write(&cgroup_threadgroup_rwsem); + if (threadgroup) { + if (!thread_group_leader(tsk)) { + /* + * a race with de_thread from another thread's exec() + * may strip us of our leadership, if this happens, + * there is no choice but to throw this task away and + * try again; this is + * "double-double-toil-and-trouble-check locking". + */ + percpu_up_write(&cgroup_threadgroup_rwsem); + put_task_struct(tsk); + goto retry_find_task; + } + } + ret = cgroup_procs_write_permission(tsk, cgrp, of); if (!ret) ret = cgroup_attach_task(cgrp, tsk, threadgroup); - put_task_struct(tsk); - goto out_unlock_threadgroup; - -out_unlock_rcu: - rcu_read_unlock(); -out_unlock_threadgroup: percpu_up_write(&cgroup_threadgroup_rwsem); + + put_task_struct(tsk); +out_unlock_cgroup: cgroup_kn_unlock(of->kn); return ret ?: nbytes; } @@ -2643,8 +2658,6 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) lockdep_assert_held(&cgroup_mutex); - percpu_down_write(&cgroup_threadgroup_rwsem); - /* look up all csses currently attached to @cgrp's subtree */ down_read(&css_set_rwsem); css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { @@ -2700,8 +2713,17 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) goto out_finish; last_task = task; + percpu_down_write(&cgroup_threadgroup_rwsem); + /* raced against de_thread() from another thread? */ + if (!thread_group_leader(task)) { + percpu_up_write(&cgroup_threadgroup_rwsem); + put_task_struct(task); + continue; + } + ret = cgroup_migrate(src_cset->dfl_cgrp, task, true); + percpu_up_write(&cgroup_threadgroup_rwsem); put_task_struct(task); if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret)) @@ -2711,7 +2733,6 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) out_finish: cgroup_migrate_finish(&preloaded_csets); - percpu_up_write(&cgroup_threadgroup_rwsem); return ret; } -- 2.4.3 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html