Re: Bug report: Unable to handle kernel paging request at virtual address 00000000c0000010

Xiu Jianfeng <xiujianfeng@xxxxxxxxxxxxxxx> · Thu, 25 May 2023 20:21:11 +0800

How about this solution? we stop task iteration when moving css_set to a
new list in rebind_subsystems(), patch is as bellow:

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 567c547cf371..3f1557cb5758 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -47,6 +47,7 @@ struct kernel_clone_args;

 /* internal flags */
 #define CSS_TASK_ITER_SKIPPED		(1U << 16)
+#define CSS_TASK_ITER_STOPPED		(1U << 17)

 /* a css_task_iter should be treated as an opaque object */
 struct css_task_iter {
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index f10cef511ffa..091c9a38d0c7 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -240,6 +240,8 @@ static int cgroup_apply_control(struct cgroup *cgrp);
 static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
 static void css_task_iter_skip(struct css_task_iter *it,
 			       struct task_struct *task);
+static void css_task_iter_stop(struct css_task_iter *it,
+			       struct css_set *cset);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
 static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
 					      struct cgroup_subsys *ss);
@@ -889,6 +891,19 @@ static void css_set_skip_task_iters(struct css_set
*cset,
 		css_task_iter_skip(it, task);
 }

+/*
+ * @cset is moving to other list, it's not safe to continue the iterator,
+ * because the cset_head and css_pos of css_task_iter does not make sense
+ * for the new list.
+ */
+static void css_set_stop_iters(struct css_set *cset)
+{
+	struct css_task_iter *it, *pos;
+
+	list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node)
+		css_task_iter_stop(it, cset);
+}
+
 /**
  * css_set_move_task - move a task from one css_set to another
  * @task: task being moved
@@ -1861,9 +1876,11 @@ int rebind_subsystems(struct cgroup_root
*dst_root, u16 ss_mask)
 		css->cgroup = dcgrp;

 		spin_lock_irq(&css_set_lock);
-		hash_for_each(css_set_table, i, cset, hlist)
+		hash_for_each(css_set_table, i, cset, hlist) {
+			css_set_stop_iters(cset);
 			list_move_tail(&cset->e_cset_node[ss->id],
 				       &dcgrp->e_csets[ss->id]);
+		}
 		spin_unlock_irq(&css_set_lock);

 		if (ss->css_rstat_flush) {
@@ -4866,6 +4883,15 @@ static void css_task_iter_skip(struct
css_task_iter *it,
 	}
 }

+static void css_task_iter_stop(struct css_task_iter *it,
+			       struct css_set *cset)
+{
+	lockdep_assert_held(&css_set_lock);
+
+	WARN_ONCE(it->cur_cset != cset, "invalid cur_set\n");
+	it->flags |= CSS_TASK_ITER_STOPPED;
+}
+
 static void css_task_iter_advance(struct css_task_iter *it)
 {
 	struct task_struct *task;
@@ -4969,6 +4995,9 @@ struct task_struct *css_task_iter_next(struct
css_task_iter *it)

 	spin_lock_irq(&css_set_lock);

+	if (it->flags & CSS_TASK_ITER_STOPPED)
+		goto unlock;
+
 	/* @it may be half-advanced by skips, finish advancing */
 	if (it->flags & CSS_TASK_ITER_SKIPPED)
 		css_task_iter_advance(it);
@@ -4980,6 +5009,7 @@ struct task_struct *css_task_iter_next(struct
css_task_iter *it)
 		css_task_iter_advance(it);
 	}

+unlock:
 	spin_unlock_irq(&css_set_lock);

 	return it->cur_task;



On 2023/5/24 20:13, cuigaosheng wrote:
> 
> Hi,everybody
> 
> There is a bug in the mainline
> code(https://kernel.googlesource.com/pub/scm/linux/kernel/git/stable/linux.git -b master).
> 
> The bug's call trace as follows:
>> refcount_t: addition on 0; use-after-free.
>>   WARNING: CPU: 1 PID: 342 at lib/refcount.c:25
>> refcount_warn_saturate+0xa0/0x148
>>   Hardware name: Huawei TaiShan 2280 /BC11SPCD, BIOS 1.79 12/28/2022
>>   Workqueue: events cpuset_hotplug_workfn
>>   Call trace:
>>    refcount_warn_saturate+0xa0/0x148
>>    __refcount_add.constprop.0+0x5c/0x80
>>    css_task_iter_advance_css_set+0xd8/0x210
>>    css_task_iter_advance+0xa8/0x120
>>    css_task_iter_next+0x94/0x158
>>    update_tasks_root_domain+0x58/0x98
>>    rebuild_root_domains+0xa0/0x1b0
>>    rebuild_sched_domains_locked+0x144/0x188
>>    cpuset_hotplug_workfn+0x138/0x5a0
>>    process_one_work+0x1e8/0x448
>>    worker_thread+0x228/0x3e0
>>    kthread+0xe0/0xf0
>>    ret_from_fork+0x10/0x20
>>   ---[ end trace 0000000000000000 ]---
>>   ------------[ cut here ]------------
>>   refcount_t: underflow; use-after-free.
>>   WARNING: CPU: 1 PID: 342 at lib/refcount.c:28
>> refcount_warn_saturate+0xf4/0x148
>>   Hardware name: Huawei TaiShan 2280 /BC11SPCD, BIOS 1.79 12/28/2022
>>   Workqueue: events cpuset_hotplug_workfn
>>   Call trace:
>>    refcount_warn_saturate+0xf4/0x148
>>    put_css_set_locked+0x80/0x98
>>    css_task_iter_end+0x70/0x160
>>    update_tasks_root_domain+0x68/0x98
>>    rebuild_root_domains+0xa0/0x1b0
>>    rebuild_sched_domains_locked+0x144/0x188
>>    cpuset_hotplug_workfn+0x138/0x5a0
>>    process_one_work+0x1e8/0x448
>>    worker_thread+0x228/0x3e0
>>    kthread+0xe0/0xf0
>>    ret_from_fork+0x10/0x20
>>   ---[ end trace 0000000000000000 ]---
>>   process 10324 (cpuhotplug_do_s) no longer affine to cpu1
>>   psci: CPU1 killed (polled 0 ms)
>>   Unable to handle kernel paging request at virtual address
>> 00000000c0000010
>>   Internal error: Oops: 0000000096000004 [#1] SMP
>>   Hardware name: Huawei TaiShan 2280 /BC11SPCD, BIOS 1.79 12/28/2022
>>   Workqueue: cgroup_destroy css_free_rwork_fn
>>   Call trace:
>>    cgroup_apply_control_disable+0xb0/0x1f8
>>    rebind_subsystems+0x20c/0x548
>>    cgroup_destroy_root+0x64/0x240
>>    css_free_rwork_fn+0x18c/0x1a8
>>    process_one_work+0x1e8/0x448
>>    worker_thread+0x178/0x3e0
>>    kthread+0xe0/0xf0
>>    ret_from_fork+0x10/0x20
>>   Code: 91012842 8b020f62 f9400453 b4000293 (f9400a60)
>>   SMP: stopping secondary CPUs
>>   Starting crashdump kernel...
> This bug occurs in concurrency scenarios, In the hotplug,
> update_tasks_root_domain will
> iterate over all tasks on the cpuset/root domain, the code as follows:
>> static void update_tasks_root_domain(struct cpuset *cs)
>> {
>>          struct css_task_iter it;
>>          struct task_struct *task;
>>
>>          css_task_iter_start(&cs->css, 0, &it); // hold css_set_lock
>> in css_task_iter_start
>>                  ... //nolock time1: don't hold css_set_lock
>>          while ((task = css_task_iter_next(&it))) // hold css_set_lock
>> in css_task_iter_next
>>                  dl_add_task_root_domain(task); //nolock time2: don't
>> hold css_set_lock
>>
>>          css_task_iter_end(&it);
>> }
> The cgroup.e_csets will be traversed through css_task_iter, and
> it->cset_head will record
> the head of the e_cset list that is currently traversed, we will hold
> css_set_lock in
> css_task_iter_start or in css_task_iter_next, but we don't always hold
> the css_set_lock,
> such as "nolock time1" and "nolock time2" in the code comments above.
> 
> During the time without css_set_lock in update_tasks_root_domain, if
> it->cur_cset(current css_set)
> is migrated to another list, such as:
>> int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
>> {
>>      ...
>>       spin_lock_irq(&css_set_lock);
>>       hash_for_each(css_set_table, i, cset, hlist)
>>          list_move_tail(&cset->e_cset_node[ss->id],
>> &dcgrp->e_csets[ss->id]);
>>      spin_unlock_irq(&css_set_lock);
>>      ...
>> }
> The bug will be triggered. As follows:
> 
> #1> in css_task_iter_start(), it->cset_head =
> &css->cgroup->e_csets[css->ss->id]; list A
> #2> in css_task_iter_next(&it), it->cur_cset=nodeA，return task
> #3> move nodeA to listB, for example:
> rebind_subsystems(),list_move_tail(nodeA, listB),then nodeA->next = headB
> #4> next css_task_iter_next, new = nodeA->next == headB
> #5> headB is not a valid css_set, but now new != it->cset_head(nodeA),
> so headB will be referred to as a valid css_set
> #6> get_css_set(headB), refcount warning
> 
> The following changes will increase the probability of this bug being
> triggered:
>> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
>> index e4ca2dd2b764..120e0c23517f 100644
>> --- a/kernel/cgroup/cpuset.c
>> +++ b/kernel/cgroup/cpuset.c
>> @@ -66,6 +66,7 @@
>>   #include <linux/mutex.h>
>>   #include <linux/cgroup.h>
>>   #include <linux/wait.h>
>> +#include <linux/delay.h>
>>
>>   DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
>>   DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
>> @@ -1073,8 +1074,10 @@ static void update_tasks_root_domain(struct
>> cpuset *cs)
>>
>>          css_task_iter_start(&cs->css, 0, &it);
>>
>> -       while ((task = css_task_iter_next(&it)))
>> +       while ((task = css_task_iter_next(&it))) {
>> +               udelay(1000 * 10);
>>                  dl_add_task_root_domain(task);
>> +       }
>>
>>          css_task_iter_end(&it);
>>   }
> 
> We can trigger this bug with ltp test
> cases(https://github.com/linux-test-project/ltp/blob/master/runtest/controllers):
> 
> step 1: create a process to execute the following usecases:
> cpuhotplug02 cpuhotplug02.sh -c 1 -l 1
> cpuhotplug03 cpuhotplug03.sh -c 1 -l 1
> cpuhotplug04 cpuhotplug04.sh -l 1
> cpuhotplug05 cpuhotplug05.sh -c 1 -l 1 -d /tmp
> cpuhotplug06 cpuhotplug06.sh -c 1 -l 1
> cpuhotplug07 cpuhotplug07.sh -c 1 -l 1 -d /usr/src/linux
> 
> step 2: create another process to execute the following usecases:
> cpuset_base_ops cpuset_base_ops_testset.sh
> cpuset_inherit cpuset_inherit_testset.sh
> cpuset_exclusive cpuset_exclusive_test.sh
> cpuset_hierarchy cpuset_hierarchy_test.sh
> cpuset_syscall cpuset_syscall_testset.sh
> cpuset_sched_domains cpuset_sched_domains_test.sh
> cpuset_load_balance cpuset_load_balance_test.sh
> cpuset_hotplug cpuset_hotplug_test.sh
> cpuset_memory cpuset_memory_testset.sh
> 
> Looking forward to your reply.
> 
> Thanks.
> 
>