On Thu, Jun 29, 2023 at 5:56 PM Suren Baghdasaryan <surenb@xxxxxxxxxx> wrote: > > Destroying psi trigger in cgroup_file_release causes UAF issues when > a cgroup is removed from under a polling process. This is happening > because cgroup removal causes a call to cgroup_file_release while the > actual file is still alive. Destroying the trigger at this point would > also destroy its waitqueue head and if there is still a polling process > on that file accessing the waitqueue, it will step on the freed pointer: > > do_select > vfs_poll > do_rmdir > cgroup_rmdir > kernfs_drain_open_files > cgroup_file_release > cgroup_pressure_release > psi_trigger_destroy > wake_up_pollfree(&t->event_wait) > // vfs_poll is unblocked > synchronize_rcu > kfree(t) > poll_freewait -> UAF access to the trigger's waitqueue head > > Patch [1] fixed this issue for epoll() case using wake_up_pollfree(), > however the same issue exists for synchronous poll() case. > The root cause of this issue is that the lifecycles of the psi trigger's > waitqueue and of the file associated with the trigger are different. Fix > this by using kernfs_generic_poll function when polling on cgroup-specific > psi triggers. It internally uses kernfs_open_node->poll waitqueue head > with its lifecycle tied to the file's lifecycle. This also renders the > fix in [1] obsolete, so revert it. > > [1] commit c2dbe32d5db5 ("sched/psi: Fix use-after-free in ep_remove_wait_queue()") > > Fixes: 0e94682b73bf ("psi: introduce psi monitor") > Reported-by: Lu Jialin <lujialin4@xxxxxxxxxx> > Closes: https://lore.kernel.org/all/20230613062306.101831-1-lujialin4@xxxxxxxxxx/ > Signed-off-by: Suren Baghdasaryan <surenb@xxxxxxxxxx> This patch is a replacement for the patchset posted at https://lore.kernel.org/all/20230626201713.1204982-1-surenb@xxxxxxxxxx/ The original patchset was more appropriate for Tejun's tree but this one is mostly touching PSI-related code, so I changed the recipient to PeterZ. That said, I would still greatly appreciate inputs from Tejun and Greg, and anyone else of course. Thanks, Suren. > --- > include/linux/psi.h | 5 +++-- > include/linux/psi_types.h | 3 +++ > kernel/cgroup/cgroup.c | 2 +- > kernel/sched/psi.c | 29 +++++++++++++++++++++-------- > 4 files changed, 28 insertions(+), 11 deletions(-) > > diff --git a/include/linux/psi.h b/include/linux/psi.h > index ab26200c2803..e0745873e3f2 100644 > --- a/include/linux/psi.h > +++ b/include/linux/psi.h > @@ -23,8 +23,9 @@ void psi_memstall_enter(unsigned long *flags); > void psi_memstall_leave(unsigned long *flags); > > int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res); > -struct psi_trigger *psi_trigger_create(struct psi_group *group, > - char *buf, enum psi_res res, struct file *file); > +struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, > + enum psi_res res, struct file *file, > + struct kernfs_open_file *of); > void psi_trigger_destroy(struct psi_trigger *t); > > __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, > diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h > index 040c089581c6..f1fd3a8044e0 100644 > --- a/include/linux/psi_types.h > +++ b/include/linux/psi_types.h > @@ -137,6 +137,9 @@ struct psi_trigger { > /* Wait queue for polling */ > wait_queue_head_t event_wait; > > + /* Kernfs file for cgroup triggers */ > + struct kernfs_open_file *of; > + > /* Pending event flag */ > int event; > > diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c > index bfe3cd8ccf36..f55a40db065f 100644 > --- a/kernel/cgroup/cgroup.c > +++ b/kernel/cgroup/cgroup.c > @@ -3730,7 +3730,7 @@ static ssize_t pressure_write(struct kernfs_open_file *of, char *buf, > } > > psi = cgroup_psi(cgrp); > - new = psi_trigger_create(psi, buf, res, of->file); > + new = psi_trigger_create(psi, buf, res, of->file, of); > if (IS_ERR(new)) { > cgroup_put(cgrp); > return PTR_ERR(new); > diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c > index 81fca77397f6..9bb3f2b3ccfc 100644 > --- a/kernel/sched/psi.c > +++ b/kernel/sched/psi.c > @@ -493,8 +493,12 @@ static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total, > continue; > > /* Generate an event */ > - if (cmpxchg(&t->event, 0, 1) == 0) > - wake_up_interruptible(&t->event_wait); > + if (cmpxchg(&t->event, 0, 1) == 0) { > + if (t->of) > + kernfs_notify(t->of->kn); > + else > + wake_up_interruptible(&t->event_wait); > + } > t->last_event_time = now; > /* Reset threshold breach flag once event got generated */ > t->pending_event = false; > @@ -1271,8 +1275,9 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) > return 0; > } > > -struct psi_trigger *psi_trigger_create(struct psi_group *group, > - char *buf, enum psi_res res, struct file *file) > +struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, > + enum psi_res res, struct file *file, > + struct kernfs_open_file *of) > { > struct psi_trigger *t; > enum psi_states state; > @@ -1331,7 +1336,9 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, > > t->event = 0; > t->last_event_time = 0; > - init_waitqueue_head(&t->event_wait); > + t->of = of; > + if (!of) > + init_waitqueue_head(&t->event_wait); > t->pending_event = false; > t->aggregator = privileged ? PSI_POLL : PSI_AVGS; > > @@ -1388,7 +1395,10 @@ void psi_trigger_destroy(struct psi_trigger *t) > * being accessed later. Can happen if cgroup is deleted from under a > * polling process. > */ > - wake_up_pollfree(&t->event_wait); > + if (t->of) > + kernfs_notify(t->of->kn); > + else > + wake_up_interruptible(&t->event_wait); > > if (t->aggregator == PSI_AVGS) { > mutex_lock(&group->avgs_lock); > @@ -1465,7 +1475,10 @@ __poll_t psi_trigger_poll(void **trigger_ptr, > if (!t) > return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; > > - poll_wait(file, &t->event_wait, wait); > + if (t->of) > + kernfs_generic_poll(t->of, wait); > + else > + poll_wait(file, &t->event_wait, wait); > > if (cmpxchg(&t->event, 1, 0) == 1) > ret |= EPOLLPRI; > @@ -1535,7 +1548,7 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf, > return -EBUSY; > } > > - new = psi_trigger_create(&psi_system, buf, res, file); > + new = psi_trigger_create(&psi_system, buf, res, file, NULL); > if (IS_ERR(new)) { > mutex_unlock(&seq->lock); > return PTR_ERR(new); > -- > 2.41.0.255.g8b1d071c50-goog >