The pthread struct is allocated on PRIVATE|ANONYMOUS memory [1] which can be targeted by the oom reaper. This mapping is used to store the futex robust list; the kernel does not keep a copy of the robust list and instead references a userspace address to maintain the robustness during a process death. A race can occur between exit_mm and the oom reaper that allows the oom reaper to free the memory of the futex robust list before the exit path has handled the futex death: CPU1 CPU2 ------------------------------------------------------------------------ page_fault out_of_memory do_exit "signal" wake_oom_reaper oom_reaper oom_reap_task_mm (invalidates mm) exit_mm exit_mm_release futex_exit_release futex_cleanup exit_robust_list get_user (EFAULT- can't access memory) While in the oom reaper thread, we must handle the futex cleanup without sleeping. To achieve this, add the boolean `try` to futex_cleanup_begin(). This will control whether or not we use a trylock. Also introduce try_futex_exit_release() which will utilize the trylock version of the futex_cleanup_begin(). Also call kthread_use_mm in this context to ensure the get_user call in futex_cleanup() does not return with EFAULT. If the robust list exists, and the mutex_trylock fails, we prevent the OOM reaper from concurrently reaping the mappings. If the dying task_struct does not contain a pointer in tsk->robust_list, we can assume there was either never one set up for this task struct, or the exit path's call to futex_cleanup() has properly handled the futex death, and we can safely reap this memory. 
Reproducer: https://gitlab.com/jsavitz/oom_futex_reproducer [1] https://elixir.bootlin.com/glibc/latest/source/nptl/allocatestack.c#L370 Fixes: 212925802454 ("mm: oom: let oom_reap_task and exit_mmap run concurrently") Cc: Rafael Aquini <aquini@xxxxxxxxxx> Cc: Waiman Long <longman@xxxxxxxxxx> Cc: Baoquan He <bhe@xxxxxxxxxx> Cc: Christoph von Recklinghausen <crecklin@xxxxxxxxxx> Cc: Don Dutile <ddutile@xxxxxxxxxx> Cc: Herton R. Krzesinski <herton@xxxxxxxxxx> Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx> Cc: Ingo Molnar <mingo@xxxxxxxxxx> Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx> Cc: Darren Hart <dvhart@xxxxxxxxxxxxx> Cc: Davidlohr Bueso <dave@xxxxxxxxxxxx> Cc: Andre Almeida <andrealmeid@xxxxxxxxxxxxx> Cc: David Rientjes <rientjes@xxxxxxxxxx> Cc: Michal Hocko <mhocko@xxxxxxxx> Cc: Andrea Arcangeli <aarcange@xxxxxxxxxx> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> Co-developed-by: Joel Savitz <jsavitz@xxxxxxxxxx> Signed-off-by: Joel Savitz <jsavitz@xxxxxxxxxx> Signed-off-by: Nico Pache <npache@xxxxxxxxxx> --- include/linux/futex.h | 14 ++++++++++++++ kernel/futex/core.c | 35 +++++++++++++++++++++++++++++++---- mm/oom_kill.c | 13 +++++++++++++ 3 files changed, 58 insertions(+), 4 deletions(-) diff --git a/include/linux/futex.h b/include/linux/futex.h index b70df27d7e85..64d6e89294ac 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -71,9 +71,21 @@ static inline void futex_init_task(struct task_struct *tsk) mutex_init(&tsk->futex_exit_mutex); } +static inline bool task_has_robust_list(struct task_struct *tsk) +{ + bool robust = false; + + robust = tsk->robust_list != NULL; +#ifdef CONFIG_COMPAT + robust |= tsk->compat_robust_list != NULL; +#endif + return robust; +} + void futex_exit_recursive(struct task_struct *tsk); void futex_exit_release(struct task_struct *tsk); void futex_exec_release(struct task_struct *tsk); +bool try_futex_exit_release(struct task_struct *tsk); long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, u32 __user *uaddr2, 
u32 val2, u32 val3); @@ -82,6 +94,8 @@ static inline void futex_init_task(struct task_struct *tsk) { } static inline void futex_exit_recursive(struct task_struct *tsk) { } static inline void futex_exit_release(struct task_struct *tsk) { } static inline void futex_exec_release(struct task_struct *tsk) { } +static inline bool task_has_robust_list(struct task_struct *tsk) { return false; } +static inline bool try_futex_exit_release(struct task_struct *tsk) { return true; } static inline long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, u32 __user *uaddr2, u32 val2, u32 val3) diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 51dd822a8060..81aa60ce1ed6 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -37,6 +37,7 @@ #include <linux/memblock.h> #include <linux/fault-inject.h> #include <linux/slab.h> +#include <linux/kthread.h> #include "futex.h" #include "../locking/rtmutex_common.h" @@ -1047,7 +1048,7 @@ void futex_exit_recursive(struct task_struct *tsk) tsk->futex_state = FUTEX_STATE_DEAD; } -static void futex_cleanup_begin(struct task_struct *tsk) +static bool futex_cleanup_begin(struct task_struct *tsk, bool try) { /* * Prevent various race issues against a concurrent incoming waiter @@ -1055,7 +1056,12 @@ static void futex_cleanup_begin(struct task_struct *tsk) * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in * attach_to_pi_owner(). */ - mutex_lock(&tsk->futex_exit_mutex); + if (try) { + if (!mutex_trylock(&tsk->futex_exit_mutex)) + return false; + } else { + mutex_lock(&tsk->futex_exit_mutex); + } /* * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock. 
@@ -1071,6 +1077,7 @@ static void futex_cleanup_begin(struct task_struct *tsk) raw_spin_lock_irq(&tsk->pi_lock); tsk->futex_state = FUTEX_STATE_EXITING; raw_spin_unlock_irq(&tsk->pi_lock); + return true; } static void futex_cleanup_end(struct task_struct *tsk, int state) @@ -1096,7 +1103,7 @@ void futex_exec_release(struct task_struct *tsk) * futex is held on exec(), this provides at least as much state * consistency protection which is possible. */ - futex_cleanup_begin(tsk); + futex_cleanup_begin(tsk, false); futex_cleanup(tsk); /* * Reset the state to FUTEX_STATE_OK. The task is alive and about @@ -1107,9 +1114,29 @@ void futex_exec_release(struct task_struct *tsk) void futex_exit_release(struct task_struct *tsk) { - futex_cleanup_begin(tsk); + futex_cleanup_begin(tsk, false); + futex_cleanup(tsk); + futex_cleanup_end(tsk, FUTEX_STATE_DEAD); +} + +/* Try to perform the futex_cleanup and return true if successful. + * Designed to be called from the context of the OOM Reaper. + */ +bool try_futex_exit_release(struct task_struct *tsk) +{ + if (!futex_cleanup_begin(tsk, true)) + return false; + + /* We are calling this from the context of a kthread. We need to + * instruct the kthread to use the address space of the given mm + * so the get_user won't return -EFAULT. 
+ */ + kthread_use_mm(tsk->mm); futex_cleanup(tsk); + kthread_unuse_mm(tsk->mm); + futex_cleanup_end(tsk, FUTEX_STATE_DEAD); + return true; } static int __init futex_init(void) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 832fb330376e..f7834c53d874 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -44,6 +44,7 @@ #include <linux/kthread.h> #include <linux/init.h> #include <linux/mmu_notifier.h> +#include <linux/futex.h> #include <asm/tlb.h> #include "internal.h" @@ -587,6 +588,18 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) goto out_unlock; } + /* We can't reap a process holding a robust_list; the pthread + * struct is allocated in userspace using PRIVATE | ANONYMOUS + * memory which when reaped before futex_cleanup() can leave + * the waiting process stuck. Try to perform the futex_cleanup, + * and if unsuccessful, skip the OOM reaping. + */ + if (task_has_robust_list(tsk) && !try_futex_exit_release(tsk)) { + trace_skip_task_reaping(tsk->pid); + pr_info("oom_reaper: skipping task as it contains a robust list"); + goto out_finish; + } + trace_start_task_reaping(tsk->pid); /* failed to reap part of the address space. Try again later */ -- 2.35.1