With exit_mmap holding mmap_write_lock during free_pgtables call, process_mrelease does not need to elevate mm->mm_users in order to prevent exit_mmap from destrying pagetables while __oom_reap_task_mm is walking the VMA tree. The change prevents process_mrelease from calling the last mmput, which can lead to waiting for IO completion in exit_aio. Signed-off-by: Suren Baghdasaryan <surenb@xxxxxxxxxx> Acked-by: Michal Hocko <mhocko@xxxxxxxx> --- changes in v5 - Removed Fixes: tag, per Michal Hocko - Added Acked-by's mm/oom_kill.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1ddabefcfb5a..67780386f478 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1169,15 +1169,15 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) goto put_task; } - if (mmget_not_zero(p->mm)) { - mm = p->mm; - if (task_will_free_mem(p)) - reap = true; - else { - /* Error only if the work has not been done already */ - if (!test_bit(MMF_OOM_SKIP, &mm->flags)) - ret = -EINVAL; - } + mm = p->mm; + mmgrab(mm); + + if (task_will_free_mem(p)) + reap = true; + else { + /* Error only if the work has not been done already */ + if (!test_bit(MMF_OOM_SKIP, &mm->flags)) + ret = -EINVAL; } task_unlock(p); @@ -1188,13 +1188,16 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) ret = -EINTR; goto drop_mm; } - if (!__oom_reap_task_mm(mm)) + /* + * Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure + * possible change in exit_mmap is seen + */ + if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm)) ret = -EAGAIN; mmap_read_unlock(mm); drop_mm: - if (mm) - mmput(mm); + mmdrop(mm); put_task: put_task_struct(task); return ret; -- 2.34.1.173.g76aa8bc2d0-goog