I found that it is not difficult to hit "oom_reaper: unable to reap pid:" messages if the victim thread is doing copy_process(). If we check where the OOM victims are stuck, we can find that they are waiting at i_mmap_lock_write() in dup_mmap(). ---------------------------------------- [ 239.804758] Out of memory: Kill process 28909 (a.out) score 0 or sacrifice child [ 239.808597] Killed process 31088 (a.out) total-vm:4176kB, anon-rss:88kB, file-rss:0kB, shmem-rss:0kB [ 239.929470] oom_reaper: reaped process 31088 (a.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB (...snipped...) [ 240.520586] Out of memory: Kill process 28909 (a.out) score 0 or sacrifice child [ 240.524264] Killed process 29307 (a.out) total-vm:4176kB, anon-rss:88kB, file-rss:0kB, shmem-rss:0kB [ 241.568323] oom_reaper: reaped process 29307 (a.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB (...snipped...) [ 242.228607] Out of memory: Kill process 28909 (a.out) score 0 or sacrifice child [ 242.232281] Killed process 29323 (a.out) total-vm:4176kB, anon-rss:88kB, file-rss:0kB, shmem-rss:0kB (...snipped...) [ 242.902598] Out of memory: Kill process 28909 (a.out) score 0 or sacrifice child [ 242.906366] Killed process 31097 (a.out) total-vm:4176kB, anon-rss:88kB, file-rss:0kB, shmem-rss:0kB [ 243.240908] oom_reaper: reaped process 31097 (a.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB (...snipped...) [ 243.854813] Out of memory: Kill process 28909 (a.out) score 0 or sacrifice child [ 243.858490] Killed process 31100 (a.out) total-vm:4176kB, anon-rss:88kB, file-rss:0kB, shmem-rss:0kB [ 244.120162] oom_reaper: reaped process 31100 (a.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB (...snipped...) [ 244.750778] Out of memory: Kill process 28909 (a.out) score 0 or sacrifice child [ 244.754505] Killed process 31106 (a.out) total-vm:4176kB, anon-rss:88kB, file-rss:0kB, shmem-rss:0kB [ 245.815781] oom_reaper: unable to reap pid:31106 (a.out) [ 245.818786] [ 245.818786] Showing all locks held in the system: (...snipped...) [ 245.869500] 1 lock held by kswapd0/79: [ 245.872655] #0: 00000000b41a282e (&mapping->i_mmap_rwsem){++++}, at: rmap_walk_file+0x205/0x310 (...snipped...) [ 246.565465] 1 lock held by a.out/29307: /* Already reaped OOM victim */ [ 246.568926] #0: 00000000b41a282e (&mapping->i_mmap_rwsem){++++}, at: unlink_file_vma+0x28/0x50 (...snipped...) [ 250.812088] 3 locks held by a.out/30940: [ 250.815543] #0: 00000000cd61a8e0 (&mm->mmap_sem){++++}, at: copy_process.part.41+0x10d5/0x1fe0 [ 250.820980] #1: 00000000cf6d4f24 (&mm->mmap_sem/1){+.+.}, at: copy_process.part.41+0x10fe/0x1fe0 [ 250.826401] #2: 00000000b41a282e (&mapping->i_mmap_rwsem){++++}, at: rmap_walk_file+0x205/0x310 (...snipped...) [ 251.233604] 1 lock held by a.out/31088: /* Already reaped OOM victim */ [ 251.236953] #0: 00000000b41a282e (&mapping->i_mmap_rwsem){++++}, at: unlink_file_vma+0x28/0x50 (...snipped...) [ 251.258144] 1 lock held by a.out/31097: /* Already reaped OOM victim */ [ 251.261531] #0: 00000000b41a282e (&mapping->i_mmap_rwsem){++++}, at: unlink_file_vma+0x28/0x50 [ 251.266789] 1 lock held by a.out/31100: /* Already reaped OOM victim */ [ 251.270208] #0: 00000000b41a282e (&mapping->i_mmap_rwsem){++++}, at: unlink_file_vma+0x28/0x50 (...snipped...) [ 251.305475] 3 locks held by a.out/31106: /* Unable to reap OOM victim */ [ 251.308949] #0: 00000000b0f753ba (&mm->mmap_sem){++++}, at: copy_process.part.41+0x10d5/0x1fe0 [ 251.314283] #1: 00000000ef64d539 (&mm->mmap_sem/1){+.+.}, at: copy_process.part.41+0x10fe/0x1fe0 [ 251.319618] #2: 00000000b41a282e (&mapping->i_mmap_rwsem){++++}, at: copy_process.part.41+0x12f2/0x1fe0 (...snipped...) [ 259.196415] 1 lock held by a.out/33338: [ 259.199837] #0: 00000000b41a282e (&mapping->i_mmap_rwsem){++++}, at: unlink_file_vma+0x28/0x50 (...snipped...) [ 264.040902] 3 locks held by a.out/34558: [ 264.044475] #0: 00000000348405b9 (&mm->mmap_sem){++++}, at: __do_page_fault+0x16f/0x4d0 [ 264.049516] #1: 00000000962671a1 (&(&ip->i_mmaplock)->mr_lock){++++}, at: xfs_ilock+0x10a/0x190 [xfs] [ 264.055108] #2: 00000000b41a282e (&mapping->i_mmap_rwsem){++++}, at: rmap_walk_file+0x205/0x310 (...snipped...) [ 267.747036] 3 locks held by a.out/35518: [ 267.750545] #0: 0000000098a5825d (&mm->mmap_sem){++++}, at: copy_process.part.41+0x10d5/0x1fe0 [ 267.755955] #1: 000000002b63c006 (&mm->mmap_sem/1){+.+.}, at: copy_process.part.41+0x10fe/0x1fe0 [ 267.761466] #2: 00000000b41a282e (&mapping->i_mmap_rwsem){++++}, at: rmap_walk_file+0x205/0x310 (...snipped...) [ 295.198803] 1 lock held by a.out/42524: [ 295.202483] #0: 00000000b41a282e (&mapping->i_mmap_rwsem){++++}, at: rmap_walk_file+0x205/0x310 (...snipped...) [ 295.599000] 2 locks held by a.out/42684: [ 295.602901] #0: 000000003cd42787 (&mm->mmap_sem){++++}, at: __do_page_fault+0x457/0x4d0 [ 295.608495] #1: 00000000b41a282e (&mapping->i_mmap_rwsem){++++}, at: rmap_walk_file+0x205/0x310 (...snipped...) [ 296.202611] 2 locks held by a.out/42885: [ 296.206546] #0: 0000000065124e3d (&mm->mmap_sem){++++}, at: __do_page_fault+0x16f/0x4d0 [ 296.212185] #1: 00000000b41a282e (&mapping->i_mmap_rwsem){++++}, at: rmap_walk_file+0x205/0x310 (...snipped...) [ 300.594196] 2 locks held by a.out/44035: [ 300.599942] #0: 00000000a4e2de40 (&mm->mmap_sem){++++}, at: __do_page_fault+0x457/0x4d0 [ 300.605533] #1: 00000000b41a282e (&mapping->i_mmap_rwsem){++++}, at: rmap_walk_file+0x205/0x310 (...snipped...) [ 302.278287] 3 locks held by a.out/44420: [ 302.282104] #0: 00000000f043328f (&mm->mmap_sem){++++}, at: copy_process.part.41+0x10d5/0x1fe0 [ 302.287959] #1: 000000007f312097 (&mm->mmap_sem/1){+.+.}, at: copy_process.part.41+0x10fe/0x1fe0 [ 302.293872] #2: 00000000b41a282e (&mapping->i_mmap_rwsem){++++}, at: copy_process.part.41+0x12f2/0x1fe0 ---------------------------------------- In dup_mmap(), we are using down_write_killable() for mm->mmap_sem but we are not using down_write_killable() for mapping->i_mmap_rwsem. And what is unfortunate is that processes accessing mapping->i_mmap_rwsem is more wider than processes accessing the OOM victim's mm->mmap_sem. If the OOM victim is holding mm->mmap_sem held for write, and if the OOM victim can interrupt operations which need mm->mmap_sem held for write, we can downgrade mm->mmap_sem upon SIGKILL and the OOM reaper will be able to reap the OOM victim's memory. Therefore, this patch introduces i_mmap_lock_write_killable() and downgrade upon SIGKILL. Since mm->mmap_sem is still held for read, nobody can acquire mm->mmap_sem for write. Thus, the only thing we need to be careful is that whether we can safely interrupt. (But I'm not familiar with mmap. Thus, please review carefully...) Signed-off-by: Tetsuo Handa <penguin-kernel@xxxxxxxxxxxxxxxxxxx> Cc: Alexander Viro <viro@xxxxxxxxxxxxxxxxxx> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> Cc: Rik van Riel <riel@xxxxxxxxxx> Cc: Michal Hocko <mhocko@xxxxxxxx> Cc: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx> --- include/linux/fs.h | 5 +++++ kernel/fork.c | 16 +++++++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index bb45c48..2f11c55 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -468,6 +468,11 @@ static inline void i_mmap_lock_write(struct address_space *mapping) down_write(&mapping->i_mmap_rwsem); } +static inline int i_mmap_lock_write_killable(struct address_space *mapping) +{ + return down_write_killable(&mapping->i_mmap_rwsem); +} + static inline void i_mmap_unlock_write(struct address_space *mapping) { up_write(&mapping->i_mmap_rwsem); diff --git a/kernel/fork.c b/kernel/fork.c index 1e8c9a7..b4384e2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -400,6 +400,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, int retval; unsigned long charge; LIST_HEAD(uf); + bool downgraded = false; uprobe_start_dup_mmap(); if (down_write_killable(&oldmm->mmap_sem)) { @@ -476,7 +477,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, get_file(file); if (tmp->vm_flags & VM_DENYWRITE) atomic_dec(&inode->i_writecount); - i_mmap_lock_write(mapping); + if (i_mmap_lock_write_killable(mapping)) { + downgrade_write(&oldmm->mmap_sem); + downgraded = true; + i_mmap_lock_write(mapping); + } if (tmp->vm_flags & VM_SHARED) atomic_inc(&mapping->i_mmap_writable); flush_dcache_mmap_lock(mapping); @@ -508,7 +513,9 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, rb_parent = &tmp->vm_rb; mm->map_count++; - if (!(tmp->vm_flags & VM_WIPEONFORK)) + if (downgraded) + retval = -EINTR; + else if (!(tmp->vm_flags & VM_WIPEONFORK)) retval = copy_page_range(mm, oldmm, mpnt); if (tmp->vm_ops && tmp->vm_ops->open) @@ -523,7 +530,10 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, out: up_write(&mm->mmap_sem); flush_tlb_mm(oldmm); - up_write(&oldmm->mmap_sem); + if (!downgraded) + up_write(&oldmm->mmap_sem); + else + up_read(&oldmm->mmap_sem); dup_userfaultfd_complete(&uf); fail_uprobe_end: uprobe_end_dup_mmap(); -- 1.8.3.1