Re: [REPOST] [PATCH 2/2] mm,oom: Reverse the order of setting TIF_MEMDIE and sending SIGKILL.

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Michal Hocko wrote:
> > diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> > index 5249e7e..c0a5a69 100644
> > --- a/mm/oom_kill.c
> > +++ b/mm/oom_kill.c
> > @@ -555,12 +555,17 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
> >  	/* Get a reference to safely compare mm after task_unlock(victim) */
> >  	mm = victim->mm;
> >  	atomic_inc(&mm->mm_users);
> > -	mark_oom_victim(victim);
> >  	pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
> >  		task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
> >  		K(get_mm_counter(victim->mm, MM_ANONPAGES)),
> >  		K(get_mm_counter(victim->mm, MM_FILEPAGES)));
> >  	task_unlock(victim);
> > +	/* Send SIGKILL before setting TIF_MEMDIE. */
> > +	do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
> > +	task_lock(victim);
> > +	if (victim->mm)
> > +		mark_oom_victim(victim);
> > +	task_unlock(victim);
> 
> Why cannot you simply move do_send_sig_info without touching
> mark_oom_victim? Are you still able to trigger the issue if you just
> kill before crawling through all the tasks sharing the mm?

If you meant

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 1ecc0bc..ea578fb 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -560,6 +560,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
                K(get_mm_counter(victim->mm, MM_ANONPAGES)),
                K(get_mm_counter(victim->mm, MM_FILEPAGES)));
        task_unlock(victim);
+       do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);

        /*
         * Kill all user processes sharing victim->mm in other thread groups, if
@@ -585,7 +586,6 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
                }
        rcu_read_unlock();

-       do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
        put_task_struct(victim);
 }
 #undef K

then yes, I can still trigger the issue under very limited conditions (i.e.
when run as root in order to poll kernel messages with realtime priority,
after killing all processes using SysRq-i).

---------- oom-depleter2.c start ----------
#define _GNU_SOURCE
#include <fcntl.h>
#include <limits.h>
#include <sched.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/klog.h>
/* Descriptor for /dev/zero; opened by main() and read from by trigger(). */
static int zero_fd = -1;
/* Buffer grown with realloc() in main(); shared with the cloned child (CLONE_VM). */
static char *buf = NULL;
/* Byte count main() settles on for buf; used as the read length in trigger(). */
static unsigned long size = 0;

/*
 * Entry point of the child created by clone() in main().
 *
 * Switches itself to SCHED_IDLE and then issues one read() of `size` bytes
 * from /dev/zero into the shared buffer.  The argument is unused and the
 * function always returns 0.
 */
static int trigger(void *unused)
{
        struct sched_param idle_param = { 0 };

        /* Run this thread at the lowest scheduling class. */
        sched_setscheduler(0, SCHED_IDLE, &idle_param);
        /* Will cause OOM due to overcommit */
        read(zero_fd, buf, size);
        return 0;
}

/*
 * Reproducer driver.
 *
 * 1. Opens /dev/zero and grows `buf` by doubling (starting at 1 MiB) until
 *    realloc() fails or the 512 GiB cap is reached; `size` always holds the
 *    size of the last allocation that actually succeeded.
 * 2. Clones a child sharing this address space that runs trigger().
 * 3. Raises itself to SCHED_FIFO priority 99 and polls the kernel log until
 *    the "Killed process " message appears.
 * 4. Touches one byte every 4096 bytes of `buf`, then deliberately faults
 *    on a NULL dereference so that every thread is killed.
 *
 * Returns 1 if setup fails (open, allocation, clone, or kernel log read);
 * otherwise it does not return normally.
 */
int main(int argc, char *argv[])
{
        enum { CHILD_STACK_SIZE = 4096, TOUCH_STRIDE = 4096 };
        unsigned long i;
        char *child_stack;
        int log_len;

        (void) argc;
        (void) argv;

        zero_fd = open("/dev/zero", O_RDONLY);
        if (zero_fd < 0)
                return 1;
        /*
         * Record `size` only after a successful realloc() so it can never
         * exceed the real allocation, even when the loop runs to its cap.
         */
        for (i = 1048576; i < 512UL * (1 << 30); i <<= 1) {
                char *cp = realloc(buf, i);
                if (!cp)
                        break;
                buf = cp;
                size = i;
        }
        if (!buf)
                return 1;
        /* Let a child thread trigger the OOM killer. */
        child_stack = malloc(CHILD_STACK_SIZE);
        if (!child_stack)
                return 1;
        /* clone() takes the top of the child's stack, hence the offset. */
        if (clone(trigger, child_stack + CHILD_STACK_SIZE,
                  CLONE_SIGHAND | CLONE_VM, NULL) == -1)
                return 1;
        {
                struct sched_param sp = { 99 };
                sched_setscheduler(0, SCHED_FIFO, &sp);
        }
        /* klogctl() takes an int length; clamp so a huge buffer cannot wrap. */
        log_len = size - 1 > (unsigned long) INT_MAX ? INT_MAX : (int) (size - 1);
        /* Wait until the OOM killer messages appear. */
        while (1) {
                /* Keep the result signed so that -1 is seen as an error. */
                int n = klogctl(2, buf, log_len);
                if (n < 0)
                        return 1;
                if (n > 0) {
                        buf[n] = '\0';
                        if (strstr(buf, "Killed process "))
                                break;
                }
        }
        /* Deplete all memory reserve. */
        for (i = size; i; i -= TOUCH_STRIDE)
                buf[i - 1] = 1;
        /* volatile keeps the compiler from eliding the intentional fault. */
        return * (volatile char *) NULL; /* Kill all threads. */
}
---------- oom-depleter2.c end ----------

# taskset -c 0 ./oom-depleter2

(Intentionally running two threads with different priority on the same CPU
 in order to increase possibility of invoking preemption.)

---------- console log start ----------
[   47.069197] oom-depleter2 invoked oom-killer: gfp_mask=0x280da, order=0, oom_score_adj=0
[   47.070651] oom-depleter2 cpuset=/ mems_allowed=0
[   47.072982] CPU: 0 PID: 3851 Comm: oom-depleter2 Tainted: G        W       4.2.0-rc7-next-20150824+ #85
[   47.074683] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/31/2013
[   47.076583]  0000000000000000 00000000115c5c6c ffff88007ca2f8c8 ffffffff81313283
[   47.078014]  ffff88007890f2c0 ffff88007ca2f970 ffffffff8117ff7d 0000000000000000
[   47.079438]  0000000000000202 0000000000000018 0000000000000001 0000000000000202
[   47.080856] Call Trace:
[   47.081335]  [<ffffffff81313283>] dump_stack+0x4b/0x78
[   47.082233]  [<ffffffff8117ff7d>] dump_header+0x82/0x232
[   47.083234]  [<ffffffff81627645>] ? _raw_spin_unlock_irqrestore+0x25/0x30
[   47.084447]  [<ffffffff810fe041>] ? delayacct_end+0x51/0x60
[   47.085483]  [<ffffffff81114fd2>] oom_kill_process+0x372/0x3c0
[   47.086551]  [<ffffffff81071cd0>] ? has_ns_capability_noaudit+0x30/0x40
[   47.087715]  [<ffffffff81071cf2>] ? has_capability_noaudit+0x12/0x20
[   47.088874]  [<ffffffff8111528d>] out_of_memory+0x21d/0x4a0
[   47.089915]  [<ffffffff8111a774>] __alloc_pages_nodemask+0x904/0x930
[   47.091010]  [<ffffffff8115d080>] alloc_pages_vma+0xb0/0x1f0
[   47.092042]  [<ffffffff8113df77>] handle_mm_fault+0x13a7/0x1950
[   47.093076]  [<ffffffff816287cd>] ? retint_kernel+0x1b/0x1d
[   47.094108]  [<ffffffff81628837>] ? native_iret+0x7/0x7
[   47.095108]  [<ffffffff810565bb>] __do_page_fault+0x18b/0x440
[   47.096109]  [<ffffffff810568a0>] do_page_fault+0x30/0x80
[   47.097052]  [<ffffffff816297e8>] page_fault+0x28/0x30
[   47.098544]  [<ffffffff81320ae0>] ? __clear_user+0x20/0x50
[   47.099651]  [<ffffffff813254b8>] iov_iter_zero+0x68/0x250
[   47.100642]  [<ffffffff810920f6>] ? sched_clock_cpu+0x86/0xc0
[   47.101701]  [<ffffffff813f9018>] read_iter_zero+0x38/0xa0
[   47.102754]  [<ffffffff81183ec4>] __vfs_read+0xc4/0xf0
[   47.103684]  [<ffffffff81184639>] vfs_read+0x79/0x120
[   47.104630]  [<ffffffff81185350>] SyS_read+0x50/0xc0
[   47.105503]  [<ffffffff8108bd9c>] ? do_sched_setscheduler+0x7c/0xb0
[   47.106637]  [<ffffffff81627cae>] entry_SYSCALL_64_fastpath+0x12/0x71
[   47.109307] Mem-Info:
[   47.109801] active_anon:416244 inactive_anon:3737 isolated_anon:0
[   47.109801]  active_file:0 inactive_file:474 isolated_file:0
[   47.109801]  unevictable:0 dirty:0 writeback:0 unstable:0
[   47.109801]  slab_reclaimable:1114 slab_unreclaimable:3896
[   47.109801]  mapped:96 shmem:4188 pagetables:1014 bounce:0
[   47.109801]  free:12368 free_pcp:183 free_cma:0
[   47.118364] Node 0 DMA free:7316kB min:400kB low:500kB high:600kB active_anon:7056kB inactive_anon:232kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15988kB managed:15904kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:296kB slab_reclaimable:52kB slab_unreclaimable:216kB kernel_stack:16kB pagetables:308kB unstable:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:28 all_unreclaimable? yes
[   47.129538] lowmem_reserve[]: 0 1731 1731 1731
[   47.131230] Node 0 DMA32 free:44016kB min:44652kB low:55812kB high:66976kB active_anon:1657920kB inactive_anon:14716kB active_file:0kB inactive_file:32kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:2080640kB managed:1774256kB mlocked:0kB dirty:0kB writeback:0kB mapped:384kB shmem:16456kB slab_reclaimable:4404kB slab_unreclaimable:15368kB kernel_stack:3264kB pagetables:3748kB unstable:0kB bounce:0kB free_pcp:796kB local_pcp:56kB free_cma:0kB writeback_tmp:0kB pages_scanned:124 all_unreclaimable? no
[   47.143246] lowmem_reserve[]: 0 0 0 0
[   47.145175] Node 0 DMA: 17*4kB (UE) 9*8kB (UE) 9*16kB (UEM) 1*32kB (M) 1*64kB (M) 2*128kB (UE) 2*256kB (EM) 2*512kB (EM) 1*1024kB (E) 2*2048kB (EM) 0*4096kB = 7292kB
[   47.152896] Node 0 DMA32: 1009*4kB (UEM) 617*8kB (UEM) 268*16kB (UEM) 118*32kB (UEM) 43*64kB (UEM) 13*128kB (UEM) 11*256kB (UEM) 10*512kB (UM) 12*1024kB (UM) 1*2048kB (U) 0*4096kB = 43724kB
[   47.161214] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
[   47.163987] 4649 total pagecache pages
[   47.166121] 0 pages in swap cache
[   47.168500] Swap cache stats: add 0, delete 0, find 0/0
[   47.170238] Free swap  = 0kB
[   47.171764] Total swap = 0kB
[   47.173270] 524157 pages RAM
[   47.174520] 0 pages HighMem/MovableOnly
[   47.175930] 76617 pages reserved
[   47.178043] 0 pages hwpoisoned
[   47.179584] [ pid ]   uid  tgid total_vm      rss nr_ptes nr_pmds swapents oom_score_adj name
[   47.182065] [ 3820]     0  3820    10756      168      24       3        0             0 systemd-journal
[   47.184504] [ 3823]     0  3823    10262      101      23       3        0         -1000 systemd-udevd
[   47.186847] [ 3824]     0  3824    27503       33      12       3        0             0 agetty
[   47.189291] [ 3825]     0  3825     8673       84      23       3        0             0 systemd-logind
[   47.191691] [ 3826]     0  3826    21787      154      48       3        0             0 login
[   47.193959] [ 3828]    81  3828     6609       82      18       3        0          -900 dbus-daemon
[   47.196297] [ 3831]     0  3831    28878       93      15       3        0             0 bash
[   47.198573] [ 3850]     0  3850   541715   414661     820       6        0             0 oom-depleter2
[   47.200915] [ 3851]     0  3851   541715   414661     820       6        0             0 oom-depleter2
[   47.203410] Out of memory: Kill process 3850 (oom-depleter2) score 900 or sacrifice child
[   47.205695] Killed process 3850 (oom-depleter2) total-vm:2166860kB, anon-rss:1658644kB, file-rss:0kB
[   47.257871] oom-depleter2: page allocation failure: order:0, mode:0x280da
[   47.260006] CPU: 0 PID: 3850 Comm: oom-depleter2 Tainted: G        W       4.2.0-rc7-next-20150824+ #85
[   47.262473] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/31/2013
[   47.265184]  0000000000000000 000000000f39672f ffff880036febbe0 ffffffff81313283
[   47.267511]  00000000000280da ffff880036febc70 ffffffff81116e04 0000000000000000
[   47.269815]  ffffffff00000000 ffff88007fc19730 ffff880000000004 ffffffff810a30cf
[   47.272019] Call Trace:
[   47.273283]  [<ffffffff81313283>] dump_stack+0x4b/0x78
[   47.275081]  [<ffffffff81116e04>] warn_alloc_failed+0xf4/0x150
[   47.276962]  [<ffffffff810a30cf>] ? __wake_up+0x3f/0x50
[   47.278700]  [<ffffffff8111a0bc>] __alloc_pages_nodemask+0x24c/0x930
[   47.280664]  [<ffffffff8115d080>] alloc_pages_vma+0xb0/0x1f0
[   47.282422]  [<ffffffff8113df77>] handle_mm_fault+0x13a7/0x1950
[   47.284240]  [<ffffffff810565bb>] __do_page_fault+0x18b/0x440
[   47.286036]  [<ffffffff810568a0>] do_page_fault+0x30/0x80
[   47.287693]  [<ffffffff816297e8>] page_fault+0x28/0x30
[   47.289358] Mem-Info:
[   47.290494] active_anon:429031 inactive_anon:3737 isolated_anon:0
[   47.290494]  active_file:0 inactive_file:0 isolated_file:0
[   47.290494]  unevictable:0 dirty:0 writeback:0 unstable:0
[   47.290494]  slab_reclaimable:1114 slab_unreclaimable:3896
[   47.290494]  mapped:96 shmem:4188 pagetables:1014 bounce:0
[   47.290494]  free:0 free_pcp:180 free_cma:0
[   47.299662] Node 0 DMA free:8kB min:400kB low:500kB high:600kB active_anon:14308kB inactive_anon:232kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15988kB managed:15904kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:296kB slab_reclaimable:52kB slab_unreclaimable:216kB kernel_stack:16kB pagetables:308kB unstable:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:28 all_unreclaimable? yes
[   47.309430] lowmem_reserve[]: 0 1731 1731 1731
[   47.311000] Node 0 DMA32 free:0kB min:44652kB low:55812kB high:66976kB active_anon:1701816kB inactive_anon:14716kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:2080640kB managed:1774256kB mlocked:0kB dirty:0kB writeback:0kB mapped:384kB shmem:16456kB slab_reclaimable:4404kB slab_unreclaimable:15368kB kernel_stack:3264kB pagetables:3748kB unstable:0kB bounce:0kB free_pcp:720kB local_pcp:24kB free_cma:0kB writeback_tmp:0kB pages_scanned:5584 all_unreclaimable? yes
[   47.321601] lowmem_reserve[]: 0 0 0 0
[   47.323166] Node 0 DMA: 0*4kB 0*8kB 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 0kB
[   47.326070] Node 0 DMA32: 0*4kB 0*8kB 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 0kB
[   47.329018] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
[   47.331385] 4189 total pagecache pages
[   47.332896] 0 pages in swap cache
[   47.334262] Swap cache stats: add 0, delete 0, find 0/0
[   47.335990] Free swap  = 0kB
[   47.337390] Total swap = 0kB
[   47.338656] 524157 pages RAM
[   47.339964] 0 pages HighMem/MovableOnly
[   47.341464] 76617 pages reserved
[   47.342808] 0 pages hwpoisoned
(...snipped...)
[   93.082032] Node 0 DMA: 0*4kB 0*8kB 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 0kB
[   93.082034] Node 0 DMA32: 0*4kB 0*8kB 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 0kB
---------- console log end ----------
Complete log is at http://I-love.SAKURA.ne.jp/tmp/serial-20150825.txt.xz .

> 
> The code would be easier then and the race window much smaller. If we
> really needed to prevent from preemption then preempt_{enable,disable}
> around the whole task_lock region + do_send_sig_info would be still
> easier to follow than re-taking task_lock.

What's wrong with re-taking task_lock? It seems to me that re-taking
task_lock is more straightforward and easier to follow.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>



[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]