Re: continuous oom caused system deadlock

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Bisecting indicated that this is the first bad commit:

commit 696d3cd5fb318c070dc757fe109e04e398138172
Author: David Rientjes <rientjes@xxxxxxxxxx>
Date:   Fri Jun 11 22:45:17 2010 +0200

    __out_of_memory() only has a single caller, so fold it into
    out_of_memory() and add a comment about locking for its call to
    oom_kill_process().
    
    Signed-off-by: David Rientjes <rientjes@xxxxxxxxxx>
    Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
    Cc: KOSAKI Motohiro <kosaki.motohiro@xxxxxxxxxxxxxx>
    Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index cba18c0..26ae697 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -628,41 +628,6 @@ static void clear_system_oom(void)
 	spin_unlock(&zone_scan_lock);
 }
 
-
-/*
- * Must be called with tasklist_lock held for read.
- */
-static void __out_of_memory(gfp_t gfp_mask, int order, const nodemask_t *mask)
-{
-	struct task_struct *p;
-	unsigned long points;
-
-	if (sysctl_oom_kill_allocating_task)
-		if (!oom_kill_process(current, gfp_mask, order, 0, NULL,
-				"Out of memory (oom_kill_allocating_task)"))
-			return;
-retry:
-	/*
-	 * Rambo mode: Shoot down a process and hope it solves whatever
-	 * issues we may have.
-	 */
-	p = select_bad_process(&points, NULL, mask);
-
-	if (PTR_ERR(p) == -1UL)
-		return;
-
-	/* Found nothing?!?! Either we hang forever, or we panic. */
-	if (!p) {
-		dump_header(NULL, gfp_mask, order, NULL);
-		read_unlock(&tasklist_lock);
-		panic("Out of memory and no killable processes...\n");
-	}
-
-	if (oom_kill_process(p, gfp_mask, order, points, NULL,
-			     "Out of memory"))
-		goto retry;
-}
-
 /**
  * out_of_memory - kill the "best" process when we run out of memory
  * @zonelist: zonelist pointer
@@ -678,7 +643,9 @@ retry:
 void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 		int order, nodemask_t *nodemask)
 {
+	struct task_struct *p;
 	unsigned long freed = 0;
+	unsigned long points;
 	enum oom_constraint constraint = CONSTRAINT_NONE;
 
 	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
@@ -703,10 +670,36 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 	if (zonelist)
 		constraint = constrained_alloc(zonelist, gfp_mask, nodemask);
 	check_panic_on_oom(constraint, gfp_mask, order);
+
 	read_lock(&tasklist_lock);
-	__out_of_memory(gfp_mask, order,
+	if (sysctl_oom_kill_allocating_task) {
+		/*
+		 * oom_kill_process() needs tasklist_lock held.  If it returns
+		 * non-zero, current could not be killed so we must fallback to
+		 * the tasklist scan.
+		 */
+		if (!oom_kill_process(current, gfp_mask, order, 0, NULL,
+				"Out of memory (oom_kill_allocating_task)"))
+			return;
+	}
+
+retry:
+	p = select_bad_process(&points, NULL,
 			constraint == CONSTRAINT_MEMORY_POLICY ? nodemask :
 								 NULL);
+	if (PTR_ERR(p) == -1UL)
+		return;
+
+	/* Found nothing?!?! Either we hang forever, or we panic. */
+	if (!p) {
+		dump_header(NULL, gfp_mask, order, NULL);
+		read_unlock(&tasklist_lock);
+		panic("Out of memory and no killable processes...\n");
+	}
+
+	if (oom_kill_process(p, gfp_mask, order, points, NULL,
+			     "Out of memory"))
+		goto retry;
 	read_unlock(&tasklist_lock);
 
 	/*

> Running this LTP test a few times on the mmotm tree caused the system to
> hang hard:
> http://people.redhat.com/qcai/oom01.c
> 
> I tried to bisect, but only found that the problem was already present in
> the tree a few months back.
> 
> SysRq-W output indicated that kswapd0 might be stuck:
> [  373.943002] kswapd0       R  running task        0    34      2
> 0x00000000
> [  373.943002]  ffff88022abdbc80 ffffffff8146e4ce ffff88022abdbcb0
> ffffffff81232698
> [  373.943002]  0000000000000001 ffffffff81a248f0 0000000000000000
> 0000000000000000
> [  373.943002]  ffff88022abdbcc0 ffffffff8112d59d ffff88022abdbcd0
> ffffffff8146e4ce
> [  373.943002] Call Trace:
> [  373.943002]  [<ffffffff8146e4ce>] ? _raw_spin_lock+0xe/0x10
> [  373.943002]  [<ffffffff81232698>] ? __percpu_counter_sum+0x4d/0x63
> [  373.943002]  [<ffffffff8112d59d>] ? get_nr_inodes_unused+0x15/0x23
> [  373.943002]  [<ffffffff8146e4ce>] ? _raw_spin_lock+0xe/0x10
> [  373.943002]  [<ffffffff8146ea0e>] ? common_interrupt+0xe/0x13
> [  373.943002]  [<ffffffff810e2add>] ? balance_pgdat+0x29b/0x417
> [  373.943002]  [<ffffffff810e2e83>] ? kswapd+0x22a/0x240
> [  373.943002]  [<ffffffff8106af63>] ?
> autoremove_wake_function+0x0/0x39
> [  373.943002]  [<ffffffff810e2c59>] ? kswapd+0x0/0x240
> [  373.943002]  [<ffffffff8106aaae>] ? kthread+0x82/0x8a
> [  373.943002]  [<ffffffff8100bae4>] ? kernel_thread_helper+0x4/0x10
> [  373.943002]  [<ffffffff8106aa2c>] ? kthread+0x0/0x8a
> [  373.943002]  [<ffffffff8100bae0>] ? kernel_thread_helper+0x0/0x10
> 
> full SysRq-W output:
> [  373.943002] Sched Debug Version: v0.09, 2.6.37-rc3+ #1
> [  373.943002] now at 381511.273166 msecs
> [  373.943002]   .jiffies                                 :
> 4295041238
> [  373.943002]   .sysctl_sched_latency                    : 18.000000
> [  373.943002]   .sysctl_sched_min_granularity            : 2.250000
> [  373.943002]   .sysctl_sched_wakeup_granularity         : 3.000000
> [  373.943002]   .sysctl_sched_child_runs_first           : 0
> [  373.943002]   .sysctl_sched_features                   : 31855
> [  373.943002]   .sysctl_sched_tunable_scaling            : 1
> (logaritmic)
> [  373.943002] 
> [  373.943002] cpu#0, 2826.236 MHz
> [  373.943002]   .nr_running                    : 1
> [  373.943002]   .load                          : 1024
> [  373.943002]   .nr_switches                   : 69769
> [  373.943002]   .nr_load_updates               : 115459
> [  373.943002]   .nr_uninterruptible            : 0
> [  373.943002]   .next_balance                  : 4295.041289
> [  373.943002]   .curr->pid                     : 34
> [  373.943002]   .clock                         : 373942.002254
> [  373.943002]   .cpu_load[0]                   : 1024
> [  373.943002]   .cpu_load[1]                   : 1024
> [  373.943002]   .cpu_load[2]                   : 1024
> [  373.943002]   .cpu_load[3]                   : 1024
> [  373.943002]   .cpu_load[4]                   : 1024
> [  373.943002]   .yld_count                     : 100
> [  373.943002]   .sched_switch                  : 0
> [  373.943002]   .sched_count                   : 82123
> [  373.943002]   .sched_goidle                  : 26687
> [  373.943002]   .avg_idle                      : 1000000
> [  373.943002]   .ttwu_count                    : 30804
> [  373.943002]   .ttwu_local                    : 8525
> [  373.943002]   .bkl_count                     : 0
> [  373.943002] 
> [  373.943002] cfs_rq[0]:/
> [  373.943002]   .exec_clock                    : 107322.196661
> [  373.943002]   .MIN_vruntime                  : 0.000001
> [  373.943002]   .min_vruntime                  : 55990.920524
> [  373.943002]   .max_vruntime                  : 0.000001
> [  373.943002]   .spread                        : 0.000000
> [  373.943002]   .spread0                       : 0.000000
> [  373.943002]   .nr_running                    : 1
> [  373.943002]   .load                          : 1024
> [  373.943002]   .nr_spread_over                : 9
> [  373.943002]   .shares                        : 0
> [  373.943002] 
> [  373.943002] rt_rq[0]:/
> [  373.943002]   .rt_nr_running                 : 0
> [  373.943002]   .rt_throttled                  : 0
> [  373.943002]   .rt_time                       : 0.000000
> [  373.943002]   .rt_runtime                    : 1000.000000
> [  373.943002] 
> [  373.943002] runnable tasks:
> [  373.943002]             task   PID         tree-key  switches  prio
>     exec-runtime         sum-exec        sum-sleep
> [  373.943002]
> ----------------------------------------------------------------------------------------------------------
> [  373.943002] R        kswapd0    34     55990.920524     43568   120
>     55990.920524     38575.283576    287944.752314 /
> [  373.943002] 
> [  373.943002] cpu#1, 2826.236 MHz
> [  373.943002]   .nr_running                    : 2
> [  373.943002]   .load                          : 2048
> [  373.943002]   .nr_switches                   : 80939
> [  373.943002]   .nr_load_updates               : 141862
> [  373.943002]   .nr_uninterruptible            : 1
> [  373.943002]   .next_balance                  : 4295.041423
> [  373.943002]   .curr->pid                     : 925
> [  373.943002]   .clock                         : 382530.001465
> [  373.943002]   .cpu_load[0]                   : 2048
> [  373.943002]   .cpu_load[1]                   : 1920
> [  373.943002]   .cpu_load[2]                   : 1806
> [  373.943002]   .cpu_load[3]                   : 1743
> [  373.943002]   .cpu_load[4]                   : 1716
> [  373.943002]   .yld_count                     : 127
> [  373.943002]   .sched_switch                  : 0
> [  373.943002]   .sched_count                   : 87429
> [  373.943002]   .sched_goidle                  : 29877
> [  373.943002]   .avg_idle                      : 1000000
> [  373.943002]   .ttwu_count                    : 33588
> [  373.943002]   .ttwu_local                    : 9295
> [  373.943002]   .bkl_count                     : 0
> [  373.943002] 
> [  373.943002] cfs_rq[1]:/
> [  373.943002]   .exec_clock                    : 132931.075561
> [  373.943002]   .MIN_vruntime                  : 66573.481283
> [  373.943002]   .min_vruntime                  : 66573.481283
> [  373.943002]   .max_vruntime                  : 66573.481283
> [  373.943002]   .spread                        : 0.000000
> [  373.943002]   .spread0                       : 10582.560759
> [  373.943002]   .nr_running                    : 2
> [  373.943002]   .load                          : 2048
> [  373.943002]   .nr_spread_over                : 10
> [  373.943002]   .shares                        : 0
> [  373.943002] 
> [  373.943002] rt_rq[1]:/
> [  373.943002]   .rt_nr_running                 : 0
> [  373.943002]   .rt_throttled                  : 0
> [  373.943002]   .rt_time                       : 0.000000
> [  373.943002]   .rt_runtime                    : 850.000000
> [  373.943002] 
> [  373.943002] runnable tasks:
> [  373.943002]             task   PID         tree-key  switches  prio
>     exec-runtime         sum-exec        sum-sleep
> [  373.943002]
> ----------------------------------------------------------------------------------------------------------
> [  373.943002] R        rpcbind   925     75167.155023      3118   120
>     75167.155023     33682.358086    277604.838691 /
> [  373.943002]  console-kit-dae  1328     66573.481283       716   120
>     66573.481283      2306.020280    277814.482610 /
> [  373.943002] 
> [  373.943002] cpu#2, 2826.236 MHz
> [  373.943002]   .nr_running                    : 1
> [  373.943002]   .load                          : 1024
> [  373.943002]   .nr_switches                   : 25657
> [  373.943002]   .nr_load_updates               : 133265
> [  373.943002]   .nr_uninterruptible            : 6
> [  373.943002]   .next_balance                  : 4295.041381
> [  373.943002]   .curr->pid                     : 1473
> [  373.943002]   .clock                         : 382530.001959
> [  373.943002]   .cpu_load[0]                   : 1024
> [  373.943002]   .cpu_load[1]                   : 732
> [  373.943002]   .cpu_load[2]                   : 703
> [  373.943002]   .cpu_load[3]                   : 726
> [  373.943002]   .cpu_load[4]                   : 777
> [  373.943002]   .yld_count                     : 143
> [  373.943002]   .sched_switch                  : 0
> [  373.943002]   .sched_count                   : 33466
> [  373.943002]   .sched_goidle                  : 5814
> [  373.943002]   .avg_idle                      : 1000000
> [  373.943002]   .ttwu_count                    : 9228
> [  373.943002]   .ttwu_local                    : 6942
> [  373.943002]   .bkl_count                     : 0
> [  373.943002] 
> [  373.943002] cfs_rq[2]:/
> [  373.943002]   .exec_clock                    : 125235.081389
> [  373.943002]   .MIN_vruntime                  : 0.000001
> [  373.943002]   .min_vruntime                  : 64653.378538
> [  373.943002]   .max_vruntime                  : 0.000001
> [  373.943002]   .spread                        : 0.000000
> [  373.943002]   .spread0                       : 8662.458014
> [  373.943002]   .nr_running                    : 1
> [  373.943002]   .load                          : 1024
> [  373.943002]   .nr_spread_over                : 28
> [  373.943002]   .shares                        : 0
> [  373.943002] 
> [  373.943002] rt_rq[2]:/
> [  373.943002]   .rt_nr_running                 : 0
> [  373.943002]   .rt_throttled                  : 0
> [  373.943002]   .rt_time                       : 0.000000
> [  373.943002]   .rt_runtime                    : 1000.000000
> [  373.943002] 
> [  373.943002] runnable tasks:
> [  373.943002]             task   PID         tree-key  switches  prio
>     exec-runtime         sum-exec        sum-sleep
> [  373.943002]
> ----------------------------------------------------------------------------------------------------------
> [  373.943002] R          oom01  1473     64653.378538      3405   120
>     64653.378538     44153.912865      3897.833338 /
> [  373.943002] 
> [  373.943002] cpu#3, 2826.236 MHz
> [  373.943002]   .nr_running                    : 2
> [  373.943002]   .load                          : 2048
> [  373.943002]   .nr_switches                   : 27316
> [  373.943002]   .nr_load_updates               : 137905
> [  373.943002]   .nr_uninterruptible            : 5
> [  373.943002]   .next_balance                  : 4295.041253
> [  373.943002]   .curr->pid                     : 1336
> [  373.943002]   .clock                         : 382530.002311
> [  373.943002]   .cpu_load[0]                   : 2048
> [  373.943002]   .cpu_load[1]                   : 1980
> [  373.943002]   .cpu_load[2]                   : 1820
> [  373.943002]   .cpu_load[3]                   : 1754
> [  373.943002]   .cpu_load[4]                   : 1790
> [  373.943002]   .yld_count                     : 9
> [  373.943002]   .sched_switch                  : 0
> [  373.943002]   .sched_count                   : 36031
> [  373.943002]   .sched_goidle                  : 6309
> [  373.943002]   .avg_idle                      : 1000000
> [  373.943002]   .ttwu_count                    : 9803
> [  373.943002]   .ttwu_local                    : 7501
> [  373.943002]   .bkl_count                     : 0
> [  373.943002] 
> [  373.943002] cfs_rq[3]:/
> [  373.943002]   .exec_clock                    : 131690.185382
> [  373.943002]   .MIN_vruntime                  : 72546.296158
> [  373.943002]   .min_vruntime                  : 72546.296158
> [  373.943002]   .max_vruntime                  : 72546.296158
> [  373.943002]   .spread                        : 0.000000
> [  373.943002]   .spread0                       : 16555.375634
> [  373.943002]   .nr_running                    : 2
> [  373.943002]   .load                          : 2048
> [  373.943002]   .nr_spread_over                : 4
> [  373.943002]   .shares                        : 0
> [  373.943002] 
> [  373.943002] rt_rq[3]:/
> [  373.943002]   .rt_nr_running                 : 0
> [  373.943002]   .rt_throttled                  : 0
> [  373.943002]   .rt_time                       : 0.000000
> [  373.943002]   .rt_runtime                    : 950.000000
> [  373.943002] 
> [  373.943002] runnable tasks:
> [  373.943002]             task   PID         tree-key  switches  prio
>     exec-runtime         sum-exec        sum-sleep
> [  373.943002]
> ----------------------------------------------------------------------------------------------------------
> [  373.943002]       irqbalance   908     72546.296158      5882   120
>     72546.296158     30782.122083    264942.728048 /
> [  373.943002] R           bash  1336     81138.830657       744   120
>     81138.830657     10827.322162    278614.352123 /
> [  373.943002] 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxxx  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom policy in Canada: sign http://dissolvethecrtc.ca/
Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>


[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Announce]     [Bugtraq]     [Linux]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]