It is possible to remove the special pagefault oom handler by simply oom
locking all system zones and then calling directly into out_of_memory().

All populated zones must be oom locked (ZONE_OOM_LOCKED set); if any zone is
already locked, a parallel oom killing is in progress that will lead to
eventual memory freeing, so there is no need to kill another task.  The
context in which the pagefault is allocating memory is unknown to the oom
killer, so this is done on a system-wide level.

If a task has already been oom killed and hasn't fully exited yet, this is a
no-op since select_bad_process() recognizes tasks across the system with
TIF_MEMDIE set.

The special handling that determined whether a parallel memcg oom was in
progress is removed since future memory freeing can now be detected via
TIF_MEMDIE.  The memcg has already reached its memory limit, so it will still
need to kill a task regardless of the pagefault oom.

Signed-off-by: David Rientjes <rientjes@xxxxxxxxxx>
---
 include/linux/memcontrol.h |    6 ---
 mm/memcontrol.c            |   35 +---------------
 mm/oom_kill.c              |   97 ++++++++++++++++++++++++++------------------
 3 files changed, 58 insertions(+), 80 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -124,7 +124,6 @@ static inline bool mem_cgroup_disabled(void)
 	return false;
 }
 
-extern bool mem_cgroup_oom_called(struct task_struct *task);
 void mem_cgroup_update_file_mapped(struct page *page, int val);
 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 						gfp_t gfp_mask, int nid,
@@ -258,11 +257,6 @@ static inline bool mem_cgroup_disabled(void)
 	return true;
 }
 
-static inline bool mem_cgroup_oom_called(struct task_struct *task)
-{
-	return false;
-}
-
 static inline int
 mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -217,7 +217,6 @@ struct mem_cgroup {
 	 * Should the accounting and control be hierarchical, per subtree?
 	 */
 	bool use_hierarchy;
-	unsigned long last_oom_jiffies;
 	atomic_t refcnt;
 
 	unsigned int swappiness;
@@ -1205,34 +1204,6 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 	return total;
 }
 
-bool mem_cgroup_oom_called(struct task_struct *task)
-{
-	bool ret = false;
-	struct mem_cgroup *mem;
-	struct mm_struct *mm;
-
-	rcu_read_lock();
-	mm = task->mm;
-	if (!mm)
-		mm = &init_mm;
-	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
-	if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
-		ret = true;
-	rcu_read_unlock();
-	return ret;
-}
-
-static int record_last_oom_cb(struct mem_cgroup *mem, void *data)
-{
-	mem->last_oom_jiffies = jiffies;
-	return 0;
-}
-
-static void record_last_oom(struct mem_cgroup *mem)
-{
-	mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
-}
-
 /*
  * Currently used to update mapped file statistics, but the routine can be
  * generalized to update other statistics as well.
@@ -1484,10 +1455,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 			continue;
 
 		if (!nr_retries--) {
-			if (oom) {
+			if (oom)
 				mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
-				record_last_oom(mem_over_limit);
-			}
 			goto nomem;
 		}
 	}
@@ -2284,8 +2253,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
 
 /*
  * A call to try to shrink memory usage on charge failure at shmem's swapin.
- * Calling hierarchical_reclaim is not enough because we should update
- * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
 * Moreover considering hierarchy, we should reclaim from the mem_over_limit,
 * not from the memcg which this page would be charged to.
 * try_charge_swapin does all of these works properly.
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -580,6 +580,44 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
 }
 
 /*
+ * Try to acquire the oom killer lock for all system zones.  Returns zero if a
+ * parallel oom killing is taking place, otherwise locks all zones and returns
+ * non-zero.
+ */
+static int try_set_system_oom(void)
+{
+	struct zone *zone;
+	int ret = 1;
+
+	spin_lock(&zone_scan_lock);
+	for_each_populated_zone(zone)
+		if (zone_is_oom_locked(zone)) {
+			ret = 0;
+			goto out;
+		}
+	for_each_populated_zone(zone)
+		zone_set_flag(zone, ZONE_OOM_LOCKED);
+out:
+	spin_unlock(&zone_scan_lock);
+	return ret;
+}
+
+/*
+ * Clears ZONE_OOM_LOCKED for all system zones so that failed allocation
+ * attempts or page faults may now recall the oom killer, if necessary.
+ */
+static void clear_system_oom(void)
+{
+	struct zone *zone;
+
+	spin_lock(&zone_scan_lock);
+	for_each_populated_zone(zone)
+		zone_clear_flag(zone, ZONE_OOM_LOCKED);
+	spin_unlock(&zone_scan_lock);
+}
+
+
+/*
  * Must be called with tasklist_lock held for read.
  */
 static void __out_of_memory(gfp_t gfp_mask, int order,
@@ -614,46 +652,9 @@ retry:
 		goto retry;
 }
 
-/*
- * pagefault handler calls into here because it is out of memory but
- * doesn't know exactly how or why.
- */
-void pagefault_out_of_memory(void)
-{
-	unsigned long freed = 0;
-
-	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
-	if (freed > 0)
-		/* Got some memory back in the last second. */
-		return;
-
-	/*
-	 * If this is from memcg, oom-killer is already invoked.
-	 * and not worth to go system-wide-oom.
-	 */
-	if (mem_cgroup_oom_called(current))
-		goto rest_and_return;
-
-	if (sysctl_panic_on_oom)
-		panic("out of memory from page fault. panic_on_oom is selected.\n");
-
-	read_lock(&tasklist_lock);
-	/* unknown gfp_mask and order */
-	__out_of_memory(0, 0, CONSTRAINT_NONE, NULL);
-	read_unlock(&tasklist_lock);
-
-	/*
-	 * Give "p" a good chance of killing itself before we
-	 * retry to allocate memory.
-	 */
-rest_and_return:
-	if (!test_thread_flag(TIF_MEMDIE))
-		schedule_timeout_uninterruptible(1);
-}
-
 /**
  * out_of_memory - kill the "best" process when we run out of memory
- * @zonelist: zonelist pointer
+ * @zonelist: zonelist pointer passed to page allocator
  * @gfp_mask: memory allocation flags
  * @order: amount of memory being requested as a power of 2
  * @nodemask: nodemask passed to page allocator
@@ -667,7 +668,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 		int order, nodemask_t *nodemask)
 {
 	unsigned long freed = 0;
-	enum oom_constraint constraint;
+	enum oom_constraint constraint = CONSTRAINT_NONE;
 
 	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
 	if (freed > 0)
@@ -683,7 +684,8 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 	 * Check if there were limitations on the allocation (only relevant for
 	 * NUMA) that may require different handling.
 	 */
-	constraint = constrained_alloc(zonelist, gfp_mask, nodemask);
+	if (zonelist)
+		constraint = constrained_alloc(zonelist, gfp_mask, nodemask);
 	read_lock(&tasklist_lock);
 	if (unlikely(sysctl_panic_on_oom)) {
 		/*
@@ -693,6 +695,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 		 */
 		if (constraint == CONSTRAINT_NONE) {
 			dump_header(NULL, gfp_mask, order, NULL);
+			read_unlock(&tasklist_lock);
 			panic("Out of memory: panic_on_oom is enabled\n");
 		}
 	}
@@ -706,3 +709,17 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 	if (!test_thread_flag(TIF_MEMDIE))
 		schedule_timeout_uninterruptible(1);
 }
+
+/*
+ * The pagefault handler calls here because it is out of memory, so kill a
+ * memory-hogging task.  If a populated zone has ZONE_OOM_LOCKED set, a parallel
+ * oom killing is already in progress so do nothing.  If a task is found with
+ * TIF_MEMDIE set, it has been killed so do nothing and allow it to exit.
+ */
+void pagefault_out_of_memory(void)
+{
+	if (!try_set_system_oom())
+		return;
+	out_of_memory(NULL, 0, 0, NULL);
+	clear_system_oom();
+}
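
For reference (not part of this patch): the way a fault reaches
pagefault_out_of_memory() is through the arch fault handler, after
handle_mm_fault() returns VM_FAULT_OOM.  The sketch below is condensed from
the 2.6.32-era x86 handler (arch/x86/mm/fault.c); the exact structure and
helper names vary by architecture and kernel version:

static noinline void
mm_fault_error(struct pt_regs *regs, unsigned long error_code,
	       unsigned long address, unsigned int fault)
{
	if (fault & VM_FAULT_OOM) {
		/*
		 * No gfp_mask, order, or zonelist is available at this
		 * point, so drop mmap_sem and invoke the oom killer
		 * system-wide; it returns immediately if a parallel oom
		 * kill already holds the zone oom locks.
		 */
		up_read(&current->mm->mmap_sem);
		pagefault_out_of_memory();
	} else if (fault & VM_FAULT_SIGBUS) {
		do_sigbus(regs, error_code, address);
	} else {
		BUG();
	}
}

On return, userspace retries the faulting instruction; if current was the
task selected by the oom killer, it has TIF_MEMDIE set and will exit instead
of looping.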