Re: mm: Can we bail out p?d_alloc() loops upon SIGKILL?

On 2019/02/28 18:26, Michal Hocko wrote:
> We cannot do anything about the preemption so that is moot. ALLOC_OOM
> reserve is limited so the failure should happen sooner or later. But

The problem is that preemption can slow down ALLOC_OOM allocations (at e.g.
cond_resched() from the direct reclaim path). Since concurrently allocating
threads can consume CPU time, the OOM reaper can time out before the OOM
victim completes (or fails) its ALLOC_OOM allocations.
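
For context, the reaper's time limit comes from the bounded retry loop in
oom_reap_task(); roughly, paraphrased from mm/oom_kill.c of this era and
not verbatim:

  #define MAX_OOM_REAP_RETRIES 10

  static void oom_reap_task(struct task_struct *tsk)
  {
  	int attempts = 0;
  	struct mm_struct *mm = tsk->signal->oom_mm;

  	/* Up to 10 attempts with a HZ/10 sleep in between: about one second. */
  	while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
  		schedule_timeout_idle(HZ / 10);

  	if (attempts > MAX_OOM_REAP_RETRIES)
  		pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
  			task_pid_nr(tsk), tsk->comm);
  }

If the victim cannot complete (or fail) its allocations within that window,
the reaper gives up.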

> I would be OK to check for fatal_signal_pending once per pmd or so if
> that helps and it doesn't add a noticeable overhead.

Another option is to scatter __GFP_NOMEMALLOC over allocations which can
be reached from the fork() path.
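
For reference, the per-pmd check suggested above could be as small as the
following untested fragment (its placement in copy_page_range()'s pmd walk
is illustrative only):

  /* Untested sketch: bail out of the fork()-time copy loop once the
   * forking task has been OOM-killed, instead of digging into reserves. */
  do {
  	if (fatal_signal_pending(current))
  		return -EINTR;	/* let copy_page_range()/fork() fail fast */
  	...
  } while (dst_pmd++, src_pmd++, addr = next, addr != end);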

> 
>> Technically, it would be possible to use a per task_struct flag
>> which allows __alloc_pages_nodemask() to check early and bail out:
>>
>>   down_write(&current->mm->mmap_sem);
>>   current->no_oom_alloc = 1;
>>   while (...) {
>>       p?d_alloc();
>>   }
>>   current->no_oom_alloc = 0;
>>   up_write(&current->mm->mmap_sem);
> 
> Looks like a hack to me. We already do have __GFP_NOMEMALLOC,
> __GFP_MEMALLOC and PF_MEMALLOC and you want yet another way to control
> access to reserves. This is a mess.

The intention is to fail the allocation as quickly as possible, rather than
to avoid consumption of memory reserves. Since the OOM reaper gives up after
just one second, being able to quickly exit the allocation loop and release
mmap_sem held for write is important for allowing the OOM reaper to reclaim
memory from the OOM victim. (I wish __GFP_KILLABLE were there...)
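
If __GFP_KILLABLE existed, the bail-out might look like the following
untested sketch near the top of the allocator slowpath; both the flag and
its placement are hypothetical:

  /* Hypothetical __GFP_KILLABLE: fail fast for a fatally-signalled task
   * so the caller can unwind and drop mmap_sem quickly. */
  if (unlikely((gfp_mask & __GFP_KILLABLE) && fatal_signal_pending(current)))
  	goto nopage;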

>                                     If anything then PF_NOMEMALLOC would
> be a better fit but the flag space is quite tight already. Besides that
> is this really worth doing when the caller can bail out?

Scattering __GFP_NOMEMALLOC (as in the draft patch shown below) reduces the
frequency of failing to reclaim memory from the OOM victim. It cannot be
perfect, because the OOM victim might still be blocked at e.g. down_write()
or cond_resched() in __alloc_pages_nodemask(), but could callers using
GFP_KERNEL_ACCOUNT allocations afford __GFP_NOMEMALLOC?



diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index a281e61..fef88fb 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -102,7 +102,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
 	struct page *page;
-	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO;
+	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOMEMALLOC;
 
 	if (mm == &init_mm)
 		gfp &= ~__GFP_ACCOUNT;
@@ -162,7 +162,7 @@ static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d, pud_t *pu
 
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
-	gfp_t gfp = GFP_KERNEL_ACCOUNT;
+	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_NOMEMALLOC;
 
 	if (mm == &init_mm)
 		gfp &= ~__GFP_ACCOUNT;
@@ -202,7 +202,7 @@ static inline void pgd_populate_safe(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4
 
 static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
-	gfp_t gfp = GFP_KERNEL_ACCOUNT;
+	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_NOMEMALLOC;
 
 	if (mm == &init_mm)
 		gfp &= ~__GFP_ACCOUNT;
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 7bd0170..2a36287 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -21,7 +21,7 @@
 #define PGALLOC_USER_GFP 0
 #endif
 
-gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;
+gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP | __GFP_NOMEMALLOC;
 
 pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
 {
diff --git a/kernel/fork.c b/kernel/fork.c
index b69248e..57f0b54 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -338,7 +338,7 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
 
 struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 {
-	struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+	struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL | __GFP_NOMEMALLOC);
 
 	if (new) {
 		*new = *orig;
diff --git a/mm/memory.c b/mm/memory.c
index e11ca9d..0f27d67 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4574,7 +4574,7 @@ bool ptlock_alloc(struct page *page)
 {
 	spinlock_t *ptl;
 
-	ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
+	ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL | __GFP_NOMEMALLOC);
 	if (!ptl)
 		return false;
 	page->ptl = ptl;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 26ea863..d81b0f8 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -41,6 +41,7 @@
 #include <linux/kthread.h>
 #include <linux/init.h>
 #include <linux/mmu_notifier.h>
+#include <linux/sched/debug.h>
 
 #include <asm/tlb.h>
 #include "internal.h"
@@ -610,6 +611,7 @@ static void oom_reap_task(struct task_struct *tsk)
 
 	pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
 		task_pid_nr(tsk), tsk->comm);
+	sched_show_task(tsk);
 	debug_show_all_locks();
 
 done:
diff --git a/mm/rmap.c b/mm/rmap.c
index 0454ecc2..332743c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -270,7 +270,7 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
 		if (unlikely(!avc)) {
 			unlock_anon_vma_root(root);
 			root = NULL;
-			avc = anon_vma_chain_alloc(GFP_KERNEL);
+			avc = anon_vma_chain_alloc(GFP_KERNEL | __GFP_NOMEMALLOC);
 			if (!avc)
 				goto enomem_failure;
 		}
@@ -341,7 +341,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
 	anon_vma = anon_vma_alloc();
 	if (!anon_vma)
 		goto out_error;
-	avc = anon_vma_chain_alloc(GFP_KERNEL);
+	avc = anon_vma_chain_alloc(GFP_KERNEL | __GFP_NOMEMALLOC);
 	if (!avc)
 		goto out_error_free_anon_vma;
 



