Re: [PATCH] oom_reaper: close race without using oom_lock

Tetsuo Handa wrote:
> Log is at http://I-love.SAKURA.ne.jp/tmp/serial-20170722.txt.xz .

Oops, I forgot to remove mmput_async() in Patch2. Below is the updated result.
The situation is the same, though: without Patch1 we can't tell whether we
raced with MMF_OOM_SKIP.

Patch1:
----------------------------------------
 include/linux/oom.h |  4 ++++
 mm/internal.h       |  4 ++++
 mm/oom_kill.c       | 28 +++++++++++++++++++++++++++-
 mm/page_alloc.c     | 10 +++++++---
 4 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 8a266e2..1b0bbb6 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -11,6 +11,7 @@
 struct notifier_block;
 struct mem_cgroup;
 struct task_struct;
+struct alloc_context;
 
 /*
  * Details of the page allocation that triggered the oom killer that are used to
@@ -39,6 +40,9 @@ struct oom_control {
 	unsigned long totalpages;
 	struct task_struct *chosen;
 	unsigned long chosen_points;
+
+	const struct alloc_context *alloc_context;
+	unsigned int alloc_flags;
 };
 
 extern struct mutex oom_lock;
diff --git a/mm/internal.h b/mm/internal.h
index 24d88f0..95a08b5 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -522,4 +522,8 @@ static inline bool is_migrate_highatomic_page(struct page *page)
 	return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC;
 }
 
+struct page *get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
+				    int alloc_flags,
+				    const struct alloc_context *ac);
+
 #endif	/* __MM_INTERNAL_H */
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 9e8b4f0..fb7b2c8 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -288,6 +288,9 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
 	return CONSTRAINT_NONE;
 }
 
+static unsigned int mmf_oom_skip_raced;
+static unsigned int mmf_oom_skip_not_raced;
+
 static int oom_evaluate_task(struct task_struct *task, void *arg)
 {
 	struct oom_control *oc = arg;
@@ -303,8 +306,21 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
 	 * any memory is quite low.
 	 */
 	if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
-		if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
+		if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) {
+			const struct alloc_context *ac = oc->alloc_context;
+
+			if (ac) {
+				struct page *page = get_page_from_freelist
+					(oc->gfp_mask, oc->order,
+					 oc->alloc_flags, ac);
+				if (page) {
+					__free_pages(page, oc->order);
+					mmf_oom_skip_raced++;
+				} else
+					mmf_oom_skip_not_raced++;
+			}
 			goto next;
+		}
 		goto abort;
 	}
 
@@ -1059,6 +1075,16 @@ bool out_of_memory(struct oom_control *oc)
 		 */
 		schedule_timeout_killable(1);
 	}
+	{
+		static unsigned long last;
+		unsigned long now = jiffies;
+
+		if (!last || time_after(now, last + 5 * HZ)) {
+			last = now;
+			pr_info("MMF_OOM_SKIP: raced=%u not_raced=%u\n",
+				mmf_oom_skip_raced, mmf_oom_skip_not_raced);
+		}
+	}
 	return !!oc->chosen;
 }
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 80e4adb..4cf2861 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3054,7 +3054,7 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
  * get_page_from_freelist goes through the zonelist trying to allocate
  * a page.
  */
-static struct page *
+struct page *
 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 						const struct alloc_context *ac)
 {
@@ -3245,7 +3245,8 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
 
 static inline struct page *
 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
-	const struct alloc_context *ac, unsigned long *did_some_progress)
+		      unsigned int alloc_flags, const struct alloc_context *ac,
+		      unsigned long *did_some_progress)
 {
 	struct oom_control oc = {
 		.zonelist = ac->zonelist,
@@ -3253,6 +3254,8 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
 		.memcg = NULL,
 		.gfp_mask = gfp_mask,
 		.order = order,
+		.alloc_context = ac,
+		.alloc_flags = alloc_flags,
 	};
 	struct page *page;
 
@@ -3955,7 +3958,8 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 		goto retry_cpuset;
 
 	/* Reclaim has failed us, start killing things */
-	page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
+	page = __alloc_pages_may_oom(gfp_mask, order, alloc_flags, ac,
+				     &did_some_progress);
 	if (page)
 		goto got_pg;
 
----------------------------------------
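
For reference, the debug hunk in oom_evaluate_task() boils down to the
following (a condensed, commented restatement of the code above; it is
taken only when oc->alloc_context is available, i.e. when we got here
from the page allocator):

	/*
	 * This victim's mm already has MMF_OOM_SKIP set.  Retry the
	 * failing allocation once: if it now succeeds, we entered the
	 * OOM killer only because we raced with MMF_OOM_SKIP being set
	 * after memory had already been reaped; otherwise memory is
	 * still genuinely short and moving on to the next victim is
	 * justified.
	 */
	page = get_page_from_freelist(oc->gfp_mask, oc->order,
				      oc->alloc_flags, oc->alloc_context);
	if (page) {
		__free_pages(page, oc->order);	/* probe only; give it back */
		mmf_oom_skip_raced++;
	} else {
		mmf_oom_skip_not_raced++;
	}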

Patch2:
----------------------------------------
 mm/mmap.c     |  7 +++++++
 mm/oom_kill.c | 41 +++++------------------------------------
 2 files changed, 12 insertions(+), 36 deletions(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index f19efcf..669f07d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2993,6 +2993,11 @@ void exit_mmap(struct mm_struct *mm)
 	/* Use -1 here to ensure all VMAs in the mm are unmapped */
 	unmap_vmas(&tlb, vma, 0, -1);
 
+	/*
+	 * oom reaper might race with exit_mmap so make sure we won't free
+	 * page tables or unmap VMAs under its feet
+	 */
+	down_write(&mm->mmap_sem);
 	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
 	tlb_finish_mmu(&tlb, 0, -1);
 
@@ -3005,7 +3010,9 @@ void exit_mmap(struct mm_struct *mm)
 			nr_accounted += vma_pages(vma);
 		vma = remove_vma(vma);
 	}
+	mm->mmap = NULL;
 	vm_unacct_memory(nr_accounted);
+	up_write(&mm->mmap_sem);
 }
 
 /* Insert vm structure into process list sorted by address
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index fb7b2c8..ed88355 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -486,39 +486,16 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 {
 	struct mmu_gather tlb;
 	struct vm_area_struct *vma;
-	bool ret = true;
-
-	/*
-	 * We have to make sure to not race with the victim exit path
-	 * and cause premature new oom victim selection:
-	 * __oom_reap_task_mm		exit_mm
-	 *   mmget_not_zero
-	 *				  mmput
-	 *				    atomic_dec_and_test
-	 *				  exit_oom_victim
-	 *				[...]
-	 *				out_of_memory
-	 *				  select_bad_process
-	 *				    # no TIF_MEMDIE task selects new victim
-	 *  unmap_page_range # frees some memory
-	 */
-	mutex_lock(&oom_lock);
 
 	if (!down_read_trylock(&mm->mmap_sem)) {
-		ret = false;
 		trace_skip_task_reaping(tsk->pid);
-		goto unlock_oom;
+		return false;
 	}
 
-	/*
-	 * increase mm_users only after we know we will reap something so
-	 * that the mmput_async is called only when we have reaped something
-	 * and delayed __mmput doesn't matter that much
-	 */
-	if (!mmget_not_zero(mm)) {
+	/* There is nothing to reap so bail out without signs in the log */
+	if (!mm->mmap) {
 		up_read(&mm->mmap_sem);
-		trace_skip_task_reaping(tsk->pid);
-		goto unlock_oom;
+		return true;
 	}
 
 	trace_start_task_reaping(tsk->pid);
@@ -558,16 +535,8 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 			K(get_mm_counter(mm, MM_SHMEMPAGES)));
 	up_read(&mm->mmap_sem);
 
-	/*
-	 * Drop our reference but make sure the mmput slow path is called from a
-	 * different context because we shouldn't risk we get stuck there and
-	 * put the oom_reaper out of the way.
-	 */
-	mmput_async(mm);
 	trace_finish_task_reaping(tsk->pid);
-unlock_oom:
-	mutex_unlock(&oom_lock);
-	return ret;
+	return true;
 }
 
 #define MAX_OOM_REAP_RETRIES 10
----------------------------------------
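
The exclusion Patch2 relies on, in the style of the comment it removes
(a sketch; timeline simplified):

	/*
	 * __oom_reap_task_mm			exit_mmap
	 *					  unmap_vmas
	 *					  down_write(&mm->mmap_sem)
	 *   down_read_trylock(&mm->mmap_sem)	  free_pgtables
	 *     fails -> retry later		  [...]
	 *					  mm->mmap = NULL
	 *					  up_write(&mm->mmap_sem)
	 *   down_read_trylock succeeds
	 *   mm->mmap == NULL
	 *     -> nothing left to reap, bail out quietly
	 */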

Patch3:
----------------------------------------
 mm/oom_kill.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ed88355..59737bf 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -306,7 +306,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
 	 * any memory is quite low.
 	 */
 	if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
-		if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) {
+		if (task->signal->oom_mm->async_put_work.func) {
 			const struct alloc_context *ac = oc->alloc_context;
 
 			if (ac) {
@@ -321,6 +321,8 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
 			}
 			goto next;
 		}
+		if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
+			task->signal->oom_mm->async_put_work.func = (void *) 1;
 		goto abort;
 	}
 
@@ -646,8 +648,10 @@ static void mark_oom_victim(struct task_struct *tsk)
 		return;
 
 	/* oom_mm is bound to the signal struct life time. */
-	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
+	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
 		mmgrab(tsk->signal->oom_mm);
+		tsk->signal->oom_mm->async_put_work.func = NULL;
+	}
 
 	/*
 	 * Make sure that the task is woken up from uninterruptible sleep
----------------------------------------
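
In other words, Patch3 reuses the otherwise idle mm->async_put_work.func
field as a "this victim was already seen with MMF_OOM_SKIP" marker (a
sketch of the resulting flow; debug-only field reuse):

	/*
	 * oom_evaluate_task(), for each task with tsk_is_oom_victim():
	 *
	 *   marker already set  -> probe the freelists (raced/not_raced)
	 *                          and skip this victim (goto next)
	 *   MMF_OOM_SKIP set    -> set the marker and abort victim
	 *     (first encounter)    selection for this round (goto abort)
	 *
	 * mark_oom_victim() clears the marker when oom_mm is recorded,
	 * so each victim gets one extra round before being passed over.
	 */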

Patch4:
----------------------------------------
 mm/page_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4cf2861..3e0e7da 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3265,7 +3265,7 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
 	 * Acquire the oom lock.  If that fails, somebody else is
 	 * making progress for us.
 	 */
-	if (!mutex_trylock(&oom_lock)) {
+	if (mutex_lock_killable(&oom_lock)) {
 		*did_some_progress = 1;
 		schedule_timeout_uninterruptible(1);
 		return NULL;
----------------------------------------
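
Spelled out, the behavioral difference at this call site (a sketch;
the surrounding code is unchanged):

	/*
	 * Through Patch3: trylock - if somebody else holds oom_lock,
	 * pretend progress was made and retry the allocation.
	 *
	 * Patch4: sleep until oom_lock is acquired or a fatal signal is
	 * pending; mutex_lock_killable() returns 0 on success and -EINTR
	 * when killed, so the "somebody else is making progress" fallback
	 * is now taken only by dying tasks rather than by every
	 * contending allocator.
	 */
	if (mutex_lock_killable(&oom_lock)) {
		*did_some_progress = 1;
		schedule_timeout_uninterruptible(1);
		return NULL;
	}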

Log is at http://I-love.SAKURA.ne.jp/tmp/serial-20170723.txt.xz .

# grep MMF_OOM_SKIP serial-20170723.txt | sed -e 's/=/ /g' | awk ' { if ($5 + $7) printf("%10u %10u %10f\n", $5, $7, ($5*100/($5+$7))); else printf("-----\n"); }'
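
(The three columns are mmf_oom_skip_raced, mmf_oom_skip_not_raced and the
raced percentage, i.e. raced * 100 / (raced + not_raced); e.g. the first
Patch1 sample is 42 * 100 / (42 + 72416) = 0.057965.)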

----------------------------------------
----- # Patch1
        42      72416   0.057965
       684     100569   0.675536
      1169     103432   1.117580
      1169     103843   1.113206
      1169     254979   0.456377
      1169     260675   0.446449
      1449     268899   0.535976
      1449     268905   0.535964
      1449     268927   0.535920
      1449     268965   0.535845
      1449     268990   0.535796
      1449     269089   0.535599
      1469     269307   0.542515
      1469     270651   0.539835
      1545     272860   0.563036
      1738     275991   0.625790
      1738     276321   0.625047
      1738     277121   0.623254
      1861     282203   0.655134
      2214     289569   0.758783
      2590     302229   0.849685
      3036     315279   0.953772
----- # Patch1 + Patch2
         0         21   0.000000
         0         45   0.000000
        12         79  13.186813
        12        159   7.017544
        12       2270   0.525855
        12       4750   0.251995
       178      15222   1.155844
       178      16997   1.036390
       178      19847   0.888889
       178      20645   0.854824
       178      23135   0.763522
       178      30479   0.580618
       178      32475   0.545126
       178      35060   0.505137
       178      36122   0.490358
       178      44854   0.395274
       178      49726   0.356685
       178      51619   0.343649
       178      57369   0.309312
       506      61344   0.818108
       506      63039   0.796286
       506      69691   0.720829
       506      83565   0.601872
       506      86330   0.582708
      1358     102218   1.311115
      1358     106653   1.257279
      1358     108003   1.241759
      1358     113901   1.178216
      1358     115739   1.159722
      1358     115739   1.159722
      1358     225671   0.598161
      1680     253286   0.658911
      9368     760763   1.216416
      9368     760852   1.216276
      9368     761841   1.214716
      9368     765167   1.209500
      9381     770368   1.203079
      9381     773975   1.197540
      9816     786044   1.233383
      9875     808291   1.206968
      9875     840890   1.160720
     10770     854555   1.244619
     10794     857956   1.242475
     10794     866148   1.230868
     11161     869111   1.267904
     11226     941179   1.178700
     11697     945889   1.221509
     12222     980317   1.231387
     12948    1038330   1.231644
     13157    1054693   1.232102
     14412    1077659   1.319694
     14953    1097134   1.344589
     15466    1252732   1.219526
----- # Patch1 + Patch2 + Patch3
         0          2   0.000000
         2         75   2.597403
        46        995   4.418828
       175       5416   3.130030
       358      15725   2.225953
       736      28838   2.488672
       736      36445   1.979506
      1008      63860   1.553925
      1008      75472   1.317992
      1008      78268   1.271507
      1408      95598   1.451457
      2142     141059   1.495800
      2537     215187   1.165237
      3123     222191   1.386066
      3478     318033   1.081767
      3618     505315   0.710899
      4768     615277   0.768976
      5939     825753   0.714086
      5939     926402   0.636999
      6969    1088325   0.636268
      7852    1361918   0.573235
----- # Patch1 + Patch2 + Patch3 + Patch4
         0         25   0.000000
         0        959   0.000000
        55       3868   1.401988
      3514      10387  25.278757
      5532      38260  12.632444
      7325      44891  14.028267
      7325      45320  13.913952
      7325      45320  13.913952
      7327      45322  13.916694
      8202      48418  14.486047
     11548      71310  13.937097
     14330      96425  12.938468
     14793     126763  10.450281
     14793     152881   8.822477
     14793     177491   7.693308
     19953     191976   9.414946
     19953     192330   9.399245
     19953     192684   9.383597
     19953     193750   9.336790
     19953     194106   9.321262
     50961     226093  18.393887
     54075     254175  17.542579
     54075     255039  17.493546
     54224     258917  17.316161
     54224     262745  17.107036
     55053     267306  17.078164
     56026     276647  16.841162
     56026     284621  16.446938
     58931     308741  16.028145
     64579     353502  15.446528
     81552     416345  16.379291
    102796     585118  14.943147
    125723     837199  13.056405
    153081    1010078  13.160797
    182049    1067762  14.566122
    184647    1111130  14.249906
----------------------------------------
