Re: [PATCH] mm: fix unevictable page reclaim when calling madvise_pageout

On Tue, Oct 29, 2019 at 06:45:12PM +0800, zhong jiang wrote:
> On 2019/10/29 17:40, Michal Hocko wrote:
> > On Tue 29-10-19 17:30:57, zhong jiang wrote:
> >> On 2019/10/29 16:11, Michal Hocko wrote:
> >>> [Cc Minchan]
> > [...]
> >>> Removing a long existing BUG_ON begs for a much better explanation.
> >>> shrink_page_list is not a trivial piece of code but I _suspect_ that
> >>> removing it should be ok for mapped pages at least (try_to_unmap) but I
> >>> am not so sure how unmapped unevictable pages are handled off the top of my
> >>> head.
> >> As for the unmapped unevictable pages, shrink_page_list has taken that into account:
> >>
> >> shrink_page_list
> >>      page_evictable     --> filters out unevictable pages and puts them back on their LRU
> > Ohh, it is right there at the top. Missed it. The check was added
> > by Nick along with the BUG_ON, so it sounds more like a "this
> > shouldn't happen" BUG_ON. I wouldn't mind removing it with that
> > justification.
> As you said, Minchan fixed the same kind of bug by checking PageUnevictable (I did not notice that before).
> Let's wait for Minchan to see whether he has a better reason. Thanks,
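
For context, the path being discussed looks roughly like this (a trimmed
sketch of the 5.4-era shrink_page_list() in mm/vmscan.c, not the verbatim
code):

	while (!list_empty(page_list)) {
		struct page *page = lru_to_page(page_list);
		...
		/* the filter quoted above: unevictable pages bail out */
		if (unlikely(!page_evictable(page)))
			goto activate_locked;
		...
activate_locked:
		/* mlocked pages are not re-activated */
		if (!PageMlocked(page))
			SetPageActive(page);
keep_locked:
		unlock_page(page);
keep:
		list_add(&page->lru, &ret_pages);
		/* PageUnevictable is still set here, so this can fire */
		VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
	}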

madvise_pageout can work on a shared page, and one of the VMAs among the
processes sharing it could be mlocked, so madvise_pageout can end up passing
unevictable LRU pages into shrink_page_list. It's pointless to try to
reclaim unevictable pages in the first place, so I want to fix
madvise_pageout by introducing an only_evictable flag into the
isolate_lru_page API, which madvise_pageout passes as "true".
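
For what it's worth, the scenario is straightforward to construct from
userspace. A minimal sketch (my own illustration, not a reproducer from this
thread; it assumes a 5.4-rc kernel with MADV_PAGEOUT and CONFIG_DEBUG_VM so
the VM_BUG_ON is armed):

#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef MADV_PAGEOUT
#define MADV_PAGEOUT 21			/* since Linux 5.4 */
#endif

int main(void)
{
	size_t len = 16 * 4096;

	/* Shared anonymous mapping: both processes map the same pages. */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(p, 1, len);		/* fault the pages in */

	pid_t pid = fork();
	if (pid == 0) {
		/*
		 * The child mlocks its VMA, which moves the shared pages
		 * to the unevictable LRU.
		 */
		if (mlock(p, len))
			perror("mlock");
		pause();
		_exit(0);
	}
	sleep(1);			/* crude: wait for the child's mlock */

	/*
	 * The parent's VMA is not VM_LOCKED, so nothing stops this call;
	 * madvise_pageout now feeds unevictable LRU pages to
	 * shrink_page_list.
	 */
	if (madvise(p, len, MADV_PAGEOUT))
		perror("madvise");

	kill(pid, SIGKILL);
	wait(NULL);
	return 0;
}

The parent's madvise() succeeds because only the child's VMA is VM_LOCKED;
the unevictable state lives on the page, not on the calling VMA, which is
why the filter belongs in isolate_lru_page.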

If we want to remove the PageUnevictable VM_BUG_ON_PAGE in shrink_page_list,
I want to see a stronger reason why it happens and why the caller couldn't
filter those pages out from the beginning.

diff --git a/mm/gup.c b/mm/gup.c
index 8f236a335ae9..d1ad1c3ec596 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1468,7 +1468,7 @@ static long check_and_migrate_cma_pages(struct task_struct *tsk,
 					drain_allow = false;
 				}
 
-				if (!isolate_lru_page(head)) {
+				if (!isolate_lru_page(head, false)) {
 					list_add_tail(&head->lru, &cma_page_list);
 					mod_node_page_state(page_pgdat(head),
 							    NR_ISOLATED_ANON +
diff --git a/mm/internal.h b/mm/internal.h
index 0d5f720c75ab..13319612bef0 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -85,7 +85,7 @@ extern unsigned long highest_memmap_pfn;
 /*
  * in mm/vmscan.c:
  */
-extern int isolate_lru_page(struct page *page);
+extern int isolate_lru_page(struct page *page, bool only_evictable);
 extern void putback_lru_page(struct page *page);
 
 /*
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 0a1b4b484ac5..095560f7f8ec 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -609,7 +609,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		 * Isolate the page to avoid collapsing an hugepage
 		 * currently in use by the VM.
 		 */
-		if (isolate_lru_page(page)) {
+		if (isolate_lru_page(page, false)) {
 			unlock_page(page);
 			result = SCAN_DEL_PAGE_LRU;
 			goto out;
@@ -1642,7 +1642,7 @@ static void collapse_file(struct mm_struct *mm,
 			goto out_unlock;
 		}
 
-		if (isolate_lru_page(page)) {
+		if (isolate_lru_page(page, false)) {
 			result = SCAN_DEL_PAGE_LRU;
 			goto out_unlock;
 		}
diff --git a/mm/madvise.c b/mm/madvise.c
index 2be9f3fdb05e..2639de560a0b 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -363,7 +363,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 		ClearPageReferenced(page);
 		test_and_clear_page_young(page);
 		if (pageout) {
-			if (!isolate_lru_page(page))
+			if (!isolate_lru_page(page, true))
 				list_add(&page->lru, &page_list);
 		} else
 			deactivate_page(page);
@@ -441,7 +441,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 		ClearPageReferenced(page);
 		test_and_clear_page_young(page);
 		if (pageout) {
-			if (!isolate_lru_page(page))
+			if (!isolate_lru_page(page, true))
 				list_add(&page->lru, &page_list);
 		} else
 			deactivate_page(page);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 363106578876..6d913215b074 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5847,7 +5847,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
 		target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
 		if (target_type == MC_TARGET_PAGE) {
 			page = target.page;
-			if (!isolate_lru_page(page)) {
+			if (!isolate_lru_page(page, false)) {
 				if (!mem_cgroup_move_account(page, true,
 							     mc.from, mc.to)) {
 					mc.precharge -= HPAGE_PMD_NR;
@@ -5895,7 +5895,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
 			 */
 			if (PageTransCompound(page))
 				goto put;
-			if (!device && isolate_lru_page(page))
+			if (!device && isolate_lru_page(page, false))
 				goto put;
 			if (!mem_cgroup_move_account(page, false,
 						mc.from, mc.to)) {
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 3151c87dff73..ef37c67a7bab 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -567,7 +567,7 @@ static const char * const action_page_types[] = {
  */
 static int delete_from_lru_cache(struct page *p)
 {
-	if (!isolate_lru_page(p)) {
+	if (!isolate_lru_page(p, false)) {
 		/*
 		 * Clear sensible page flags, so that the buddy system won't
 		 * complain when the page is unpoison-and-freed.
@@ -1782,7 +1782,7 @@ static int __soft_offline_page(struct page *page, int flags)
 	 * handles a large number of cases for us.
 	 */
 	if (PageLRU(page))
-		ret = isolate_lru_page(page);
+		ret = isolate_lru_page(page, false);
 	else
 		ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
 	/*
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index df570e5c71cc..8ba483d3d8cd 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1314,7 +1314,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 		 */
 		if (PageHWPoison(page)) {
 			if (WARN_ON(PageLRU(page)))
-				isolate_lru_page(page);
+				isolate_lru_page(page, false);
 			if (page_mapped(page))
 				try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS);
 			continue;
@@ -1327,7 +1327,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 		 * LRU and non-lru movable pages.
 		 */
 		if (PageLRU(page))
-			ret = isolate_lru_page(page);
+			ret = isolate_lru_page(page, false);
 		else
 			ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
 		if (!ret) { /* Success */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4ae967bcf954..585e5845f071 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -974,7 +974,7 @@ static int migrate_page_add(struct page *page, struct list_head *pagelist,
 	 * Avoid migrating a page that is shared with others.
 	 */
 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
-		if (!isolate_lru_page(head)) {
+		if (!isolate_lru_page(head, false)) {
 			list_add_tail(&head->lru, pagelist);
 			mod_node_page_state(page_pgdat(head),
 				NR_ISOLATED_ANON + page_is_file_cache(head),
diff --git a/mm/migrate.c b/mm/migrate.c
index 4fe45d1428c8..710e00317a8f 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1563,7 +1563,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
 		struct page *head;
 
 		head = compound_head(page);
-		err = isolate_lru_page(head);
+		err = isolate_lru_page(head, false);
 		if (err)
 			goto out_putpage;
 
@@ -1895,7 +1895,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
 	if (!migrate_balanced_pgdat(pgdat, compound_nr(page)))
 		return 0;
 
-	if (isolate_lru_page(page))
+	if (isolate_lru_page(page, false))
 		return 0;
 
 	/*
@@ -2450,7 +2450,7 @@ static void migrate_vma_prepare(struct migrate_vma *migrate)
 				allow_drain = false;
 			}
 
-			if (isolate_lru_page(page)) {
+			if (isolate_lru_page(page, false)) {
 				if (remap) {
 					migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
 					migrate->cpages--;
diff --git a/mm/mlock.c b/mm/mlock.c
index a72c1eeded77..307e340fe2e0 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -70,7 +70,7 @@ void clear_page_mlock(struct page *page)
 	 *
 	 * See __pagevec_lru_add_fn for more explanation.
 	 */
-	if (!isolate_lru_page(page)) {
+	if (!isolate_lru_page(page, false)) {
 		putback_lru_page(page);
 	} else {
 		/*
@@ -97,7 +97,7 @@ void mlock_vma_page(struct page *page)
 		mod_zone_page_state(page_zone(page), NR_MLOCK,
 				    hpage_nr_pages(page));
 		count_vm_event(UNEVICTABLE_PGMLOCKED);
-		if (!isolate_lru_page(page))
+		if (!isolate_lru_page(page, false))
 			putback_lru_page(page);
 	}
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ee4eecc7e1c2..c44fb52c745f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1793,7 +1793,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
  * (2) the lru_lock must not be held.
  * (3) interrupts must be enabled.
  */
-int isolate_lru_page(struct page *page)
+int isolate_lru_page(struct page *page, bool only_evictable)
 {
 	int ret = -EBUSY;
 
@@ -1805,6 +1805,8 @@ int isolate_lru_page(struct page *page)
 		struct lruvec *lruvec;
 
 		spin_lock_irq(&pgdat->lru_lock);
+		if (only_evictable && PageUnevictable(page))
+			goto out;
 		lruvec = mem_cgroup_page_lruvec(page, pgdat);
 		if (PageLRU(page)) {
 			int lru = page_lru(page);
@@ -1813,6 +1815,7 @@ int isolate_lru_page(struct page *page)
 			del_page_from_lru_list(page, lruvec, lru);
 			ret = 0;
 		}
+out:
 		spin_unlock_irq(&pgdat->lru_lock);
 	}
 	return ret;



