[RFC 2/4] Add non-swap page flag to mark a page that will not be swapped

After the swap driver marks a page with the non-swap flag, the page is
added to the unevictable LRU list. The page is kept in that state until
its data changes: its PTEs stay write-protected, so the first write
faults, clears the flag, and moves the page back to an evictable LRU
(see the do_wp_page() and do_swap_page() changes below).
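
For reference, the intended driver-side usage is roughly the following
(a minimal sketch, not part of this series: mark_page_non_swap() and its
calling context are hypothetical, and only the PageNonSwap()/
SetPageNonSwap() helpers are added by this patch):

	/* Sketch: a swap driver calls this for a page it has decided
	 * should never be written out to swap again. */
	static void mark_page_non_swap(struct page *page)
	{
		SetPageNonSwap(page);
		/* The next add_page_to_lru_list() sees the flag, puts
		 * the page on LRU_UNEVICTABLE, and accounts it under
		 * NR_NON_SWAP (shown as "NonSwap:" in /proc/meminfo). */
	}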

Signed-off-by: Hui Zhu <zhuhui@xxxxxxxxxx>
---
 fs/proc/meminfo.c              |  6 ++++++
 include/linux/mm_inline.h      | 20 ++++++++++++++++++--
 include/linux/mmzone.h         |  3 +++
 include/linux/page-flags.h     |  8 ++++++++
 include/trace/events/mmflags.h |  9 ++++++++-
 kernel/events/uprobes.c        | 16 +++++++++++++++-
 mm/Kconfig                     |  5 +++++
 mm/memory.c                    | 34 ++++++++++++++++++++++++++++++++++
 mm/migrate.c                   |  4 ++++
 mm/mprotect.c                  |  8 ++++++++
 mm/vmscan.c                    | 41 ++++++++++++++++++++++++++++++++++++++++-
 11 files changed, 149 insertions(+), 5 deletions(-)

diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index b9a8c81..5c79b2e 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -79,6 +79,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 #endif
 		"SwapTotal:      %8lu kB\n"
 		"SwapFree:       %8lu kB\n"
+#ifdef CONFIG_NON_SWAP
+		"NonSwap:        %8lu kB\n"
+#endif
 		"Dirty:          %8lu kB\n"
 		"Writeback:      %8lu kB\n"
 		"AnonPages:      %8lu kB\n"
@@ -138,6 +141,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 #endif
 		K(i.totalswap),
 		K(i.freeswap),
+#ifdef CONFIG_NON_SWAP
+		K(global_page_state(NR_NON_SWAP)),
+#endif
 		K(global_node_page_state(NR_FILE_DIRTY)),
 		K(global_node_page_state(NR_WRITEBACK)),
 		K(global_node_page_state(NR_ANON_MAPPED)),
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 71613e8..92298ce 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -46,15 +46,31 @@ static __always_inline void update_lru_size(struct lruvec *lruvec,
 static __always_inline void add_page_to_lru_list(struct page *page,
 				struct lruvec *lruvec, enum lru_list lru)
 {
-	update_lru_size(lruvec, lru, page_zonenum(page), hpage_nr_pages(page));
+	int nr_pages = hpage_nr_pages(page);
+	enum zone_type zid = page_zonenum(page);
+#ifdef CONFIG_NON_SWAP
+	if (PageNonSwap(page)) {
+		lru = LRU_UNEVICTABLE;
+		update_lru_size(lruvec, NR_NON_SWAP, zid, nr_pages);
+	}
+#endif
+	update_lru_size(lruvec, lru, zid, nr_pages);
 	list_add(&page->lru, &lruvec->lists[lru]);
 }
 
 static __always_inline void del_page_from_lru_list(struct page *page,
 				struct lruvec *lruvec, enum lru_list lru)
 {
+	int nr_pages = hpage_nr_pages(page);
+	enum zone_type zid = page_zonenum(page);
+#ifdef CONFIG_NON_SWAP
+	if (PageNonSwap(page)) {
+		lru = LRU_UNEVICTABLE;
+		update_lru_size(lruvec, NR_NON_SWAP, zid, -nr_pages);
+	}
+#endif
 	list_del(&page->lru);
-	update_lru_size(lruvec, lru, page_zonenum(page), -hpage_nr_pages(page));
+	update_lru_size(lruvec, lru, zid, -nr_pages);
 }
 
 /**
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index d572b78..da08d20 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -138,6 +138,9 @@ enum zone_stat_item {
 	NUMA_OTHER,		/* allocation from other node */
 #endif
 	NR_FREE_CMA_PAGES,
+#ifdef CONFIG_NON_SWAP
+	NR_NON_SWAP,
+#endif
 	NR_VM_ZONE_STAT_ITEMS };
 
 enum node_stat_item {
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 74e4dda..0cd80db9 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -105,6 +105,9 @@ enum pageflags {
 	PG_young,
 	PG_idle,
 #endif
+#ifdef CONFIG_NON_SWAP
+	PG_non_swap,
+#endif
 	__NR_PAGEFLAGS,
 
 	/* Filesystems */
@@ -303,6 +306,11 @@ PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL)
 PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND)
 	TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND)
 
+#ifdef CONFIG_NON_SWAP
+PAGEFLAG(NonSwap, non_swap, PF_NO_TAIL)
+	TESTSCFLAG(NonSwap, non_swap, PF_NO_TAIL)
+#endif
+
 #ifdef CONFIG_HIGHMEM
 /*
  * Must use a macro here due to header dependency issues. page_zone() is not
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 5a81ab4..1c0ccc9 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -79,6 +79,12 @@
 #define IF_HAVE_PG_IDLE(flag,string)
 #endif
 
+#ifdef CONFIG_NON_SWAP
+#define IF_HAVE_PG_NON_SWAP(flag,string) ,{1UL << flag, string}
+#else
+#define IF_HAVE_PG_NON_SWAP(flag,string)
+#endif
+
 #define __def_pageflag_names						\
 	{1UL << PG_locked,		"locked"	},		\
 	{1UL << PG_error,		"error"		},		\
@@ -104,7 +110,8 @@ IF_HAVE_PG_MLOCK(PG_mlocked,		"mlocked"	)		\
 IF_HAVE_PG_UNCACHED(PG_uncached,	"uncached"	)		\
 IF_HAVE_PG_HWPOISON(PG_hwpoison,	"hwpoison"	)		\
 IF_HAVE_PG_IDLE(PG_young,		"young"		)		\
-IF_HAVE_PG_IDLE(PG_idle,		"idle"		)
+IF_HAVE_PG_IDLE(PG_idle,		"idle"		)		\
+IF_HAVE_PG_NON_SWAP(PG_non_swap,	"non_swap"	)
 
 #define show_page_flags(flags)						\
 	(flags) ? __print_flags(flags, "|",				\
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index b7a525a..a7e4153 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -160,6 +160,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 	const unsigned long mmun_start = addr;
 	const unsigned long mmun_end   = addr + PAGE_SIZE;
 	struct mem_cgroup *memcg;
+	pte_t pte;
+#ifdef CONFIG_NON_SWAP
+	bool non_swap;
+#endif
 
 	err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg,
 			false);
@@ -176,6 +180,11 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 		goto unlock;
 
 	get_page(kpage);
+#ifdef CONFIG_NON_SWAP
+	non_swap = TestClearPageNonSwap(page);
+	if (non_swap)
+		SetPageNonSwap(kpage);
+#endif
 	page_add_new_anon_rmap(kpage, vma, addr, false);
 	mem_cgroup_commit_charge(kpage, memcg, false, false);
 	lru_cache_add_active_or_unevictable(kpage, vma);
@@ -187,7 +196,12 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 
 	flush_cache_page(vma, addr, pte_pfn(*ptep));
 	ptep_clear_flush_notify(vma, addr, ptep);
-	set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
+	pte = mk_pte(kpage, vma->vm_page_prot);
+#ifdef CONFIG_NON_SWAP
+	if (non_swap)
+		pte = pte_wrprotect(pte);
+#endif
+	set_pte_at_notify(mm, addr, ptep, pte);
 
 	page_remove_rmap(page, false);
 	if (!page_mapped(page))
diff --git a/mm/Kconfig b/mm/Kconfig
index 57ecdb3..d8d4b41 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -708,3 +708,8 @@ config ARCH_HAS_PKEYS
 config LATE_UNMAP
 	bool
 	depends on SWAP
+
+config NON_SWAP
+	bool
+	depends on SWAP
+	select LATE_UNMAP
diff --git a/mm/memory.c b/mm/memory.c
index 83be99d..2448004 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -64,6 +64,7 @@
 #include <linux/debugfs.h>
 #include <linux/userfaultfd_k.h>
 #include <linux/dax.h>
+#include <linux/mm_inline.h>
 
 #include <asm/io.h>
 #include <asm/mmu_context.h>
@@ -2338,6 +2339,26 @@ static int wp_page_shared(struct fault_env *fe, pte_t orig_pte,
 	return wp_page_reuse(fe, orig_pte, old_page, page_mkwrite, 1);
 }
 
+#ifdef CONFIG_NON_SWAP
+static void
+clear_page_non_swap(struct page *page)
+{
+	struct zone *zone;
+	struct lruvec *lruvec;
+
+	if (!PageLRU(page) || !page_evictable(page))
+		return;
+
+	zone = page_zone(page);
+	spin_lock_irq(zone_lru_lock(zone));
+	__dec_zone_page_state(page, NR_NON_SWAP);
+	lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
+	del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
+	add_page_to_lru_list(page, lruvec, page_lru(page));
+	spin_unlock_irq(zone_lru_lock(zone));
+}
+#endif
+
 /*
  * This routine handles present pages, when users try to write
  * to a shared page. It is done by copying the page to a new address
@@ -2400,6 +2421,10 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte)
 			put_page(old_page);
 		}
 		if (reuse_swap_page(old_page, &total_mapcount)) {
+#ifdef CONFIG_NON_SWAP
+			if (unlikely(TestClearPageNonSwap(old_page)))
+				clear_page_non_swap(old_page);
+#endif
 			if (total_mapcount == 1) {
 				/*
 				 * The page is all ours. Move it to
@@ -2581,6 +2606,11 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
 		goto out_release;
 	}
 
+#ifdef CONFIG_NON_SWAP
+	if ((fe->flags & FAULT_FLAG_WRITE) && unlikely(TestClearPageNonSwap(page)))
+		clear_page_non_swap(page);
+#endif
+
 	/*
 	 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
 	 * release the swapcache from under us.  The page pin, and pte_same
@@ -2638,6 +2668,10 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
 	flush_icache_page(vma, page);
 	if (pte_swp_soft_dirty(orig_pte))
 		pte = pte_mksoft_dirty(pte);
+#ifdef CONFIG_NON_SWAP
+	if (!(fe->flags & FAULT_FLAG_WRITE) && PageNonSwap(page))
+		pte = pte_wrprotect(pte);
+#endif
 	set_pte_at(vma->vm_mm, fe->address, fe->pte, pte);
 	if (page == swapcache) {
 		do_page_add_anon_rmap(page, vma, fe->address, exclusive);
diff --git a/mm/migrate.c b/mm/migrate.c
index f7ee04a..46ac926 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -640,6 +640,10 @@ void migrate_page_copy(struct page *newpage, struct page *page)
 		SetPageChecked(newpage);
 	if (PageMappedToDisk(page))
 		SetPageMappedToDisk(newpage);
+#ifdef CONFIG_NON_SWAP
+	if (TestClearPageNonSwap(page))
+		SetPageNonSwap(newpage);
+#endif
 
 	/* Move dirty on pages not done by migrate_page_move_mapping() */
 	if (PageDirty(page))
diff --git a/mm/mprotect.c b/mm/mprotect.c
index a4830f0..6539c6e 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -79,6 +79,9 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		if (pte_present(oldpte)) {
 			pte_t ptent;
 			bool preserve_write = prot_numa && pte_write(oldpte);
+#ifdef CONFIG_NON_SWAP
+			struct page *page;
+#endif
 
 			/*
 			 * Avoid trapping faults against the zero or KSM
@@ -107,6 +110,11 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 					 !(vma->vm_flags & VM_SOFTDIRTY))) {
 				ptent = pte_mkwrite(ptent);
 			}
+#ifdef CONFIG_NON_SWAP
+			page = vm_normal_page(vma, addr, oldpte);
+			if (page && PageNonSwap(page))
+				ptent = pte_wrprotect(ptent);
+#endif
 			ptep_modify_prot_commit(mm, addr, pte, ptent);
 			pages++;
 		} else if (IS_ENABLED(CONFIG_MIGRATION)) {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 32fef7d..14d49cd 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -758,14 +758,38 @@ redo:
 	ClearPageUnevictable(page);
 
 	if (page_evictable(page)) {
+#ifdef CONFIG_NON_SWAP
+		bool added = false;
+
+		if (unlikely(PageNonSwap(page))) {
+			struct zone *zone = page_zone(page);
+
+			BUG_ON(irqs_disabled());
+
+			spin_lock_irq(zone_lru_lock(zone));
+			if (likely(PageNonSwap(page))) {
+				struct lruvec *lruvec;
+
+				lruvec = mem_cgroup_page_lruvec(page,
+							zone->zone_pgdat);
+				SetPageLRU(page);
+				add_page_to_lru_list(page, lruvec,
+						     LRU_UNEVICTABLE);
+				added = true;
+			}
+			spin_unlock_irq(zone_lru_lock(zone));
+		}
+
 		/*
 		 * For evictable pages, we can use the cache.
 		 * In event of a race, worst case is we end up with an
 		 * unevictable page on [in]active list.
 		 * We know how to handle that.
 		 */
+		if (!added)
+#endif
+			lru_cache_add(page);
 		is_unevictable = false;
-		lru_cache_add(page);
 	} else {
 		/*
 		 * Put unevictable pages directly on zone's unevictable
@@ -1199,6 +1223,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 					if (PageDirty(page))
 						goto keep_locked;
 
+#ifdef CONFIG_NON_SWAP
+					if (PageNonSwap(page)) {
+						try_to_free_swap(page);
+						unlock_page(page);
+						goto non_swap_keep;
+					}
+#endif
+
 					if (page_mapped(page) && mapping)
 						TRY_TO_UNMAP(page, ttu_flags);
 				}
@@ -1281,6 +1313,9 @@ cull_mlocked:
 		if (PageSwapCache(page))
 			try_to_free_swap(page);
 		unlock_page(page);
+#ifdef CONFIG_NON_SWAP
+		ClearPageNonSwap(page);
+#endif
 		list_add(&page->lru, &ret_pages);
 		continue;
 
@@ -1294,6 +1329,10 @@ activate_locked:
 keep_locked:
 		unlock_page(page);
 keep:
+#ifdef CONFIG_NON_SWAP
+		ClearPageNonSwap(page);
+non_swap_keep:
+#endif
 		list_add(&page->lru, &ret_pages);
 		VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
 	}
-- 
1.9.1
