+ mm-thp-swap-swapin-a-thp-as-a-whole.patch added to -mm tree

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



The patch titled
     Subject: mm, THP, swap: swapin a THP as a whole
has been added to the -mm tree.  Its filename is
     mm-thp-swap-swapin-a-thp-as-a-whole.patch

This patch should soon appear at
    http://ozlabs.org/~akpm/mmots/broken-out/mm-thp-swap-swapin-a-thp-as-a-whole.patch
and later at
    http://ozlabs.org/~akpm/mmotm/broken-out/mm-thp-swap-swapin-a-thp-as-a-whole.patch

Before you just go and hit "reply", please:
   a) Consider who else should be cc'ed
   b) Prefer to cc a suitable mailing list as well
   c) Ideally: find the original patch on the mailing list and do a
      reply-to-all to that, adding suitable additional cc's

*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***

The -mm tree is included into linux-next and is updated
there every 3-4 working days

------------------------------------------------------
From: Huang Ying <ying.huang@xxxxxxxxx>
Subject: mm, THP, swap: swapin a THP as a whole

With this patch, when page fault handler find a PMD swap mapping, it will
swap in a THP as a whole.  This avoids the overhead of
splitting/collapsing before/after the THP swapping.  And improves the swap
performance greatly for reduced page fault count etc.

do_huge_pmd_swap_page() is added in the patch to implement this.  It is
similar to do_swap_page() for normal page swapin.

If failing to allocate a THP, the huge swap cluster and the PMD swap
mapping will be split to fallback to normal page swapin.

If the huge swap cluster has been split already, the PMD swap mapping will
be split to fallback to normal page swapin.

Link: http://lkml.kernel.org/r/20180622035151.6676-10-ying.huang@xxxxxxxxx
Signed-off-by: "Huang, Ying" <ying.huang@xxxxxxxxx>
Cc: "Kirill A. Shutemov" <kirill.shutemov@xxxxxxxxxxxxxxx>
Cc: Andrea Arcangeli <aarcange@xxxxxxxxxx>
Cc: Michal Hocko <mhocko@xxxxxxxx>
Cc: Johannes Weiner <hannes@xxxxxxxxxxx>
Cc: Shaohua Li <shli@xxxxxxxxxx>
Cc: Hugh Dickins <hughd@xxxxxxxxxx>
Cc: Minchan Kim <minchan@xxxxxxxxxx>
Cc: Rik van Riel <riel@xxxxxxxxxx>
Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
Cc: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx>
Cc: Zi Yan <zi.yan@xxxxxxxxxxxxxx>
Cc: Daniel Jordan <daniel.m.jordan@xxxxxxxxxx>
Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
---


diff -puN include/linux/huge_mm.h~mm-thp-swap-swapin-a-thp-as-a-whole include/linux/huge_mm.h
--- a/include/linux/huge_mm.h~mm-thp-swap-swapin-a-thp-as-a-whole
+++ a/include/linux/huge_mm.h
@@ -402,4 +402,13 @@ static inline gfp_t alloc_hugepage_direc
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
+#ifdef CONFIG_THP_SWAP
+extern int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd);
+#else /* CONFIG_THP_SWAP */
+static inline int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd)
+{
+	return 0;
+}
+#endif /* CONFIG_THP_SWAP */
+
 #endif /* _LINUX_HUGE_MM_H */
diff -puN include/linux/swap.h~mm-thp-swap-swapin-a-thp-as-a-whole include/linux/swap.h
--- a/include/linux/swap.h~mm-thp-swap-swapin-a-thp-as-a-whole
+++ a/include/linux/swap.h
@@ -560,6 +560,15 @@ static inline struct page *lookup_swap_c
 	return NULL;
 }
 
+static inline struct page *read_swap_cache_async(swp_entry_t swp,
+						 gfp_t gft_mask,
+						 struct vm_area_struct *vma,
+						 unsigned long addr,
+						 bool do_poll)
+{
+	return NULL;
+}
+
 static inline int add_to_swap(struct page *page)
 {
 	return 0;
diff -puN mm/huge_memory.c~mm-thp-swap-swapin-a-thp-as-a-whole mm/huge_memory.c
--- a/mm/huge_memory.c~mm-thp-swap-swapin-a-thp-as-a-whole
+++ a/mm/huge_memory.c
@@ -33,6 +33,8 @@
 #include <linux/page_idle.h>
 #include <linux/shmem_fs.h>
 #include <linux/oom.h>
+#include <linux/delayacct.h>
+#include <linux/swap.h>
 
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
@@ -1610,6 +1612,174 @@ static void __split_huge_swap_pmd(struct
 	smp_wmb(); /* make pte visible before pmd */
 	pmd_populate(mm, pmd, pgtable);
 }
+
+static int split_huge_swap_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+			       unsigned long address, pmd_t orig_pmd)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	spinlock_t *ptl;
+	int ret = 0;
+
+	ptl = pmd_lock(mm, pmd);
+	if (pmd_same(*pmd, orig_pmd))
+		__split_huge_swap_pmd(vma, address & HPAGE_PMD_MASK, pmd);
+	else
+		ret = -ENOENT;
+	spin_unlock(ptl);
+
+	return ret;
+}
+
+int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd)
+{
+	struct page *page;
+	struct mem_cgroup *memcg;
+	struct vm_area_struct *vma = vmf->vma;
+	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+	swp_entry_t entry;
+	pmd_t pmd;
+	int i, locked, exclusive = 0, ret = 0;
+
+	entry = pmd_to_swp_entry(orig_pmd);
+	VM_BUG_ON(non_swap_entry(entry));
+	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
+retry:
+	page = lookup_swap_cache(entry, NULL, vmf->address);
+	if (!page) {
+		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, vma,
+					     haddr, false);
+		if (!page) {
+			/*
+			 * Back out if somebody else faulted in this pmd
+			 * while we released the pmd lock.
+			 */
+			if (likely(pmd_same(*vmf->pmd, orig_pmd))) {
+				ret = split_swap_cluster(entry, false);
+				/*
+				 * Retry if somebody else swap in the swap
+				 * entry
+				 */
+				if (ret == -EEXIST) {
+					ret = 0;
+					goto retry;
+				/* swapoff occurs under us */
+				} else if (ret == -EINVAL)
+					ret = 0;
+				else
+					goto fallback;
+			}
+			delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+			goto out;
+		}
+
+		/* Had to read the page from swap area: Major fault */
+		ret = VM_FAULT_MAJOR;
+		count_vm_event(PGMAJFAULT);
+		count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
+	} else if (!PageTransCompound(page))
+		goto fallback;
+
+	locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
+
+	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+	if (!locked) {
+		ret |= VM_FAULT_RETRY;
+		goto out_release;
+	}
+
+	/*
+	 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
+	 * release the swapcache from under us.  The page pin, and pmd_same
+	 * test below, are not enough to exclude that.  Even if it is still
+	 * swapcache, we need to check that the page's swap has not changed.
+	 */
+	if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
+		goto out_page;
+
+	if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
+				  &memcg, true)) {
+		ret = VM_FAULT_OOM;
+		goto out_page;
+	}
+
+	/*
+	 * Back out if somebody else already faulted in this pmd.
+	 */
+	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
+	spin_lock(vmf->ptl);
+	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
+		goto out_nomap;
+
+	if (unlikely(!PageUptodate(page))) {
+		ret = VM_FAULT_SIGBUS;
+		goto out_nomap;
+	}
+
+	/*
+	 * The page isn't present yet, go ahead with the fault.
+	 *
+	 * Be careful about the sequence of operations here.
+	 * To get its accounting right, reuse_swap_page() must be called
+	 * while the page is counted on swap but not yet in mapcount i.e.
+	 * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
+	 * must be called after the swap_free(), or it will never succeed.
+	 */
+
+	add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+	add_mm_counter(vma->vm_mm, MM_SWAPENTS, -HPAGE_PMD_NR);
+	pmd = mk_huge_pmd(page, vma->vm_page_prot);
+	if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
+		pmd = maybe_pmd_mkwrite(pmd_mkdirty(pmd), vma);
+		vmf->flags &= ~FAULT_FLAG_WRITE;
+		ret |= VM_FAULT_WRITE;
+		exclusive = RMAP_EXCLUSIVE;
+	}
+	for (i = 0; i < HPAGE_PMD_NR; i++)
+		flush_icache_page(vma, page + i);
+	if (pmd_swp_soft_dirty(orig_pmd))
+		pmd = pmd_mksoft_dirty(pmd);
+	do_page_add_anon_rmap(page, vma, haddr,
+			      exclusive | RMAP_COMPOUND);
+	mem_cgroup_commit_charge(page, memcg, true, true);
+	activate_page(page);
+	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
+
+	swap_free(entry, true);
+	if (mem_cgroup_swap_full(page) ||
+	    (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
+		try_to_free_swap(page);
+	unlock_page(page);
+
+	if (vmf->flags & FAULT_FLAG_WRITE) {
+		ret |= do_huge_pmd_wp_page(vmf, pmd);
+		if (ret & VM_FAULT_ERROR)
+			ret &= VM_FAULT_ERROR;
+		goto out;
+	}
+
+	/* No need to invalidate - it was non-present before */
+	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
+	spin_unlock(vmf->ptl);
+out:
+	return ret;
+out_nomap:
+	mem_cgroup_cancel_charge(page, memcg, true);
+	spin_unlock(vmf->ptl);
+out_page:
+	unlock_page(page);
+out_release:
+	put_page(page);
+	return ret;
+fallback:
+	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+	if (!split_huge_swap_pmd(vmf->vma, vmf->pmd, vmf->address, orig_pmd))
+		ret = VM_FAULT_FALLBACK;
+	else
+		ret = 0;
+	if (page)
+		put_page(page);
+	return ret;
+}
 #else
 static inline void __split_huge_swap_pmd(struct vm_area_struct *vma,
 					 unsigned long haddr,
diff -puN mm/memory.c~mm-thp-swap-swapin-a-thp-as-a-whole mm/memory.c
--- a/mm/memory.c~mm-thp-swap-swapin-a-thp-as-a-whole
+++ a/mm/memory.c
@@ -4078,13 +4078,17 @@ static int __handle_mm_fault(struct vm_a
 
 		barrier();
 		if (unlikely(is_swap_pmd(orig_pmd))) {
-			VM_BUG_ON(thp_migration_supported() &&
-					  !is_pmd_migration_entry(orig_pmd));
-			if (is_pmd_migration_entry(orig_pmd))
+			if (thp_migration_supported() &&
+			    is_pmd_migration_entry(orig_pmd)) {
 				pmd_migration_entry_wait(mm, vmf.pmd);
-			return 0;
-		}
-		if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
+				return 0;
+			} else if (thp_swap_supported()) {
+				ret = do_huge_pmd_swap_page(&vmf, orig_pmd);
+				if (!(ret & VM_FAULT_FALLBACK))
+					return ret;
+			} else
+				VM_BUG_ON(1);
+		} else if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
 			if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
 				return do_huge_pmd_numa_page(&vmf, orig_pmd);
 
_

Patches currently in -mm which might be from ying.huang@xxxxxxxxx are

mm-clear_huge_page-move-order-algorithm-into-a-separate-function.patch
mm-huge-page-copy-target-sub-page-last-when-copy-huge-page.patch
mm-hugetlbfs-rename-address-to-haddr-in-hugetlb_cow.patch
mm-hugetlbfs-pass-fault-address-to-cow-handler.patch
mm-swap-fix-race-between-swapoff-and-some-swap-operations.patch
mm-swap-fix-race-between-swapoff-and-some-swap-operations-v6.patch
mm-fix-race-between-swapoff-and-mincore.patch
mm-thp-swap-enable-pmd-swap-operations-for-config_thp_swap.patch
mm-thp-swap-make-config_thp_swap-depends-on-config_swap.patch
mm-thp-swap-support-pmd-swap-mapping-in-swap_duplicate.patch
mm-thp-swap-support-pmd-swap-mapping-in-swapcache_free_cluster.patch
mm-thp-swap-support-pmd-swap-mapping-in-free_swap_and_cache-swap_free.patch
mm-thp-swap-support-pmd-swap-mapping-when-splitting-huge-pmd.patch
mm-thp-swap-support-pmd-swap-mapping-in-split_swap_cluster.patch
mm-thp-swap-support-to-read-a-huge-swap-cluster-for-swapin-a-thp.patch
mm-thp-swap-swapin-a-thp-as-a-whole.patch
mm-thp-swap-support-to-count-thp-swapin-and-its-fallback.patch
mm-thp-swap-add-sysfs-interface-to-configure-thp-swapin.patch
mm-thp-swap-support-pmd-swap-mapping-in-swapoff.patch
mm-thp-swap-support-pmd-swap-mapping-in-madvise_free.patch
mm-cgroup-thp-swap-support-to-move-swap-account-for-pmd-swap-mapping.patch
mm-thp-swap-support-to-copy-pmd-swap-mapping-when-fork.patch
mm-thp-swap-free-pmd-swap-mapping-when-zap_huge_pmd.patch
mm-thp-swap-support-pmd-swap-mapping-for-madv_willneed.patch
mm-thp-swap-support-pmd-swap-mapping-in-mincore.patch
mm-thp-swap-support-pmd-swap-mapping-in-common-path.patch
mm-thp-swap-create-pmd-swap-mapping-when-unmap-the-thp.patch
mm-thp-avoid-to-split-thp-when-reclaim-madv_free-thp.patch

--
To unsubscribe from this list: send the line "unsubscribe mm-commits" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [Kernel Archive]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]

  Powered by Linux