Splitting a PMD that maps the huge zero page can be simplified a lot:
we can just unmap it and fall back to PTE handling.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx>
---
 mm/huge_memory.c | 57 ++++--------------------------------------------
 1 file changed, 4 insertions(+), 53 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 42407e16bd80..ef6a6bcb291f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2114,40 +2114,6 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
 }
 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 
-static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
-		unsigned long haddr, pmd_t *pmd)
-{
-	struct mm_struct *mm = vma->vm_mm;
-	pgtable_t pgtable;
-	pmd_t _pmd;
-	int i;
-
-	/*
-	 * Leave pmd empty until pte is filled note that it is fine to delay
-	 * notification until mmu_notifier_invalidate_range_end() as we are
-	 * replacing a zero pmd write protected page with a zero pte write
-	 * protected page.
-	 *
-	 * See Documentation/vm/mmu_notifier.rst
-	 */
-	pmdp_huge_clear_flush(vma, haddr, pmd);
-
-	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
-	pmd_populate(mm, &_pmd, pgtable);
-
-	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
-		pte_t *pte, entry;
-		entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
-		entry = pte_mkspecial(entry);
-		pte = pte_offset_map(&_pmd, haddr);
-		VM_BUG_ON(!pte_none(*pte));
-		set_pte_at(mm, haddr, pte, entry);
-		pte_unmap(pte);
-	}
-	smp_wmb(); /* make pte visible before pmd */
-	pmd_populate(mm, pmd, pgtable);
-}
-
 static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long haddr, bool freeze)
 {
@@ -2167,7 +2133,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 
 	count_vm_event(THP_SPLIT_PMD);
 
-	if (!vma_is_anonymous(vma)) {
+	if (!vma_is_anonymous(vma) || is_huge_zero_pmd(*pmd)) {
 		_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
 		/*
 		 * We are going to unmap this huge page. So
@@ -2175,7 +2141,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 		 */
 		if (arch_needs_pgtable_deposit())
 			zap_deposited_table(mm, pmd);
-		if (vma_is_dax(vma))
+		if (vma_is_dax(vma) || is_huge_zero_pmd(*pmd))
 			return;
 		page = pmd_page(_pmd);
 		if (!PageDirty(page) && pmd_dirty(_pmd))
@@ -2186,17 +2152,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 		put_page(page);
 		add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
 		return;
-	} else if (is_huge_zero_pmd(*pmd)) {
-		/*
-		 * FIXME: Do we want to invalidate secondary mmu by calling
-		 * mmu_notifier_invalidate_range() see comments below inside
-		 * __split_huge_pmd() ?
-		 *
-		 * We are going from a zero huge page write protected to zero
-		 * small page also write protected so it does not seems useful
-		 * to invalidate secondary mmu at this time.
-		 */
-		return __split_huge_zero_page_pmd(vma, haddr, pmd);
 	}
 
 	/*
@@ -2339,13 +2294,9 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 	spin_unlock(ptl);
 	/*
 	 * No need to double call mmu_notifier->invalidate_range() callback.
-	 * They are 3 cases to consider inside __split_huge_pmd_locked():
+	 * There are 2 cases to consider inside __split_huge_pmd_locked():
 	 *    1) pmdp_huge_clear_flush_notify() call invalidate_range() obvious
-	 *    2) __split_huge_zero_page_pmd() read only zero page and any write
-	 *       fault will trigger a flush_notify before pointing to a new page
-	 *       (it is fine if the secondary mmu keeps pointing to the old zero
-	 *       page in the meantime)
-	 *    3) Split a huge pmd into pte pointing to the same page. No need
+	 *    2) Split a huge pmd into pte pointing to the same page. No need
 	 *       to invalidate secondary tlb entry they are all still valid.
 	 *       any further changes to individual pte will notify. So no need
 	 *       to call mmu_notifier->invalidate_range()
-- 
2.26.0
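
P.S. For anyone who wants to poke at the path this touches, here is a
minimal userspace sketch (illustrative only, not part of the patch). It
assumes a 2 MiB PMD and 4 KiB base pages (the x86-64 defaults) and that
the huge zero page is in use (/sys/kernel/mm/transparent_hugepage/use_zero_page).
A read fault on a MADV_HUGEPAGE region maps the huge zero page; an
mprotect() on a sub-PMD range then forces __split_huge_pmd(), which with
this patch simply unmaps the zero PMD, so the next read refaults at PTE
level instead of finding pre-built zero PTEs.

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

#define PMD_SZ	(2UL << 20)	/* assumed PMD size: 2 MiB (x86-64) */
#define PG_SZ	4096UL		/* assumed base page size */

int main(void)
{
	/* Over-allocate so a PMD-aligned window can be carved out. */
	char *raw = mmap(NULL, 2 * PMD_SZ, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (raw == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	char *p = (char *)(((unsigned long)raw + PMD_SZ - 1) & ~(PMD_SZ - 1));

	/* Hint THP so the read fault below can map the huge zero page. */
	if (madvise(p, PMD_SZ, MADV_HUGEPAGE))
		perror("madvise");	/* non-fatal: THP may be disabled */

	volatile char c = p[0];	/* read fault: huge zero PMD, if enabled */

	/*
	 * Changing protection on a sub-PMD range forces __split_huge_pmd().
	 * With the patch the zero PMD is just cleared instead of being
	 * rewritten as 512 zero PTEs.
	 */
	if (mprotect(p, PG_SZ, PROT_READ)) {
		perror("mprotect");
		return 1;
	}
	c = p[PG_SZ];	/* refaults at PTE level; still reads zero */

	printf("%d %d (both should be 0)\n", p[0], c);
	return 0;
}

The trade-off of the simplification is one extra minor fault per base
page touched after a split, which seems cheap next to the subtle
page-table manipulation code it removes.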