Splitting a PMD that maps the huge zero page can be simplified a lot:
we can just unmap it and fall back to PTE handling.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx>
---
 mm/huge_memory.c | 57 ++++--------------------------------------------
 1 file changed, 4 insertions(+), 53 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 42407e16bd80..ef6a6bcb291f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2114,40 +2114,6 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
 }
 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 
-static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
-		unsigned long haddr, pmd_t *pmd)
-{
-	struct mm_struct *mm = vma->vm_mm;
-	pgtable_t pgtable;
-	pmd_t _pmd;
-	int i;
-
-	/*
-	 * Leave pmd empty until pte is filled note that it is fine to delay
-	 * notification until mmu_notifier_invalidate_range_end() as we are
-	 * replacing a zero pmd write protected page with a zero pte write
-	 * protected page.
-	 *
-	 * See Documentation/vm/mmu_notifier.rst
-	 */
-	pmdp_huge_clear_flush(vma, haddr, pmd);
-
-	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
-	pmd_populate(mm, &_pmd, pgtable);
-
-	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
-		pte_t *pte, entry;
-		entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
-		entry = pte_mkspecial(entry);
-		pte = pte_offset_map(&_pmd, haddr);
-		VM_BUG_ON(!pte_none(*pte));
-		set_pte_at(mm, haddr, pte, entry);
-		pte_unmap(pte);
-	}
-	smp_wmb(); /* make pte visible before pmd */
-	pmd_populate(mm, pmd, pgtable);
-}
-
 static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long haddr, bool freeze)
 {
@@ -2167,7 +2133,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 
 	count_vm_event(THP_SPLIT_PMD);
 
-	if (!vma_is_anonymous(vma)) {
+	if (!vma_is_anonymous(vma) || is_huge_zero_pmd(*pmd)) {
 		_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
 		/*
 		 * We are going to unmap this huge page. So
@@ -2175,7 +2141,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 		 */
 		if (arch_needs_pgtable_deposit())
 			zap_deposited_table(mm, pmd);
-		if (vma_is_dax(vma))
+		if (vma_is_dax(vma) || is_huge_zero_pmd(*pmd))
 			return;
 		page = pmd_page(_pmd);
 		if (!PageDirty(page) && pmd_dirty(_pmd))
@@ -2186,17 +2152,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 		put_page(page);
 		add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
 		return;
-	} else if (is_huge_zero_pmd(*pmd)) {
-		/*
-		 * FIXME: Do we want to invalidate secondary mmu by calling
-		 * mmu_notifier_invalidate_range() see comments below inside
-		 * __split_huge_pmd() ?
-		 *
-		 * We are going from a zero huge page write protected to zero
-		 * small page also write protected so it does not seems useful
-		 * to invalidate secondary mmu at this time.
-		 */
-		return __split_huge_zero_page_pmd(vma, haddr, pmd);
 	}
 
 	/*
@@ -2339,13 +2294,9 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 	spin_unlock(ptl);
 	/*
 	 * No need to double call mmu_notifier->invalidate_range() callback.
-	 * They are 3 cases to consider inside __split_huge_pmd_locked():
+	 * There are 2 cases to consider inside __split_huge_pmd_locked():
 	 *    1) pmdp_huge_clear_flush_notify() call invalidate_range() obvious
-	 *    2) __split_huge_zero_page_pmd() read only zero page and any write
-	 *       fault will trigger a flush_notify before pointing to a new page
-	 *       (it is fine if the secondary mmu keeps pointing to the old zero
-	 *       page in the meantime)
-	 *    3) Split a huge pmd into pte pointing to the same page. No need
+	 *    2) Split a huge pmd into pte pointing to the same page. No need
 	 *       to invalidate secondary tlb entry they are all still valid.
 	 *       any further changes to individual pte will notify. So no need
 	 *       to call mmu_notifier->invalidate_range()
-- 
2.26.0
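
P.S. For anyone who wants to poke at the path this touches, here is a
minimal userspace sketch (illustrative only, not part of the patch). It
assumes a 2 MiB PMD and 4 KiB base pages (the x86-64 defaults) and that
the huge zero page is in use (/sys/kernel/mm/transparent_hugepage/use_zero_page).
A read fault on a MADV_HUGEPAGE region maps the huge zero page; an
mprotect() on a sub-PMD range then forces __split_huge_pmd(), which with
this patch simply unmaps the zero PMD, so the next read refaults at PTE
level instead of finding pre-built zero PTEs.

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

#define PMD_SZ	(2UL << 20)	/* assumed PMD size: 2 MiB (x86-64) */
#define PG_SZ	4096UL		/* assumed base page size */

int main(void)
{
	/* Over-allocate so a PMD-aligned window can be carved out. */
	char *raw = mmap(NULL, 2 * PMD_SZ, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (raw == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	char *p = (char *)(((unsigned long)raw + PMD_SZ - 1) & ~(PMD_SZ - 1));

	/* Hint THP so the read fault below can map the huge zero page. */
	if (madvise(p, PMD_SZ, MADV_HUGEPAGE))
		perror("madvise");	/* non-fatal: THP may be disabled */

	volatile char c = p[0];	/* read fault: huge zero PMD, if enabled */

	/*
	 * Changing protection on a sub-PMD range forces __split_huge_pmd().
	 * With the patch the zero PMD is just cleared instead of being
	 * rewritten as 512 zero PTEs.
	 */
	if (mprotect(p, PG_SZ, PROT_READ)) {
		perror("mprotect");
		return 1;
	}
	c = p[PG_SZ];	/* refaults at PTE level; still reads zero */

	printf("%d %d (both should be 0)\n", p[0], c);
	return 0;
}

The trade-off of the simplification is one extra minor fault per base
page touched after a split, which seems cheap next to the subtle
page-table manipulation code it removes.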