On Mon, 16 Oct 2017 23:10:02 -0400 jglisse@xxxxxxxxxx wrote: > From: Jérôme Glisse <jglisse@xxxxxxxxxx> > > + /* > + * No need to call mmu_notifier_invalidate_range() as we are > + * downgrading page table protection not changing it to point > + * to a new page. > + * > + * See Documentation/vm/mmu_notifier.txt > + */ > if (pmdp) { > #ifdef CONFIG_FS_DAX_PMD > pmd_t pmd; > @@ -628,7 +635,6 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, > pmd = pmd_wrprotect(pmd); > pmd = pmd_mkclean(pmd); > set_pmd_at(vma->vm_mm, address, pmdp, pmd); > - mmu_notifier_invalidate_range(vma->vm_mm, start, end); Could the secondary TLB still see the mapping as dirty and propagate the dirty bit back? > unlock_pmd: > spin_unlock(ptl); > #endif > @@ -643,7 +649,6 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, > pte = pte_wrprotect(pte); > pte = pte_mkclean(pte); > set_pte_at(vma->vm_mm, address, ptep, pte); > - mmu_notifier_invalidate_range(vma->vm_mm, start, end); Ditto > unlock_pte: > pte_unmap_unlock(ptep, ptl); > } > diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h > index 6866e8126982..49c925c96b8a 100644 > --- a/include/linux/mmu_notifier.h > +++ b/include/linux/mmu_notifier.h > @@ -155,7 +155,8 @@ struct mmu_notifier_ops { > * shared page-tables, it not necessary to implement the > * invalidate_range_start()/end() notifiers, as > * invalidate_range() alread catches the points in time when an > - * external TLB range needs to be flushed. > + * external TLB range needs to be flushed. For more in depth > + * discussion on this see Documentation/vm/mmu_notifier.txt > * > * The invalidate_range() function is called under the ptl > * spin-lock and not allowed to sleep. > diff --git a/mm/huge_memory.c b/mm/huge_memory.c > index c037d3d34950..ff5bc647b51d 100644 > --- a/mm/huge_memory.c > +++ b/mm/huge_memory.c > @@ -1186,8 +1186,15 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd, > goto out_free_pages; > VM_BUG_ON_PAGE(!PageHead(page), page); > > + /* > + * Leave pmd empty until pte is filled note we must notify here as > + * concurrent CPU thread might write to new page before the call to > + * mmu_notifier_invalidate_range_end() happens which can lead to a > + * device seeing memory write in different order than CPU. > + * > + * See Documentation/vm/mmu_notifier.txt > + */ > pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); > - /* leave pmd empty until pte is filled */ > > pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd); > pmd_populate(vma->vm_mm, &_pmd, pgtable); > @@ -2026,8 +2033,15 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, > pmd_t _pmd; > int i; > > - /* leave pmd empty until pte is filled */ > - pmdp_huge_clear_flush_notify(vma, haddr, pmd); > + /* > + * Leave pmd empty until pte is filled note that it is fine to delay > + * notification until mmu_notifier_invalidate_range_end() as we are > + * replacing a zero pmd write protected page with a zero pte write > + * protected page. > + * > + * See Documentation/vm/mmu_notifier.txt > + */ > + pmdp_huge_clear_flush(vma, haddr, pmd); Shouldn't the secondary TLB know if the page size changed? > > pgtable = pgtable_trans_huge_withdraw(mm, pmd); > pmd_populate(mm, &_pmd, pgtable); > diff --git a/mm/hugetlb.c b/mm/hugetlb.c > index 1768efa4c501..63a63f1b536c 100644 > --- a/mm/hugetlb.c > +++ b/mm/hugetlb.c > @@ -3254,9 +3254,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, > set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz); > } else { > if (cow) { > + /* > + * No need to notify as we are downgrading page > + * table protection not changing it to point > + * to a new page. > + * > + * See Documentation/vm/mmu_notifier.txt > + */ > huge_ptep_set_wrprotect(src, addr, src_pte); OK.. so we could get write faults on write accesses from the device. > - mmu_notifier_invalidate_range(src, mmun_start, > - mmun_end); > } > entry = huge_ptep_get(src_pte); > ptepage = pte_page(entry); > @@ -4288,7 +4293,12 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, > * and that page table be reused and filled with junk. > */ > flush_hugetlb_tlb_range(vma, start, end); > - mmu_notifier_invalidate_range(mm, start, end); > + /* > + * No need to call mmu_notifier_invalidate_range() we are downgrading > + * page table protection not changing it to point to a new page. > + * > + * See Documentation/vm/mmu_notifier.txt > + */ > i_mmap_unlock_write(vma->vm_file->f_mapping); > mmu_notifier_invalidate_range_end(mm, start, end); > > diff --git a/mm/ksm.c b/mm/ksm.c > index 6cb60f46cce5..be8f4576f842 100644 > --- a/mm/ksm.c > +++ b/mm/ksm.c > @@ -1052,8 +1052,13 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, > * So we clear the pte and flush the tlb before the check > * this assure us that no O_DIRECT can happen after the check > * or in the middle of the check. > + * > + * No need to notify as we are downgrading page table to read > + * only not changing it to point to a new page. > + * > + * See Documentation/vm/mmu_notifier.txt > */ > - entry = ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte); > + entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte); > /* > * Check that no O_DIRECT or similar I/O is in progress on the > * page > @@ -1136,7 +1141,13 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, > } > > flush_cache_page(vma, addr, pte_pfn(*ptep)); > - ptep_clear_flush_notify(vma, addr, ptep); > + /* > + * No need to notify as we are replacing a read only page with another > + * read only page with the same content. > + * > + * See Documentation/vm/mmu_notifier.txt > + */ > + ptep_clear_flush(vma, addr, ptep); > set_pte_at_notify(mm, addr, ptep, newpte); > > page_remove_rmap(page, false); > diff --git a/mm/rmap.c b/mm/rmap.c > index 061826278520..6b5a0f219ac0 100644 > --- a/mm/rmap.c > +++ b/mm/rmap.c > @@ -937,10 +937,15 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, > #endif > } > > - if (ret) { > - mmu_notifier_invalidate_range(vma->vm_mm, cstart, cend); > + /* > + * No need to call mmu_notifier_invalidate_range() as we are > + * downgrading page table protection not changing it to point > + * to a new page. > + * > + * See Documentation/vm/mmu_notifier.txt > + */ > + if (ret) > (*cleaned)++; > - } > } > > mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); > @@ -1424,6 +1429,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, > if (pte_soft_dirty(pteval)) > swp_pte = pte_swp_mksoft_dirty(swp_pte); > set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); > + /* > + * No need to invalidate here it will synchronize on > + * against the special swap migration pte. > + */ > goto discard; > } > > @@ -1481,6 +1490,9 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, > * will take care of the rest. > */ > dec_mm_counter(mm, mm_counter(page)); > + /* We have to invalidate as we cleared the pte */ > + mmu_notifier_invalidate_range(mm, address, > + address + PAGE_SIZE); > } else if (IS_ENABLED(CONFIG_MIGRATION) && > (flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) { > swp_entry_t entry; > @@ -1496,6 +1508,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, > if (pte_soft_dirty(pteval)) > swp_pte = pte_swp_mksoft_dirty(swp_pte); > set_pte_at(mm, address, pvmw.pte, swp_pte); > + /* > + * No need to invalidate here it will synchronize on > + * against the special swap migration pte. > + */ > } else if (PageAnon(page)) { > swp_entry_t entry = { .val = page_private(subpage) }; > pte_t swp_pte; > @@ -1507,6 +1523,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, > WARN_ON_ONCE(1); > ret = false; > /* We have to invalidate as we cleared the pte */ > + mmu_notifier_invalidate_range(mm, address, > + address + PAGE_SIZE); > page_vma_mapped_walk_done(&pvmw); > break; > } > @@ -1514,6 +1532,9 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, > /* MADV_FREE page check */ > if (!PageSwapBacked(page)) { > if (!PageDirty(page)) { > + /* Invalidate as we cleared the pte */ > + mmu_notifier_invalidate_range(mm, > + address, address + PAGE_SIZE); > dec_mm_counter(mm, MM_ANONPAGES); > goto discard; > } > @@ -1547,13 +1568,39 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, > if (pte_soft_dirty(pteval)) > swp_pte = pte_swp_mksoft_dirty(swp_pte); > set_pte_at(mm, address, pvmw.pte, swp_pte); > - } else > + /* Invalidate as we cleared the pte */ > + mmu_notifier_invalidate_range(mm, address, > + address + PAGE_SIZE); > + } else { > + /* > + * We should not need to notify here as we reach this > + * case only from freeze_page() itself only call from > + * split_huge_page_to_list() so everything below must > + * be true: > + * - page is not anonymous > + * - page is locked > + * > + * So as it is a locked file back page thus it can not > + * be remove from the page cache and replace by a new > + * page before mmu_notifier_invalidate_range_end so no > + * concurrent thread might update its page table to > + * point at new page while a device still is using this > + * page. > + * > + * See Documentation/vm/mmu_notifier.txt > + */ > dec_mm_counter(mm, mm_counter_file(page)); > + } > discard: > + /* > + * No need to call mmu_notifier_invalidate_range() it has be > + * done above for all cases requiring it to happen under page > + * table lock before mmu_notifier_invalidate_range_end() > + * > + * See Documentation/vm/mmu_notifier.txt > + */ > page_remove_rmap(subpage, PageHuge(page)); > put_page(page); > - mmu_notifier_invalidate_range(mm, address, > - address + PAGE_SIZE); > } > > mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); Looking at the patchset, I understand the efficiency, but I am concerned with correctness. Balbir Singh. -- To unsubscribe from this list: send the line "unsubscribe linux-next" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html