Ever since commit b756a3b5e7ea ("mm: device exclusive memory access") we can return with a device-exclusive entry from page_vma_mapped_walk(). try_to_unmap_one() is not prepared for that, so teach it about these non-present nonswap PTEs. Before that, could we also have triggered this case with device-private entries? Unlikely. Note that we could currently only run into this case with device-exclusive entries on THPs. For order-0 folios, we still adjust the mapcount on conversion to device-exclusive, making the rmap walk abort early (folio_mapcount() == 0 and breaking swapout). We'll fix that next, now that try_to_unmap_one() can handle it. Further note that try_to_unmap() calls MMU notifiers and holds the folio lock, so any device-exclusive users should be properly prepared for this device-exclusive PTE to "vanish". Fixes: b756a3b5e7ea ("mm: device exclusive memory access") Signed-off-by: David Hildenbrand <david@xxxxxxxxxx> --- mm/rmap.c | 53 ++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 13 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 65d9bbea16d0..12900f367a2a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1648,9 +1648,9 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); + bool anon_exclusive, ret = true; pte_t pteval; struct page *subpage; - bool anon_exclusive, ret = true; struct mmu_notifier_range range; enum ttu_flags flags = (enum ttu_flags)(long)arg; unsigned long pfn; @@ -1722,7 +1722,19 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, /* Unexpected PMD-mapped THP? */ VM_BUG_ON_FOLIO(!pvmw.pte, folio); - pfn = pte_pfn(ptep_get(pvmw.pte)); + /* + * We can end up here with selected non-swap entries that + * actually map pages similar to PROT_NONE; see + * page_vma_mapped_walk()->check_pte(). + */ + pteval = ptep_get(pvmw.pte); + if (likely(pte_present(pteval))) { + pfn = pte_pfn(pteval); + } else { + pfn = swp_offset_pfn(pte_to_swp_entry(pteval)); + VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); + } + subpage = folio_page(folio, pfn - folio_pfn(folio)); address = pvmw.address; anon_exclusive = folio_test_anon(folio) && @@ -1778,7 +1790,9 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, hugetlb_vma_unlock_write(vma); } pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); - } else { + if (pte_dirty(pteval)) + folio_mark_dirty(folio); + } else if (likely(pte_present(pteval))) { flush_cache_page(vma, address, pfn); /* Nuke the page table entry. */ if (should_defer_flush(mm, flags)) { @@ -1796,6 +1810,10 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, } else { pteval = ptep_clear_flush(vma, address, pvmw.pte); } + if (pte_dirty(pteval)) + folio_mark_dirty(folio); + } else { + pte_clear(mm, address, pvmw.pte); } /* @@ -1805,10 +1823,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, */ pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval); - /* Set the dirty flag on the folio now the pte is gone. */ - if (pte_dirty(pteval)) - folio_mark_dirty(folio); - /* Update high watermark before we lower rss */ update_hiwater_rss(mm); @@ -1822,8 +1836,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, dec_mm_counter(mm, mm_counter(folio)); set_pte_at(mm, address, pvmw.pte, pteval); } - - } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) { + } else if (likely(pte_present(pteval)) && pte_unused(pteval) && + !userfaultfd_armed(vma)) { /* * The guest indicated that the page content is of no * interest anymore. Simply discard the pte, vmscan @@ -1902,6 +1916,12 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, set_pte_at(mm, address, pvmw.pte, pteval); goto walk_abort; } + + /* + * arch_unmap_one() is expected to be a NOP on + * architectures where we could have non-swp entries + * here, so we'll not check/care. + */ if (arch_unmap_one(mm, vma, address, pteval) < 0) { swap_free(entry); set_pte_at(mm, address, pvmw.pte, pteval); @@ -1926,10 +1946,17 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, swp_pte = swp_entry_to_pte(entry); if (anon_exclusive) swp_pte = pte_swp_mkexclusive(swp_pte); - if (pte_soft_dirty(pteval)) - swp_pte = pte_swp_mksoft_dirty(swp_pte); - if (pte_uffd_wp(pteval)) - swp_pte = pte_swp_mkuffd_wp(swp_pte); + if (likely(pte_present(pteval))) { + if (pte_soft_dirty(pteval)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_uffd_wp(pteval)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); + } else { + if (pte_swp_soft_dirty(pteval)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_swp_uffd_wp(pteval)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); + } set_pte_at(mm, address, pvmw.pte, swp_pte); } else { /* -- 2.48.1