On Thu, Dec 17, 2020 at 10:22:33AM -0800, Linus Torvalds wrote: > + head = xas_find(&xas, end_pgoff); > + for (; ; head = xas_next_entry(&xas, end_pgoff)) { > + if (!head) { > + rcu_read_unlock(); > + return; > + } > + if (likely(!xas_retry(&xas, head)) > + break; > + } > > instead. So that if we don't find any cached entries, we won't do > anything, rather than take locks and then not do anything. This should do. See below. > Then that second loop very naturally becomes a "do { } while ()" one. I don't see it. I haven't found a reasonable way to rework it do-while. diff --git a/include/linux/mm.h b/include/linux/mm.h index db6ae4d3fb4e..2825153ad0d6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -534,8 +534,8 @@ struct vm_fault { * is not NULL, otherwise pmd. */ pgtable_t prealloc_pte; /* Pre-allocated pte page table. - * vm_ops->map_pages() calls - * alloc_set_pte() from atomic context. + * vm_ops->map_pages() sets up a page + * table from from atomic context. * do_fault_around() pre-allocates * page table to avoid allocation from * atomic context. @@ -972,7 +972,9 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) return pte; } -vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page); +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page); +void do_set_pte(struct vm_fault *vmf, struct page *page); + vm_fault_t finish_fault(struct vm_fault *vmf); vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); #endif diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index e237004d498d..869c1921ceda 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1259,6 +1259,17 @@ static inline int pmd_trans_unstable(pmd_t *pmd) #endif } +/* + * the ordering of these checks is important for pmds with _page_devmap set. + * if we check pmd_trans_unstable() first we will trip the bad_pmd() check + * inside of pmd_none_or_trans_huge_or_clear_bad(). this will end up correctly + * returning 1 but not before it spams dmesg with the pmd_clear_bad() output. + */ +static inline int pmd_devmap_trans_unstable(pmd_t *pmd) +{ + return pmd_devmap(*pmd) || pmd_trans_unstable(pmd); +} + #ifndef CONFIG_NUMA_BALANCING /* * Technically a PTE can be PROTNONE even when not doing NUMA balancing but diff --git a/mm/filemap.c b/mm/filemap.c index 0b2067b3c328..04975682296d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -42,6 +42,7 @@ #include <linux/psi.h> #include <linux/ramfs.h> #include <linux/page_idle.h> +#include <asm/pgalloc.h> #include "internal.h" #define CREATE_TRACE_POINTS @@ -2831,10 +2832,74 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) } EXPORT_SYMBOL(filemap_fault); +static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page, + struct xa_state *xas) +{ + struct vm_area_struct *vma = vmf->vma; + struct address_space *mapping = vma->vm_file->f_mapping; + + /* Huge page is mapped? No need to proceed. */ + if (pmd_trans_huge(*vmf->pmd)) + return true; + + if (xa_is_value(page)) + goto nohuge; + + if (!pmd_none(*vmf->pmd)) + goto nohuge; + + if (!PageTransHuge(page) || PageLocked(page)) + goto nohuge; + + if (!page_cache_get_speculative(page)) + goto nohuge; + + if (page != xas_reload(xas)) + goto unref; + + if (!PageTransHuge(page)) + goto unref; + + if (!PageUptodate(page) || PageReadahead(page) || PageHWPoison(page)) + goto unref; + + if (!trylock_page(page)) + goto unref; + + if (page->mapping != mapping || !PageUptodate(page)) + goto unlock; + + if (xas->xa_index >= DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE)) + goto unlock; + + do_set_pmd(vmf, page); + unlock_page(page); + return true; +unlock: + unlock_page(page); +unref: + put_page(page); +nohuge: + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (likely(pmd_none(*vmf->pmd))) { + mm_inc_nr_ptes(vma->vm_mm); + pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); + vmf->prealloc_pte = NULL; + } + spin_unlock(vmf->ptl); + + /* See comment in handle_pte_fault() */ + if (pmd_devmap_trans_unstable(vmf->pmd)) + return true; + + return false; +} + void filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { - struct file *file = vmf->vma->vm_file; + struct vm_area_struct *vma = vmf->vma; + struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; pgoff_t last_pgoff = start_pgoff; unsigned long max_idx; @@ -2843,20 +2908,37 @@ void filemap_map_pages(struct vm_fault *vmf, unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss); rcu_read_lock(); - xas_for_each(&xas, head, end_pgoff) { + head = xas_find(&xas, end_pgoff); + for (; ; head = xas_next_entry(&xas, end_pgoff)) { + if (!head) { + rcu_read_unlock(); + return; + } + if (likely(!xas_retry(&xas, head))) + break; + } + + if (filemap_map_pmd(vmf, head, &xas)) { + rcu_read_unlock(); + return; + } + + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + + for (; head; head = xas_next_entry(&xas, end_pgoff)) { if (xas_retry(&xas, head)) continue; if (xa_is_value(head)) - goto next; - + continue; /* * Check for a locked page first, as a speculative * reference may adversely influence page migration. */ if (PageLocked(head)) - goto next; + continue; if (!page_cache_get_speculative(head)) - goto next; + continue; /* Has the page moved or been split? */ if (unlikely(head != xas_reload(&xas))) @@ -2884,19 +2966,18 @@ void filemap_map_pages(struct vm_fault *vmf, if (vmf->pte) vmf->pte += xas.xa_index - last_pgoff; last_pgoff = xas.xa_index; - if (alloc_set_pte(vmf, page)) - goto unlock; + if (pte_none(*vmf->pte)) + do_set_pte(vmf, page); + /* no need to invalidate: a not-present page won't be cached */ + update_mmu_cache(vma, vmf->address, vmf->pte); unlock_page(head); - goto next; + continue; unlock: unlock_page(head); skip: put_page(head); -next: - /* Huge page is mapped? No need to proceed. */ - if (pmd_trans_huge(*vmf->pmd)) - break; } + pte_unmap_unlock(vmf->pte, vmf->ptl); rcu_read_unlock(); WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss); } diff --git a/mm/memory.c b/mm/memory.c index c48f8df6e502..96d62774096a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3490,7 +3490,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) if (pte_alloc(vma->vm_mm, vmf->pmd)) return VM_FAULT_OOM; - /* See the comment in pte_alloc_one_map() */ + /* See the comment in map_set_pte() */ if (unlikely(pmd_trans_unstable(vmf->pmd))) return 0; @@ -3630,66 +3630,6 @@ static vm_fault_t __do_fault(struct vm_fault *vmf) return ret; } -/* - * The ordering of these checks is important for pmds with _PAGE_DEVMAP set. - * If we check pmd_trans_unstable() first we will trip the bad_pmd() check - * inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly - * returning 1 but not before it spams dmesg with the pmd_clear_bad() output. - */ -static int pmd_devmap_trans_unstable(pmd_t *pmd) -{ - return pmd_devmap(*pmd) || pmd_trans_unstable(pmd); -} - -static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf) -{ - struct vm_area_struct *vma = vmf->vma; - - if (!pmd_none(*vmf->pmd)) - goto map_pte; - if (vmf->prealloc_pte) { - vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); - if (unlikely(!pmd_none(*vmf->pmd))) { - spin_unlock(vmf->ptl); - goto map_pte; - } - - mm_inc_nr_ptes(vma->vm_mm); - pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); - spin_unlock(vmf->ptl); - vmf->prealloc_pte = NULL; - } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) { - return VM_FAULT_OOM; - } -map_pte: - /* - * If a huge pmd materialized under us just retry later. Use - * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead of - * pmd_trans_huge() to ensure the pmd didn't become pmd_trans_huge - * under us and then back to pmd_none, as a result of MADV_DONTNEED - * running immediately after a huge pmd fault in a different thread of - * this mm, in turn leading to a misleading pmd_trans_huge() retval. - * All we have to ensure is that it is a regular pmd that we can walk - * with pte_offset_map() and we can do that through an atomic read in - * C, which is what pmd_trans_unstable() provides. - */ - if (pmd_devmap_trans_unstable(vmf->pmd)) - return VM_FAULT_NOPAGE; - - /* - * At this point we know that our vmf->pmd points to a page of ptes - * and it cannot become pmd_none(), pmd_devmap() or pmd_trans_huge() - * for the duration of the fault. If a racing MADV_DONTNEED runs and - * we zap the ptes pointed to by our vmf->pmd, the vmf->ptl will still - * be valid and we will re-check to make sure the vmf->pte isn't - * pte_none() under vmf->ptl protection when we return to - * alloc_set_pte(). - */ - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, - &vmf->ptl); - return 0; -} - #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void deposit_prealloc_pte(struct vm_fault *vmf) { @@ -3704,7 +3644,7 @@ static void deposit_prealloc_pte(struct vm_fault *vmf) vmf->prealloc_pte = NULL; } -static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; @@ -3769,45 +3709,11 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) } #endif -/** - * alloc_set_pte - setup new PTE entry for given page and add reverse page - * mapping. If needed, the function allocates page table or use pre-allocated. - * - * @vmf: fault environment - * @page: page to map - * - * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on - * return. - * - * Target users are page handler itself and implementations of - * vm_ops->map_pages. - * - * Return: %0 on success, %VM_FAULT_ code in case of error. - */ -vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page) +void do_set_pte(struct vm_fault *vmf, struct page *page) { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; pte_t entry; - vm_fault_t ret; - - if (pmd_none(*vmf->pmd) && PageTransCompound(page)) { - ret = do_set_pmd(vmf, page); - if (ret != VM_FAULT_FALLBACK) - return ret; - } - - if (!vmf->pte) { - ret = pte_alloc_one_map(vmf); - if (ret) - return ret; - } - - /* Re-check under ptl */ - if (unlikely(!pte_none(*vmf->pte))) { - update_mmu_tlb(vma, vmf->address, vmf->pte); - return VM_FAULT_NOPAGE; - } flush_icache_page(vma, page); entry = mk_pte(page, vma->vm_page_prot); @@ -3824,14 +3730,8 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page) page_add_file_rmap(page, false); } set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); - - /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, vmf->address, vmf->pte); - - return 0; } - /** * finish_fault - finish page fault once we have prepared the page to fault * @@ -3849,12 +3749,12 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page) */ vm_fault_t finish_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct page *page; - vm_fault_t ret = 0; + vm_fault_t ret; /* Did we COW the page? */ - if ((vmf->flags & FAULT_FLAG_WRITE) && - !(vmf->vma->vm_flags & VM_SHARED)) + if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) page = vmf->cow_page; else page = vmf->page; @@ -3863,13 +3763,35 @@ vm_fault_t finish_fault(struct vm_fault *vmf) * check even for read faults because we might have lost our CoWed * page */ - if (!(vmf->vma->vm_flags & VM_SHARED)) - ret = check_stable_address_space(vmf->vma->vm_mm); - if (!ret) - ret = alloc_set_pte(vmf, page); - if (vmf->pte) - pte_unmap_unlock(vmf->pte, vmf->ptl); - return ret; + if (!(vma->vm_flags & VM_SHARED)) + ret = check_stable_address_space(vma->vm_mm); + if (ret) + return ret; + + if (pmd_none(*vmf->pmd)) { + if (PageTransCompound(page)) { + ret = do_set_pmd(vmf, page); + if (ret != VM_FAULT_FALLBACK) + return ret; + } + + if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) + return VM_FAULT_OOM; + } + + /* See comment in handle_pte_fault() */ + if (pmd_devmap_trans_unstable(vmf->pmd)) + return 0; + + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + /* Re-check under ptl */ + if (likely(pte_none(*vmf->pte))) + do_set_pte(vmf, page); + + update_mmu_tlb(vma, vmf->address, vmf->pte); + pte_unmap_unlock(vmf->pte, vmf->ptl); + return 0; } static unsigned long fault_around_bytes __read_mostly = @@ -3980,7 +3902,6 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf) vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT); if (!pte_none(*vmf->pte)) ret = VM_FAULT_NOPAGE; - pte_unmap_unlock(vmf->pte, vmf->ptl); out: vmf->address = address; vmf->pte = NULL; @@ -4340,7 +4261,18 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) */ vmf->pte = NULL; } else { - /* See comment in pte_alloc_one_map() */ + /* + * If a huge pmd materialized under us just retry later. Use + * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead + * of pmd_trans_huge() to ensure the pmd didn't become + * pmd_trans_huge under us and then back to pmd_none, as a + * result of MADV_DONTNEED running immediately after a huge pmd + * fault in a different thread of this mm, in turn leading to a + * misleading pmd_trans_huge() retval. All we have to ensure is + * that it is a regular pmd that we can walk with + * pte_offset_map() and we can do that through an atomic read + * in C, which is what pmd_trans_unstable() provides. + */ if (pmd_devmap_trans_unstable(vmf->pmd)) return 0; /* -- Kirill A. Shutemov