When we call __pte_alloc() and friends, a huge pmd might be created concurrently by another thread, which is why pmd_trans_unstable() currently has to be called after __pte_alloc() and friends return. This patch adds a pmd_installed_type return value to __pte_alloc() and friends, so that callers can detect the huge pmd from the return value instead of calling pmd_trans_unstable() again.

This patch has no functional change; it is just preparation for the following patches.

Signed-off-by: Qi Zheng <zhengqi.arch@xxxxxxxxxxxxx>
---
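(Reviewer note, not part of the commit message: a minimal sketch of the calling convention this patch establishes. example_map_pte() below is a hypothetical caller used only for illustration; it assumes the is_huge_pmd() helper introduced earlier in this series and the enum added by this patch.)

static vm_fault_t example_map_pte(struct vm_fault *vmf)
{
	/*
	 * Either installs a PTE page table or reports that a huge pmd
	 * was installed concurrently; no pmd_trans_unstable() re-check.
	 */
	int alloc_ret = pte_alloc(vmf->vma->vm_mm, vmf->pmd);

	if (alloc_ret < 0)			/* pte_alloc_one() failed */
		return VM_FAULT_OOM;
	if (alloc_ret == INSTALLED_HUGE_PMD)	/* huge pmd raced in */
		return 0;

	/* alloc_ret == INSTALLED_PTE: safe to map and lock the PTE here */
	return 0;
}
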
 include/linux/mm.h    | 20 +++++++++++++++++---
 mm/debug_vm_pgtable.c |  2 +-
 mm/filemap.c          | 11 +++++++----
 mm/gup.c              |  2 +-
 mm/internal.h         |  3 ++-
 mm/memory.c           | 39 ++++++++++++++++++++++++++-------------
 mm/migrate.c          | 17 ++---------------
 mm/mremap.c           |  2 +-
 mm/userfaultfd.c      | 24 +++++++++++++++---------
 9 files changed, 72 insertions(+), 48 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 706da081b9f8..52f36fde2f11 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2306,13 +2306,27 @@ static inline void pgtable_pte_page_dtor(struct page *page)
 	dec_lruvec_page_state(page, NR_PAGETABLE);
 }
 
-#define pte_alloc(mm, pmd) (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd))
+enum pmd_installed_type {
+	INSTALLED_PTE,
+	INSTALLED_HUGE_PMD,
+};
+
+static inline int pte_alloc(struct mm_struct *mm, pmd_t *pmd)
+{
+	if (unlikely(pmd_none(*(pmd))))
+		return __pte_alloc(mm, pmd);
+	if (unlikely(is_huge_pmd(*pmd)))
+		return INSTALLED_HUGE_PMD;
+
+	return INSTALLED_PTE;
+}
+#define pte_alloc pte_alloc
 
 #define pte_alloc_map(mm, pmd, address)			\
-	(pte_alloc(mm, pmd) ? NULL : pte_offset_map(pmd, address))
+	(pte_alloc(mm, pmd) < 0 ? NULL : pte_offset_map(pmd, address))
 
 #define pte_alloc_map_lock(mm, pmd, address, ptlp)	\
-	(pte_alloc(mm, pmd) ?			\
+	(pte_alloc(mm, pmd) < 0 ?			\
 		 NULL : pte_offset_map_lock(mm, pmd, address, ptlp))
 
 #define pte_alloc_kernel(pmd, address)			\
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 228e3954b90c..b8322c55e65d 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -1170,7 +1170,7 @@ static int __init init_args(struct pgtable_debug_args *args)
 	args->start_pmdp = pmd_offset(args->pudp, 0UL);
 	WARN_ON(!args->start_pmdp);
 
-	if (pte_alloc(args->mm, args->pmdp)) {
+	if (pte_alloc(args->mm, args->pmdp) < 0) {
 		pr_err("Failed to allocate pte entries\n");
 		ret = -ENOMEM;
 		goto error;
diff --git a/mm/filemap.c b/mm/filemap.c
index ff8d19b7ce1d..23363f8ddbbe 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3217,12 +3217,15 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page)
 		}
 	}
 
-	if (pmd_none(*vmf->pmd))
-		pmd_install(mm, vmf->pmd, &vmf->prealloc_pte);
+	if (pmd_none(*vmf->pmd)) {
+		int ret = pmd_install(mm, vmf->pmd, &vmf->prealloc_pte);
 
-	/* See comment in handle_pte_fault() */
-	if (pmd_devmap_trans_unstable(vmf->pmd))
+		if (unlikely(ret == INSTALLED_HUGE_PMD))
+			goto out;
+	} else if (pmd_devmap_trans_unstable(vmf->pmd)) {
+		/* See comment in handle_pte_fault() */
 		goto out;
+	}
 
 	return false;
diff --git a/mm/gup.c b/mm/gup.c
index 2c51e9748a6a..2def775232a3 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -699,7 +699,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
 	} else {
 		spin_unlock(ptl);
 		split_huge_pmd(vma, pmd, address);
-		ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
+		ret = pte_alloc(mm, pmd) < 0 ? -ENOMEM : 0;
 	}
 
 	return ret ? ERR_PTR(ret) :
diff --git a/mm/internal.h b/mm/internal.h
index 3b79a5c9427a..474d6e3443f8 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -67,7 +67,8 @@ bool __folio_end_writeback(struct folio *folio);
 
 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
 		unsigned long floor, unsigned long ceiling);
-void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
+enum pmd_installed_type pmd_install(struct mm_struct *mm, pmd_t *pmd,
+				    pgtable_t *pte);
 
 static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
 {
diff --git a/mm/memory.c b/mm/memory.c
index bec6a5d5ee7c..8a39c0e58324 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -437,8 +437,10 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	}
 }
 
-void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
+enum pmd_installed_type pmd_install(struct mm_struct *mm, pmd_t *pmd,
+				    pgtable_t *pte)
 {
+	int ret = INSTALLED_PTE;
 	spinlock_t *ptl = pmd_lock(mm, pmd);
 
 	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
@@ -459,20 +461,26 @@ void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
 		smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
 		pmd_populate(mm, pmd, *pte);
 		*pte = NULL;
+	} else if (is_huge_pmd(*pmd)) {
+		/* See comment in handle_pte_fault() */
+		ret = INSTALLED_HUGE_PMD;
 	}
 	spin_unlock(ptl);
+
+	return ret;
 }
 
 int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
 {
+	enum pmd_installed_type ret;
 	pgtable_t new = pte_alloc_one(mm);
 	if (!new)
 		return -ENOMEM;
 
-	pmd_install(mm, pmd, &new);
+	ret = pmd_install(mm, pmd, &new);
 	if (new)
 		pte_free(mm, new);
-	return 0;
+	return ret;
 }
 
 int __pte_alloc_kernel(pmd_t *pmd)
@@ -1813,7 +1821,7 @@ static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
 
 	/* Allocate the PTE if necessary; takes PMD lock once only. */
 	ret = -ENOMEM;
-	if (pte_alloc(mm, pmd))
+	if (pte_alloc(mm, pmd) < 0)
 		goto out;
 
 	while (pages_to_write_in_pmd) {
@@ -3713,6 +3721,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 	struct page *page;
 	vm_fault_t ret = 0;
 	pte_t entry;
+	int alloc_ret;
 
 	/* File mapping without ->vm_ops ? */
 	if (vma->vm_flags & VM_SHARED)
@@ -3728,11 +3737,11 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 	 *
 	 * Here we only have mmap_read_lock(mm).
	 */
-	if (pte_alloc(vma->vm_mm, vmf->pmd))
+	alloc_ret = pte_alloc(vma->vm_mm, vmf->pmd);
+	if (alloc_ret < 0)
 		return VM_FAULT_OOM;
 
-	/* See comment in handle_pte_fault() */
-	if (unlikely(pmd_trans_unstable(vmf->pmd)))
+	if (unlikely(alloc_ret == INSTALLED_HUGE_PMD))
 		return 0;
 
 	/* Use the zero-page for reads */
@@ -4023,6 +4032,8 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
 	}
 
 	if (pmd_none(*vmf->pmd)) {
+		int alloc_ret;
+
 		if (PageTransCompound(page)) {
 			ret = do_set_pmd(vmf, page);
 			if (ret != VM_FAULT_FALLBACK)
@@ -4030,14 +4041,16 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
 		}
 
 		if (vmf->prealloc_pte)
-			pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte);
-		else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd)))
-			return VM_FAULT_OOM;
-	}
+			alloc_ret = pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte);
+		else
+			alloc_ret = pte_alloc(vma->vm_mm, vmf->pmd);
 
-	/* See comment in handle_pte_fault() */
-	if (pmd_devmap_trans_unstable(vmf->pmd))
+		if (unlikely(alloc_ret != INSTALLED_PTE))
+			return alloc_ret < 0 ? VM_FAULT_OOM : 0;
+	} else if (pmd_devmap_trans_unstable(vmf->pmd)) {
+		/* See comment in handle_pte_fault() */
 		return 0;
+	}
 
 	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
 				       &vmf->ptl);
diff --git a/mm/migrate.c b/mm/migrate.c
index cf25b00f03c8..bdfdfd3b50be 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2731,21 +2731,8 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
 	if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp))
 		goto abort;
 
-	/*
-	 * Use pte_alloc() instead of pte_alloc_map().  We can't run
-	 * pte_offset_map() on pmds where a huge pmd might be created
-	 * from a different thread.
-	 *
-	 * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
-	 * parallel threads are excluded by other means.
-	 *
-	 * Here we only have mmap_read_lock(mm).
-	 */
-	if (pte_alloc(mm, pmdp))
-		goto abort;
-
-	/* See the comment in pte_alloc_one_map() */
-	if (unlikely(pmd_trans_unstable(pmdp)))
+	/* See the comment in do_anonymous_page() */
+	if (unlikely(pte_alloc(mm, pmdp) != INSTALLED_PTE))
 		goto abort;
 
 	if (unlikely(anon_vma_prepare(vma)))
diff --git a/mm/mremap.c b/mm/mremap.c
index c6e9da09dd0a..fc5c56858883 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -551,7 +551,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 			continue;
 		}
 
-		if (pte_alloc(new_vma->vm_mm, new_pmd))
+		if (pte_alloc(new_vma->vm_mm, new_pmd) < 0)
 			break;
 		move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
 			  new_pmd, new_addr, need_rmap_locks);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 0780c2a57ff1..2cea08e7f076 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -592,15 +592,21 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
 			err = -EEXIST;
 			break;
 		}
-		if (unlikely(pmd_none(dst_pmdval)) &&
-		    unlikely(__pte_alloc(dst_mm, dst_pmd))) {
-			err = -ENOMEM;
-			break;
-		}
-		/* If an huge pmd materialized from under us fail */
-		if (unlikely(pmd_trans_huge(*dst_pmd))) {
-			err = -EFAULT;
-			break;
+
+		if (unlikely(pmd_none(dst_pmdval))) {
+			int ret = __pte_alloc(dst_mm, dst_pmd);
+
+			/*
+			 * If there is not enough memory or a huge pmd
+			 * materialized from under us
+			 */
+			if (unlikely(ret < 0)) {
+				err = -ENOMEM;
+				break;
+			} else if (unlikely(ret == INSTALLED_HUGE_PMD)) {
+				err = -EFAULT;
+				break;
+			}
 		}
 		BUG_ON(pmd_none(*dst_pmd));
-- 
2.11.0