A page mapped at the pte level can also be a huge page when ARM CONT_PTE or RISC-V SVNAPOT is applied. The lack of huge pte handling logic in follow_page_pte() may lead to both performance and correctness issues. For example, on the RISC-V platform, all pages in the same 64K huge page have the same pte value, which means follow_page_pte() will get the same page for all of them using pte_pfn(). Then __get_user_pages() will return an array of pages with the same pfn. Mapping these pages causes memory confusion. This error can be triggered by the following code: void *addr = mmap(NULL, 0x10000, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB | MAP_HUGE_64KB, -1, 0); struct vfio_iommu_type1_dma_map dma_map = { .argsz = sizeof(dma_map), .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE, .vaddr = (uint64_t)addr, .size = 0x10000, }; ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map); This commit supplies huge pte handling logic in follow_page_pte() to avoid such problems. Signed-off-by: Xu Lu <luxu.kernel@xxxxxxxxxxxxx> --- arch/riscv/include/asm/pgtable.h | 6 ++++++ include/linux/pgtable.h | 8 ++++++++ mm/gup.c | 17 +++++++++++------ 3 files changed, 25 insertions(+), 6 deletions(-) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 050fdc49b5ad7..40ae5979dd82c 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -800,6 +800,12 @@ static inline bool pud_user_accessible_page(pud_t pud) #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define pte_trans_huge pte_trans_huge +static inline int pte_trans_huge(pte_t pte) +{ + return pte_huge(pte) && pte_napot(pte); +} + static inline int pmd_trans_huge(pmd_t pmd) { return pmd_leaf(pmd); diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 94d267d02372e..3f57ee6dcf017 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1584,6 +1584,14 @@ static inline unsigned long my_zero_pfn(unsigned long addr) #ifdef CONFIG_MMU +#if 
(defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(pte_trans_huge)) || \ + (!defined(CONFIG_TRANSPARENT_HUGEPAGE)) +static inline int pte_trans_huge(pte_t pte) +{ + return 0; +} +#endif + #ifndef CONFIG_TRANSPARENT_HUGEPAGE static inline int pmd_trans_huge(pmd_t pmd) { diff --git a/mm/gup.c b/mm/gup.c index 3883b307780ea..67981ee28df86 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -838,7 +838,7 @@ static inline bool can_follow_write_pte(pte_t pte, struct page *page, static struct page *follow_page_pte(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags, - struct dev_pagemap **pgmap) + struct follow_page_context *ctx) { struct mm_struct *mm = vma->vm_mm; struct folio *folio; @@ -879,8 +879,8 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, * case since they are only valid while holding the pgmap * reference. */ - *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap); - if (*pgmap) + ctx->pgmap = get_dev_pagemap(pte_pfn(pte), ctx->pgmap); + if (ctx->pgmap) page = pte_page(pte); else goto no_page; @@ -940,6 +940,11 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, */ folio_mark_accessed(folio); } + if (is_vm_hugetlb_page(vma) || pte_trans_huge(pte)) { + ctx->page_mask = (1 << folio_order(folio)) - 1; + page = folio_page(folio, 0) + + ((address & (folio_size(folio) - 1)) >> PAGE_SHIFT); + } out: pte_unmap_unlock(ptep, ptl); return page; @@ -975,7 +980,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, return no_page_table(vma, flags, address); } if (likely(!pmd_leaf(pmdval))) - return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); + return follow_page_pte(vma, address, pmd, flags, ctx); if (pmd_protnone(pmdval) && !gup_can_follow_protnone(vma, flags)) return no_page_table(vma, flags, address); @@ -988,14 +993,14 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, } if (unlikely(!pmd_leaf(pmdval))) { spin_unlock(ptl); - return follow_page_pte(vma, address, pmd, flags, 
&ctx->pgmap); + return follow_page_pte(vma, address, pmd, flags, ctx); } if (pmd_trans_huge(pmdval) && (flags & FOLL_SPLIT_PMD)) { spin_unlock(ptl); split_huge_pmd(vma, pmd, address); /* If pmd was left empty, stuff a page table in there quickly */ return pte_alloc(mm, pmd) ? ERR_PTR(-ENOMEM) : - follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); + follow_page_pte(vma, address, pmd, flags, ctx); } page = follow_huge_pmd(vma, address, pmd, flags, ctx); spin_unlock(ptl); -- 2.20.1