On 3/24/21 7:00 PM, Joao Martins wrote: > On 3/24/21 5:45 PM, Dan Williams wrote: >> On Thu, Mar 18, 2021 at 3:02 AM Joao Martins <joao.m.martins@xxxxxxxxxx> wrote: >>> On 3/18/21 4:08 AM, Dan Williams wrote: >>>> Now that device-dax and filesystem-dax are guaranteed to unmap all user >>>> mappings of devmap / DAX pages before tearing down the 'struct page' >>>> array, get_user_pages_fast() can rely on its traditional synchronization >>>> method "validate_pte(); get_page(); revalidate_pte()" to catch races with >>>> device shutdown. Specifically the unmap guarantee ensures that gup-fast >>>> either succeeds in taking a page reference (lock-less), or it detects a >>>> need to fall back to the slow path where the device presence can be >>>> revalidated with locks held. >>> >>> [...] >>> >>> So for allowing FOLL_LONGTERM[0] would it be OK if we used page->pgmap after >>> try_grab_page() for checking pgmap type to see if we are in a device-dax >>> longterm pin? >> >> So, there is an effort to add a new pte bit p{m,u}d_special to disable >> gup-fast for huge pages [1]. I'd like to investigate whether we could >> use devmap + special as an encoding for "no longterm" and never >> consult the pgmap in the gup-fast path. >> >> [1]: https://lore.kernel.org/linux-mm/a1fa7fa2-914b-366d-9902-e5b784e8428c@xxxxxxxxxxxx/ >> > > Oh, nice! That would be ideal indeed, as we would skip the pgmap lookup enterily. > > I suppose device-dax would use pfn_t PFN_MAP while fs-dax memory device would set PFN_MAP > | PFN_DEV (provided vmf_insert_pfn_{pmd,pud} calls mkspecial on PFN_DEV). > > I haven't been following that thread, but for PMD/PUD special in vmf_* these might be useful: > > https://lore.kernel.org/linux-mm/20200110190313.17144-2-joao.m.martins@xxxxxxxxxx/ > https://lore.kernel.org/linux-mm/20200110190313.17144-4-joao.m.martins@xxxxxxxxxx/ > On a second thought, maybe it doesn't need to be that complicated for {fs,dev}dax if the added special bit is just a subcase of devmap pte/pmd/puds. See below scsissors mark as a rough estimation on the changes (nothing formal/proper as it isn't properly splitted). Running gup_test with devdax/fsdax FOLL_LONGTERM and without does the intended. (gup_test -m 16384 -r 10 -a -S -n 512 -w -f <file> [-F 0x10000]). Unless this is about using special PMD/PUD bits without page structs (thus without devmap bits) as in the previous two links. -- >8 -- Subject: mm/gup, nvdimm: allow FOLL_LONGTERM for device-dax gup-fast Signed-off-by: Joao Martins <joao.m.martins@xxxxxxxxxx> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 47027796c2f9..8b5d68d89cde 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -439,11 +439,16 @@ static inline pmd_t pmd_mkinvalid(pmd_t pmd) #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define pmd_devmap(pmd) pte_devmap(pmd_pte(pmd)) +#define pmd_special(pmd) pte_special(pmd_pte(pmd)) #endif static inline pmd_t pmd_mkdevmap(pmd_t pmd) { return pte_pmd(set_pte_bit(pmd_pte(pmd), __pgprot(PTE_DEVMAP))); } +static inline pmd_t pmd_mkspecial(pmd_t pmd) +{ + return pte_pmd(set_pte_bit(pmd_pte(pmd), __pgprot(PTE_SPECIAL))); +} #define __pmd_to_phys(pmd) __pte_to_phys(pmd_pte(pmd)) #define __phys_to_pmd_val(phys) __phys_to_pte_val(phys) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index b1099f2d9800..45449ee86d4f 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -259,13 +259,13 @@ static inline int pmd_large(pmd_t pte) /* NOTE: when predicate huge page, consider also pmd_devmap, or use pmd_large */ static inline int pmd_trans_huge(pmd_t pmd) { - return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE; + return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP|_PAGE_SPECIAL)) == _PAGE_PSE; } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD static inline int pud_trans_huge(pud_t pud) { - return (pud_val(pud) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE; + return (pud_val(pud) & (_PAGE_PSE|_PAGE_DEVMAP|_PAGE_SPECIAL)) == _PAGE_PSE; } #endif @@ -297,6 +297,19 @@ static inline int pgd_devmap(pgd_t pgd) { return 0; } + +static inline bool pmd_special(pmd_t pmd) +{ + return !!(pmd_flags(pmd) & _PAGE_SPECIAL); +} + +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static inline bool pud_special(pud_t pud) +{ + return !!(pud_flags(pud) & _PAGE_SPECIAL); +} +#endif + #endif #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -452,6 +465,11 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_DEVMAP); } +static inline pmd_t pmd_mkspecial(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_SPECIAL); +} + static inline pmd_t pmd_mkhuge(pmd_t pmd) { return pmd_set_flags(pmd, _PAGE_PSE); @@ -511,6 +529,11 @@ static inline pud_t pud_mkhuge(pud_t pud) return pud_set_flags(pud, _PAGE_PSE); } +static inline pud_t pud_mkspecial(pud_t pud) +{ + return pud_set_flags(pud, _PAGE_SPECIAL); +} + static inline pud_t pud_mkyoung(pud_t pud) { return pud_set_flags(pud, _PAGE_ACCESSED); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 16760b237229..156ceae33164 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -435,7 +435,7 @@ static int pmem_attach_disk(struct device *dev, pmem->data_offset = le64_to_cpu(pfn_sb->dataoff); pmem->pfn_pad = resource_size(res) - range_len(&pmem->pgmap.range); - pmem->pfn_flags |= PFN_MAP; + pmem->pfn_flags |= PFN_MAP | PFN_SPECIAL; bb_range = pmem->pgmap.range; bb_range.start += pmem->data_offset; } else if (pmem_should_map_pages(dev)) { @@ -445,7 +445,7 @@ static int pmem_attach_disk(struct device *dev, pmem->pgmap.type = MEMORY_DEVICE_FS_DAX; pmem->pgmap.ops = &fsdax_pagemap_ops; addr = devm_memremap_pages(dev, &pmem->pgmap); - pmem->pfn_flags |= PFN_MAP; + pmem->pfn_flags |= PFN_MAP | PFN_SPECIAL; bb_range = pmem->pgmap.range; } else { if (devm_add_action_or_reset(dev, pmem_release_queue, diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 4ee6f734ba83..873c8e53c85d 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -748,7 +748,7 @@ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, *kaddr = fs->window_kaddr + offset; if (pfn) *pfn = phys_to_pfn_t(fs->window_phys_addr + offset, - PFN_DEV | PFN_MAP); + PFN_DEV | PFN_MAP | PFN_SPECIAL); return nr_pages > max_nr_pages ? max_nr_pages : nr_pages; } diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 7364e5a70228..ad7078e38ef2 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1189,6 +1189,18 @@ static inline int pgd_devmap(pgd_t pgd) } #endif +#if !defined(CONFIG_ARCH_HAS_PTE_SPECIAL) || !defined(CONFIG_TRANSPARENT_HUGEPAGE) +static inline bool pmd_special(pmd_t pmd) +{ + return false; +} + +static inline pmd_t pmd_mkspecial(pmd_t pmd) +{ + return pmd; +} +#endif + #if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \ (defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)) @@ -1196,6 +1208,14 @@ static inline int pud_trans_huge(pud_t pud) { return 0; } +static inline bool pud_special(pud_t pud) +{ + return false; +} +static inline pud_t pud_mkspecial(pud_t pud) +{ + return pud; +} #endif /* See pmd_none_or_trans_huge_or_clear_bad for discussion. */ diff --git a/mm/gup.c b/mm/gup.c index b3e647c8b7ee..87aa229a9347 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2086,7 +2086,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, goto pte_unmap; if (pte_devmap(pte)) { - if (unlikely(flags & FOLL_LONGTERM)) + if (unlikely(flags & FOLL_LONGTERM) && pte_special(pte)) goto pte_unmap; pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); @@ -2338,7 +2338,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, return 0; if (pmd_devmap(orig)) { - if (unlikely(flags & FOLL_LONGTERM)) + if (unlikely(flags & FOLL_LONGTERM) && pmd_special(orig)) return 0; return __gup_device_huge_pmd(orig, pmdp, addr, end, flags, pages, nr); @@ -2372,7 +2372,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, return 0; if (pud_devmap(orig)) { - if (unlikely(flags & FOLL_LONGTERM)) + if (unlikely(flags & FOLL_LONGTERM) && pud_special(orig)) return 0; return __gup_device_huge_pud(orig, pudp, addr, end, flags, pages, nr); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f6f70632fc29..9d5117711919 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -796,8 +796,11 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, } entry = pmd_mkhuge(pfn_t_pmd(pfn, prot)); - if (pfn_t_devmap(pfn)) + if (pfn_t_devmap(pfn)) { entry = pmd_mkdevmap(entry); + if (pfn_t_special(pfn)) + entry = pmd_mkspecial(entry); + } if (write) { entry = pmd_mkyoung(pmd_mkdirty(entry)); entry = maybe_pmd_mkwrite(entry, vma); @@ -896,8 +899,11 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, } entry = pud_mkhuge(pfn_t_pud(pfn, prot)); - if (pfn_t_devmap(pfn)) + if (pfn_t_devmap(pfn)) { entry = pud_mkdevmap(entry); + if (pfn_t_special(pfn)) + entry = pud_mkspecial(entry); + } if (write) { entry = pud_mkyoung(pud_mkdirty(entry)); entry = maybe_pud_mkwrite(entry, vma);