The patch titled
     Subject: hugetlb: add vma based lock for pmd sharing
has been added to the -mm mm-unstable branch.  Its filename is
     hugetlb-add-vma-based-lock-for-pmd-sharing.patch

This patch will shortly appear at
     https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patches/hugetlb-add-vma-based-lock-for-pmd-sharing.patch

This patch will later appear in the mm-unstable branch at
    git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Before you just go and hit "reply", please:
   a) Consider who else should be cc'ed
   b) Prefer to cc a suitable mailing list as well
   c) Ideally: find the original patch on the mailing list and do a
      reply-to-all to that, adding suitable additional cc's

*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***

The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days

------------------------------------------------------
From: Mike Kravetz <mike.kravetz@xxxxxxxxxx>
Subject: hugetlb: add vma based lock for pmd sharing
Date: Wed, 24 Aug 2022 10:57:55 -0700

Allocate a rw semaphore and hang off vm_private_data for synchronization
use by vmas that could be involved in pmd sharing.

Only add infrastructure for the new lock here.  Actual use will be added
in subsequent patch.
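No callers are added by this patch; they arrive with the follow-on patch
hugetlb-use-new-vma_lock-for-pmd-sharing-synchronization.patch.  As an
illustration only, the intended locking pattern looks roughly like the
sketch below.  example_pte_present() is a hypothetical helper, not code
from this series; paths that unshare pmds would instead take the lock in
write mode (or hugetlb_vma_trylock_write() where lock ordering requires
it):

/*
 * Hypothetical example, not part of this patch: take the new vma lock
 * in read mode around a hugetlb page table walk so that a concurrent
 * pmd unshare (which would hold the lock in write mode) cannot free a
 * shared page table underneath us.
 */
#include <linux/hugetlb.h>
#include <linux/mm.h>

static bool example_pte_present(struct vm_area_struct *vma, unsigned long addr)
{
	struct hstate *h = hstate_vma(vma);
	pte_t *ptep;
	bool present = false;

	hugetlb_vma_lock_read(vma);
	ptep = huge_pte_offset(vma->vm_mm, addr & huge_page_mask(h),
			       huge_page_size(h));
	if (ptep)
		present = !huge_pte_none(huge_ptep_get(ptep));
	hugetlb_vma_unlock_read(vma);

	return present;
}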
Shutemov" <kirill.shutemov@xxxxxxxxxxxxxxx> Cc: Miaohe Lin <linmiaohe@xxxxxxxxxx> Cc: Michal Hocko <mhocko@xxxxxxxx> Cc: Mina Almasry <almasrymina@xxxxxxxxxx> Cc: Muchun Song <songmuchun@xxxxxxxxxxxxx> Cc: Naoya Horiguchi <naoya.horiguchi@xxxxxxxxx> Cc: Pasha Tatashin <pasha.tatashin@xxxxxxxxxx> Cc: Peter Xu <peterx@xxxxxxxxxx> Cc: Prakash Sangappa <prakash.sangappa@xxxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- include/linux/hugetlb.h | 36 +++++++- kernel/fork.c | 6 - mm/hugetlb.c | 170 ++++++++++++++++++++++++++++++++++---- mm/rmap.c | 8 + 4 files changed, 197 insertions(+), 23 deletions(-) --- a/include/linux/hugetlb.h~hugetlb-add-vma-based-lock-for-pmd-sharing +++ a/include/linux/hugetlb.h @@ -126,7 +126,7 @@ struct hugepage_subpool *hugepage_new_su long min_hpages); void hugepage_put_subpool(struct hugepage_subpool *spool); -void reset_vma_resv_huge_pages(struct vm_area_struct *vma); +void hugetlb_dup_vma_private(struct vm_area_struct *vma); void clear_vma_resv_huge_pages(struct vm_area_struct *vma); int hugetlb_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int hugetlb_overcommit_handler(struct ctl_table *, int, void *, size_t *, @@ -214,6 +214,13 @@ struct page *follow_huge_pud(struct mm_s struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags); +void hugetlb_vma_lock_read(struct vm_area_struct *vma); +void hugetlb_vma_unlock_read(struct vm_area_struct *vma); +void hugetlb_vma_lock_write(struct vm_area_struct *vma); +void hugetlb_vma_unlock_write(struct vm_area_struct *vma); +int hugetlb_vma_trylock_write(struct vm_area_struct *vma); +void hugetlb_vma_assert_locked(struct vm_area_struct *vma); + int pmd_huge(pmd_t pmd); int pud_huge(pud_t pud); unsigned long hugetlb_change_protection(struct vm_area_struct *vma, @@ -225,7 +232,7 @@ void hugetlb_unshare_all_pmds(struct vm_ #else /* !CONFIG_HUGETLB_PAGE */ -static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) +static inline void hugetlb_dup_vma_private(struct vm_area_struct *vma) { } @@ -336,6 +343,31 @@ static inline int prepare_hugepage_range return -EINVAL; } +static inline void hugetlb_vma_lock_read(struct vm_area_struct *vma) +{ +} + +static inline void hugetlb_vma_unlock_read(struct vm_area_struct *vma) +{ +} + +static inline void hugetlb_vma_lock_write(struct vm_area_struct *vma) +{ +} + +static inline void hugetlb_vma_unlock_write(struct vm_area_struct *vma) +{ +} + +static inline int hugetlb_vma_trylock_write(struct vm_area_struct *vma) +{ + return 1; +} + +static inline void hugetlb_vma_assert_locked(struct vm_area_struct *vma) +{ +} + static inline int pmd_huge(pmd_t pmd) { return 0; --- a/kernel/fork.c~hugetlb-add-vma-based-lock-for-pmd-sharing +++ a/kernel/fork.c @@ -674,12 +674,10 @@ static __latent_entropy int dup_mmap(str } /* - * Clear hugetlb-related page reserves for children. This only - * affects MAP_PRIVATE mappings. Faults generated by the child - * are not guaranteed to succeed, even if read-only + * Copy/update hugetlb private vma information. 
 		 */
 		if (is_vm_hugetlb_page(tmp))
-			reset_vma_resv_huge_pages(tmp);
+			hugetlb_dup_vma_private(tmp);
 
 		/* Link the vma into the MT */
 		mas.index = tmp->vm_start;
--- a/mm/hugetlb.c~hugetlb-add-vma-based-lock-for-pmd-sharing
+++ a/mm/hugetlb.c
@@ -91,6 +91,8 @@ struct mutex *hugetlb_fault_mutex_table
 
 /* Forward declaration */
 static int hugetlb_acct_memory(struct hstate *h, long delta);
+static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
+static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
 
 static inline bool subpool_is_free(struct hugepage_subpool *spool)
 {
@@ -1008,12 +1010,25 @@ static int is_vma_resv_set(struct vm_are
 	return (get_vma_private_data(vma) & flag) != 0;
 }
 
-/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
-void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
+void hugetlb_dup_vma_private(struct vm_area_struct *vma)
 {
 	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
+	/*
+	 * Clear vm_private_data
+	 * - For MAP_PRIVATE mappings, this is the reserve map which does
+	 *   not apply to children.  Faults generated by the children are
+	 *   not guaranteed to succeed, even if read-only.
+	 * - For shared mappings this is a per-vma semaphore that may be
+	 *   allocated below.
+	 */
+	vma->vm_private_data = (void *)0;
 	if (!(vma->vm_flags & VM_MAYSHARE))
-		vma->vm_private_data = (void *)0;
+		return;
+
+	/*
+	 * Allocate semaphore if pmd sharing is possible.
+	 */
+	hugetlb_vma_lock_alloc(vma);
 }
 
 /*
@@ -1044,7 +1059,7 @@ void clear_vma_resv_huge_pages(struct vm
 		kref_put(&reservations->refs, resv_map_release);
 	}
 
-	reset_vma_resv_huge_pages(vma);
+	hugetlb_dup_vma_private(vma);
 }
 
 /* Returns true if the VMA has associated reserve pages */
@@ -4623,16 +4638,21 @@ static void hugetlb_vm_op_open(struct vm
 		resv_map_dup_hugetlb_cgroup_uncharge_info(resv);
 		kref_get(&resv->refs);
 	}
+
+	hugetlb_vma_lock_alloc(vma);
 }
 
 static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 {
 	struct hstate *h = hstate_vma(vma);
-	struct resv_map *resv = vma_resv_map(vma);
+	struct resv_map *resv;
 	struct hugepage_subpool *spool = subpool_vma(vma);
 	unsigned long reserve, start, end;
 	long gbl_reserve;
 
+	hugetlb_vma_lock_free(vma);
+
+	resv = vma_resv_map(vma);
 	if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
 		return;
 
@@ -6439,6 +6459,11 @@ bool hugetlb_reserve_pages(struct inode
 	}
 
 	/*
+	 * vma specific semaphore used for pmd sharing synchronization
+	 */
+	hugetlb_vma_lock_alloc(vma);
+
+	/*
 	 * Only apply hugepage reservation if asked. At fault time, an
 	 * attempt will be made for VM_NORESERVE to allocate a page
 	 * without using reserves
@@ -6461,12 +6486,11 @@ bool hugetlb_reserve_pages(struct inode
 		resv_map = inode_resv_map(inode);
 
 		chg = region_chg(resv_map, from, to, &regions_needed);
-
 	} else {
 		/* Private mapping. */
 		resv_map = resv_map_alloc();
 		if (!resv_map)
-			return false;
+			goto out_err;
 
 		chg = to - from;
 
@@ -6561,6 +6585,7 @@ out_uncharge_cgroup:
 	hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
 					    chg * pages_per_huge_page(h), h_cg);
 out_err:
+	hugetlb_vma_lock_free(vma);
 	if (!vma || vma->vm_flags & VM_MAYSHARE)
 		/* Only call region_abort if the region_chg succeeded but the
 		 * region_add failed or didn't run.
@@ -6640,14 +6665,34 @@ static unsigned long page_table_shareabl
 }
 
 static bool __vma_aligned_range_pmd_shareable(struct vm_area_struct *vma,
-				unsigned long start, unsigned long end)
+				unsigned long start, unsigned long end,
+				bool check_vma_lock)
 {
+#ifdef CONFIG_USERFAULTFD
+	if (uffd_disable_huge_pmd_share(vma))
+		return false;
+#endif
 	/*
 	 * check on proper vm_flags and page table alignment
 	 */
-	if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, start, end))
-		return true;
-	return false;
+	if (!(vma->vm_flags & VM_MAYSHARE))
+		return false;
+	if (check_vma_lock && !vma->vm_private_data)
+		return false;
+	if (!range_in_vma(vma, start, end))
+		return false;
+	return true;
+}
+
+static bool vma_pmd_shareable(struct vm_area_struct *vma)
+{
+	unsigned long start = ALIGN(vma->vm_start, PUD_SIZE),
+		      end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
+
+	if (start >= end)
+		return false;
+
+	return __vma_aligned_range_pmd_shareable(vma, start, end, false);
 }
 
 static bool vma_addr_pmd_shareable(struct vm_area_struct *vma,
@@ -6656,15 +6701,11 @@ static bool vma_addr_pmd_shareable(struc
 	unsigned long start = addr & PUD_MASK;
 	unsigned long end = start + PUD_SIZE;
 
-	return __vma_aligned_range_pmd_shareable(vma, start, end);
+	return __vma_aligned_range_pmd_shareable(vma, start, end, true);
 }
 
 bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
 {
-#ifdef CONFIG_USERFAULTFD
-	if (uffd_disable_huge_pmd_share(vma))
-		return false;
-#endif
 	return vma_addr_pmd_shareable(vma, addr);
 }
 
@@ -6695,6 +6736,95 @@ void adjust_range_if_pmd_sharing_possibl
 		*end = ALIGN(*end, PUD_SIZE);
 }
 
+static bool __vma_shareable_flags_pmd(struct vm_area_struct *vma)
+{
+	return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) &&
+		vma->vm_private_data;
+}
+
+void hugetlb_vma_lock_read(struct vm_area_struct *vma)
+{
+	if (__vma_shareable_flags_pmd(vma))
+		down_read((struct rw_semaphore *)vma->vm_private_data);
+}
+
+void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
+{
+	if (__vma_shareable_flags_pmd(vma))
+		up_read((struct rw_semaphore *)vma->vm_private_data);
+}
+
+void hugetlb_vma_lock_write(struct vm_area_struct *vma)
+{
+	if (__vma_shareable_flags_pmd(vma))
+		down_write((struct rw_semaphore *)vma->vm_private_data);
+}
+
+void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
+{
+	if (__vma_shareable_flags_pmd(vma))
+		up_write((struct rw_semaphore *)vma->vm_private_data);
+}
+
+int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
+{
+	if (!__vma_shareable_flags_pmd(vma))
+		return 1;
+
+	return down_write_trylock((struct rw_semaphore *)vma->vm_private_data);
+}
+
+void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
+{
+	if (__vma_shareable_flags_pmd(vma))
+		lockdep_assert_held((struct rw_semaphore *)
+				    vma->vm_private_data);
+}
+
+static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
+{
+	/*
+	 * Only present in sharable vmas.  See comment in
+	 * __unmap_hugepage_range_final about the need to check both
+	 * VM_SHARED and VM_MAYSHARE in free path
+	 */
+	if (!vma || !(vma->vm_flags & (VM_MAYSHARE | VM_SHARED)))
+		return;
+
+	if (vma->vm_private_data) {
+		kfree(vma->vm_private_data);
+		vma->vm_private_data = NULL;
+	}
+}
+
+static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
+{
+	struct rw_semaphore *vma_sema;
+
+	/* Only establish in (flags) sharable vmas */
+	if (!vma || !(vma->vm_flags & VM_MAYSHARE))
+		return;
+
+	/* Should never get here with non-NULL vm_private_data */
+	if (vma->vm_private_data)
+		return;
+
+	/* Check size/alignment for pmd sharing possible */
+	if (!vma_pmd_shareable(vma))
+		return;
+
+	vma_sema = kmalloc(sizeof(*vma_sema), GFP_KERNEL);
+	if (!vma_sema)
+		/*
+		 * If we can not allocate semaphore, then vma can not
+		 * participate in pmd sharing.
+		 */
+		return;
+
+	init_rwsem(vma_sema);
+	vma->vm_private_data = vma_sema;
+}
+
 /*
  * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
  * and returns the corresponding pte. While this is not necessary for the
@@ -6781,6 +6911,14 @@ int huge_pmd_unshare(struct mm_struct *m
 }
 
 #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
+{
+}
+
+static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
+{
+}
+
 pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 		      unsigned long addr, pud_t *pud)
 {
--- a/mm/rmap.c~hugetlb-add-vma-based-lock-for-pmd-sharing
+++ a/mm/rmap.c
@@ -24,7 +24,7 @@
  * mm->mmap_lock
  *   mapping->invalidate_lock (in filemap_fault)
  *     page->flags PG_locked (lock_page)
- *       hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
+ *       hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below)
  *         mapping->i_mmap_rwsem
  *           anon_vma->rwsem
  *             mm->page_table_lock or pte_lock
@@ -44,6 +44,12 @@
  *   anon_vma->rwsem,mapping->i_mmap_rwsem (memory_failure, collect_procs_anon)
  *     ->tasklist_lock
  *       pte map lock
+ *
+ * hugetlbfs PageHuge() take locks in this order:
+ *   hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
+ *     vma_lock (hugetlb specific lock for pmd_sharing)
+ *       mapping->i_mmap_rwsem (also used for hugetlb pmd sharing)
+ *         page->flags PG_locked (lock_page)
  */
 
 #include <linux/mm.h>
_

Patches currently in -mm which might be from mike.kravetz@xxxxxxxxxx are

hugetlbfs-revert-use-i_mmap_rwsem-to-address-page-fault-truncate-race.patch
hugetlbfs-revert-use-i_mmap_rwsem-for-more-pmd-sharing-synchronization.patch
hugetlb-rename-remove_huge_page-to-hugetlb_delete_from_page_cache.patch
hugetlb-handle-truncate-racing-with-page-faults.patch
hugetlb-rename-vma_shareable-and-refactor-code.patch
hugetlb-add-vma-based-lock-for-pmd-sharing.patch
hugetlb-create-hugetlb_unmap_file_folio-to-unmap-single-file-folio.patch
hugetlb-use-new-vma_lock-for-pmd-sharing-synchronization.patch