On 09/25/23 16:28, riel@xxxxxxxxxxx wrote:
> From: Rik van Riel <riel@xxxxxxxxxxx>
> 
> Malloc libraries, like jemalloc and tcmalloc, take decisions on when
> to call madvise independently from the code in the main application.
> 
> This sometimes results in the application page faulting on an address,
> right after the malloc library has shot down the backing memory with
> MADV_DONTNEED.
> 
> Usually this is harmless, because we always have some 4kB pages
> sitting around to satisfy a page fault. However, with hugetlbfs,
> systems often allocate only the exact number of huge pages that
> the application wants.
> 
> Due to TLB batching, hugetlbfs MADV_DONTNEED will free pages outside of
> any lock taken on the page fault path, which can open up the following
> race condition:
> 
>        CPU 1                            CPU 2
> 
>        MADV_DONTNEED
>        unmap page
>        shoot down TLB entry
>                                         page fault
>                                         fail to allocate a huge page
>                                         killed with SIGBUS
>        free page
> 
> Fix that race by pulling the locking from __unmap_hugepage_range_final
> into helper functions called from zap_page_range_single. This ensures
> page faults stay locked out of the MADV_DONTNEED VMA until the
> huge pages have actually been freed.
> 
> Signed-off-by: Rik van Riel <riel@xxxxxxxxxxx>
> ---
>  include/linux/hugetlb.h | 35 +++++++++++++++++++++++++++++++++--
>  mm/hugetlb.c            | 20 +++++++++++---------
>  mm/memory.c             |  7 +++----
>  3 files changed, 47 insertions(+), 15 deletions(-)
> 
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 694928fa06a3..d9ec500cfef9 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -139,7 +139,7 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
>  void unmap_hugepage_range(struct vm_area_struct *,
>                            unsigned long, unsigned long, struct page *,
>                            zap_flags_t);
> -void __unmap_hugepage_range_final(struct mmu_gather *tlb,
> +void __unmap_hugepage_range(struct mmu_gather *tlb,
>                            struct vm_area_struct *vma,
>                            unsigned long start, unsigned long end,
>                            struct page *ref_page, zap_flags_t zap_flags);
> @@ -246,6 +246,25 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
>  void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
>                                 unsigned long *start, unsigned long *end);
> 
> +extern void __hugetlb_zap_begin(struct vm_area_struct *vma,
> +                               unsigned long *begin, unsigned long *end);
> +extern void __hugetlb_zap_end(struct vm_area_struct *vma,
> +                             struct zap_details *details);
> +
> +static inline void hugetlb_zap_begin(struct vm_area_struct *vma,
> +                                    unsigned long *start, unsigned long *end)
> +{
> +       if (is_vm_hugetlb_page(vma))
> +               __hugetlb_zap_begin(vma, start, end);
> +}
> +
> +static inline void hugetlb_zap_end(struct vm_area_struct *vma,
> +                                  struct zap_details *details)
> +{
> +       if (is_vm_hugetlb_page(vma))
> +               __hugetlb_zap_end(vma, details);
> +}
> +
>  void hugetlb_vma_lock_read(struct vm_area_struct *vma);
>  void hugetlb_vma_unlock_read(struct vm_area_struct *vma);
>  void hugetlb_vma_lock_write(struct vm_area_struct *vma);
> @@ -297,6 +316,18 @@ static inline void adjust_range_if_pmd_sharing_possible(
>  {
>  }
> 
> +static inline void hugetlb_zap_begin(
> +                               struct vm_area_struct *vma,
> +                               unsigned long *start, unsigned long *end)
> +{
> +}
> +
> +static inline void hugetlb_zap_end(
> +                               struct vm_area_struct *vma,
> +                               struct zap_details *details)
> +{
> +}
> +
>  static inline struct page *hugetlb_follow_page_mask(
>     struct vm_area_struct *vma, unsigned long address, unsigned int flags,
>     unsigned int *page_mask)
> @@ -442,7 +473,7 @@ static inline long hugetlb_change_protection(
>         return 0;
>  }
> 
> -static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb,
> +static inline void __unmap_hugepage_range(struct mmu_gather *tlb,
>                         struct vm_area_struct *vma, unsigned long start,
>                         unsigned long end, struct page *ref_page,
>                         zap_flags_t zap_flags)
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index e859fba5bc7d..5f8b82e902a8 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -5306,9 +5306,9 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
>         return len + old_addr - old_end;
>  }
> 
> -static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
> -                                  unsigned long start, unsigned long end,
> -                                  struct page *ref_page, zap_flags_t zap_flags)
> +void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
> +                           unsigned long start, unsigned long end,
> +                           struct page *ref_page, zap_flags_t zap_flags)
>  {
>         struct mm_struct *mm = vma->vm_mm;
>         unsigned long address;
> @@ -5435,16 +5435,18 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
>         tlb_flush_mmu_tlbonly(tlb);
>  }
> 
> -void __unmap_hugepage_range_final(struct mmu_gather *tlb,
> -                         struct vm_area_struct *vma, unsigned long start,
> -                         unsigned long end, struct page *ref_page,
> -                         zap_flags_t zap_flags)
> +void __hugetlb_zap_begin(struct vm_area_struct *vma,
> +                        unsigned long *start, unsigned long *end)
>  {
> +       adjust_range_if_pmd_sharing_possible(vma, start, end);
>         hugetlb_vma_lock_write(vma);
>         i_mmap_lock_write(vma->vm_file->f_mapping);
> +}

__unmap_hugepage_range_final() was called from unmap_single_vma.
unmap_single_vma has two callers, zap_page_range_single and unmap_vmas.

When the locking was moved into hugetlb_zap_begin, it was only added to
the zap_page_range_single call path. Calls from unmap_vmas are missing
the locking.
-- 
Mike Kravetz

> 
> -       /* mmu notification performed in caller */
> -       __unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
> +void __hugetlb_zap_end(struct vm_area_struct *vma,
> +                      struct zap_details *details)
> +{
> +       zap_flags_t zap_flags = details ? details->zap_flags : 0;
> 
>         if (zap_flags & ZAP_FLAG_UNMAP) {       /* final unmap */
>                 /*
> diff --git a/mm/memory.c b/mm/memory.c
> index 6c264d2f969c..a07ae3b60530 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1683,7 +1683,7 @@ static void unmap_single_vma(struct mmu_gather *tlb,
>                 if (vma->vm_file) {
>                         zap_flags_t zap_flags = details ?
>                             details->zap_flags : 0;
> -                       __unmap_hugepage_range_final(tlb, vma, start, end,
> +                       __unmap_hugepage_range(tlb, vma, start, end,
>                                                   NULL, zap_flags);
>                 }
>         } else
> @@ -1753,9 +1753,7 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
>         lru_add_drain();
>         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
>                                 address, end);
> -       if (is_vm_hugetlb_page(vma))
> -               adjust_range_if_pmd_sharing_possible(vma, &range.start,
> -                                                    &range.end);
> +       hugetlb_zap_begin(vma, &range.start, &range.end);
>         tlb_gather_mmu(&tlb, vma->vm_mm);
>         update_hiwater_rss(vma->vm_mm);
>         mmu_notifier_invalidate_range_start(&range);
> @@ -1766,6 +1764,7 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
>         unmap_single_vma(&tlb, vma, address, end, details, false);
>         mmu_notifier_invalidate_range_end(&range);
>         tlb_finish_mmu(&tlb);
> +       hugetlb_zap_end(vma, details);
>  }
> 
>  /**
> -- 
> 2.41.0
> 
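
For context, the interleaving described in the commit message can be sketched
from userspace roughly as below. This is an illustration only, not part of the
patch: the MAP_HUGETLB mapping, the 2MB huge page size, and the pool setup are
assumptions, and whether the SIGBUS actually fires depends on hugetlb
reservation accounting and how tightly nr_hugepages is sized.

/*
 * Userspace sketch of the race: one thread stands in for the malloc
 * library calling MADV_DONTNEED on its own schedule, the other stands
 * in for the application touching the same huge page.
 *
 * Build with:  gcc -O2 -pthread repro.c
 * Needs huge pages provisioned, e.g.  echo 1 > /proc/sys/vm/nr_hugepages
 */
#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <sys/mman.h>

#define HPAGE_SIZE (2UL * 1024 * 1024)	/* assumes 2MB huge pages */

static char *map;

/* "Malloc library" thread: shoots down the backing huge page. */
static void *dontneed_thread(void *arg)
{
	(void)arg;
	for (;;)
		madvise(map, HPAGE_SIZE, MADV_DONTNEED);
	return NULL;
}

int main(void)
{
	pthread_t t;

	map = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (map == MAP_FAILED) {
		perror("mmap(MAP_HUGETLB)");
		return 1;
	}

	pthread_create(&t, NULL, dontneed_thread, NULL);

	/*
	 * "Application" thread: each write may fault right after the
	 * other thread unmapped the page.  If the freed huge page has
	 * not made it back to the pool yet, the fault cannot allocate
	 * one and the process is killed with SIGBUS.
	 */
	for (;;)
		*(volatile char *)map = 1;
}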
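
To illustrate the gap Mike points out, here is a rough sketch of the
unmap_vmas() path taking the same locking that zap_page_range_single() now
takes through the new helpers. This is an assumption about what a follow-up
could look like, not the actual fix: the function body is simplified from
mm/memory.c, and the mmu notifier setup and teardown are omitted.

/*
 * Sketch only: wrap each VMA's unmap in hugetlb_zap_begin()/end() on the
 * unmap_vmas() path as well, so exit/munmap teardown also keeps page
 * faults locked out until the huge pages are freed.
 */
void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr, unsigned long tree_end,
		bool mm_wr_locked)
{
	struct zap_details details = {
		.zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP,
		.even_cows = true,
	};

	do {
		unsigned long start = start_addr;
		unsigned long end = end_addr;

		/* Widen the range and take the hugetlb VMA lock ... */
		hugetlb_zap_begin(vma, &start, &end);
		unmap_single_vma(tlb, vma, start, end, &details,
				 mm_wr_locked);
		/* ... and only drop it once the huge pages are freed. */
		hugetlb_zap_end(vma, &details);
	} while ((vma = mas_find(mas, tree_end - 1)) != NULL);
}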