On Tue, Jan 31, 2023 at 7:42 PM David Stevens <stevensd@xxxxxxxxxxxx> wrote: > > From: David Stevens <stevensd@xxxxxxxxxxxx> > > Collapsing memory in a vma that has an armed userfaultfd results in > zero-filling any missing pages, which breaks user-space paging for those > filled pages. Avoid khugepage bypassing userfaultfd by not collapsing > pages in shmem reached via scanning a vma with an armed userfaultfd if > doing so would zero-fill any pages. > > Signed-off-by: David Stevens <stevensd@xxxxxxxxxxxx> > --- > mm/khugepaged.c | 35 ++++++++++++++++++++++++----------- > 1 file changed, 24 insertions(+), 11 deletions(-) > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c > index 79be13133322..48e944fb8972 100644 > --- a/mm/khugepaged.c > +++ b/mm/khugepaged.c > @@ -1736,8 +1736,8 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff, > * + restore gaps in the page cache; > * + unlock and free huge page; > */ > -static int collapse_file(struct mm_struct *mm, unsigned long addr, > - struct file *file, pgoff_t start, > +static int collapse_file(struct mm_struct *mm, struct vm_area_struct *vma, > + unsigned long addr, struct file *file, pgoff_t start, > struct collapse_control *cc) > { > struct address_space *mapping = file->f_mapping; > @@ -1784,6 +1784,9 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, > * be able to map it or use it in another way until we unlock it. > */ > > + if (is_shmem) > + mmap_read_lock(mm); If you release mmap_lock and then reacquire it here, the vma cannot be trusted anymore; it is not safe to use the vma after that point. Since you already read uffd_was_armed before releasing mmap_lock, you could pass it directly to collapse_file without dereferencing the vma again. This may produce a false positive (the vma is no longer userfaultfd-armed by the time collapse_file runs), but that should be fine: khugepaged could still collapse this area in the next round. Also +userfaultfd folks. 
> + > xas_set(&xas, start); > for (index = start; index < end; index++) { > struct page *page = xas_next(&xas); > @@ -1792,6 +1795,10 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, > VM_BUG_ON(index != xas.xa_index); > if (is_shmem) { > if (!page) { > + if (userfaultfd_armed(vma)) { > + result = SCAN_EXCEED_NONE_PTE; > + goto xa_locked; > + } > /* > * Stop if extent has been truncated or > * hole-punched, and is now completely > @@ -2095,6 +2102,8 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, > hpage->mapping = NULL; > } > > + if (is_shmem) > + mmap_read_unlock(mm); > if (hpage) > unlock_page(hpage); > out: > @@ -2108,8 +2117,8 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, > return result; > } > > -static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, > - struct file *file, pgoff_t start, > +static int hpage_collapse_scan_file(struct mm_struct *mm, struct vm_area_struct *vma, > + unsigned long addr, struct file *file, pgoff_t start, > struct collapse_control *cc) > { > struct page *page = NULL; > @@ -2118,6 +2127,9 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, > int present, swap; > int node = NUMA_NO_NODE; > int result = SCAN_SUCCEED; > + bool uffd_was_armed = userfaultfd_armed(vma); > + > + mmap_read_unlock(mm); > > present = 0; > swap = 0; > @@ -2193,13 +2205,16 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, > } > rcu_read_unlock(); > > + if (uffd_was_armed && present < HPAGE_PMD_NR) > + result = SCAN_EXCEED_SWAP_PTE; > + > if (result == SCAN_SUCCEED) { > if (cc->is_khugepaged && > present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { > result = SCAN_EXCEED_NONE_PTE; > count_vm_event(THP_SCAN_EXCEED_NONE_PTE); > } else { > - result = collapse_file(mm, addr, file, start, cc); > + result = collapse_file(mm, vma, addr, file, start, cc); > } > } > > @@ -2207,8 +2222,8 @@ static int 
hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, > return result; > } > #else > -static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, > - struct file *file, pgoff_t start, > +static int hpage_collapse_scan_file(struct mm_struct *mm, struct vm_area_struct *vma, > + unsigned long addr, struct file *file, pgoff_t start, > struct collapse_control *cc) > { > BUILD_BUG(); > @@ -2304,8 +2319,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, > pgoff_t pgoff = linear_page_index(vma, > khugepaged_scan.address); > > - mmap_read_unlock(mm); > - *result = hpage_collapse_scan_file(mm, > + *result = hpage_collapse_scan_file(mm, vma, > khugepaged_scan.address, > file, pgoff, cc); > mmap_locked = false; > @@ -2656,9 +2670,8 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, > struct file *file = get_file(vma->vm_file); > pgoff_t pgoff = linear_page_index(vma, addr); > > - mmap_read_unlock(mm); > mmap_locked = false; > - result = hpage_collapse_scan_file(mm, addr, file, pgoff, > + result = hpage_collapse_scan_file(mm, vma, addr, file, pgoff, > cc); > fput(file); > } else { > -- > 2.39.1.456.gfc5497dd1b-goog > >