On Thu, Jan 05, 2023 at 10:18:33AM +0000, James Houghton wrote:
> This is a necessary extension to the UFFDIO_CONTINUE changes. When
> userspace finishes mapping an entire hugepage with UFFDIO_CONTINUE, the
> kernel has no mechanism to automatically collapse the page table to map
> the whole hugepage normally. We require userspace to inform us that they
> would like the mapping to be collapsed; they do this with MADV_COLLAPSE.
>
> If userspace has not mapped all of a hugepage with UFFDIO_CONTINUE, but
> only some, hugetlb_collapse will cause the requested range to be mapped
> as if it were UFFDIO_CONTINUE'd already. The effects of any
> UFFDIO_WRITEPROTECT calls may be undone by a call to MADV_COLLAPSE for
> intersecting address ranges.
>
> This commit is co-opting the same madvise mode that has been introduced
> to synchronously collapse THPs. The function that does THP collapsing
> has been renamed to madvise_collapse_thp.
>
> As with the rest of the high-granularity mapping support, MADV_COLLAPSE
> is only supported for shared VMAs right now.
>
> MADV_COLLAPSE has the same synchronization as huge_pmd_unshare.
>
> Signed-off-by: James Houghton <jthoughton@xxxxxxxxxx>
> ---
>  include/linux/huge_mm.h |  12 +--
>  include/linux/hugetlb.h |   8 ++
>  mm/hugetlb.c            | 164 ++++++++++++++++++++++++++++++++++++++++
>  mm/khugepaged.c         |   4 +-
>  mm/madvise.c            |  18 ++++-
>  5 files changed, 197 insertions(+), 9 deletions(-)
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index a1341fdcf666..5d1e3c980f74 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -218,9 +218,9 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
>
>  int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags,
>                       int advice);
> -int madvise_collapse(struct vm_area_struct *vma,
> -                     struct vm_area_struct **prev,
> -                     unsigned long start, unsigned long end);
> +int madvise_collapse_thp(struct vm_area_struct *vma,
> +                         struct vm_area_struct **prev,
> +                         unsigned long start, unsigned long end);
>  void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start,
>                             unsigned long end, long adjust_next);
>  spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma);
> @@ -367,9 +367,9 @@ static inline int hugepage_madvise(struct vm_area_struct *vma,
>          return -EINVAL;
>  }
>
> -static inline int madvise_collapse(struct vm_area_struct *vma,
> -                                   struct vm_area_struct **prev,
> -                                   unsigned long start, unsigned long end)
> +static inline int madvise_collapse_thp(struct vm_area_struct *vma,
> +                                       struct vm_area_struct **prev,
> +                                       unsigned long start, unsigned long end)
>  {
>          return -EINVAL;
>  }
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index c8524ac49b24..e1baf939afb6 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -1298,6 +1298,8 @@ bool hugetlb_hgm_eligible(struct vm_area_struct *vma);
>  int hugetlb_alloc_largest_pte(struct hugetlb_pte *hpte, struct mm_struct *mm,
>                                struct vm_area_struct *vma, unsigned long start,
>                                unsigned long end);
> +int hugetlb_collapse(struct mm_struct *mm, struct vm_area_struct *vma,
> +                     unsigned long start, unsigned long end);
>  #else
>  static inline bool hugetlb_hgm_enabled(struct vm_area_struct *vma)
>  {
> @@ -1318,6 +1320,12 @@ int hugetlb_alloc_largest_pte(struct hugetlb_pte *hpte, struct mm_struct *mm,
>  {
>          return -EINVAL;
>  }
> +static inline
> +int hugetlb_collapse(struct mm_struct *mm, struct vm_area_struct *vma,
> +                     unsigned long start, unsigned long end)
> +{
> +        return -EINVAL;
> +}
>  #endif
>
>  static inline spinlock_t *huge_pte_lock(struct hstate *h,
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 5b6215e03fe1..388c46c7e77a 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -7852,6 +7852,170 @@ int hugetlb_alloc_largest_pte(struct hugetlb_pte *hpte, struct mm_struct *mm,
>          return 0;
>  }
>
> +static bool hugetlb_hgm_collapsable(struct vm_area_struct *vma)
> +{
> +        if (!hugetlb_hgm_eligible(vma))
> +                return false;
> +        if (!vma->vm_private_data) /* vma lock required for collapsing */
> +                return false;
> +        return true;
> +}
> +
> +/*
> + * Collapse the address range from @start to @end to be mapped optimally.
> + *
> + * This is only valid for shared mappings. The main use case for this function
> + * is following UFFDIO_CONTINUE. If a user UFFDIO_CONTINUEs an entire hugepage
> + * by calling UFFDIO_CONTINUE once for each 4K region, the kernel doesn't know
> + * to collapse the mapping after the final UFFDIO_CONTINUE. Instead, we leave
> + * it up to userspace to tell us to do so, via MADV_COLLAPSE.
> + *
> + * Any holes in the mapping will be filled. If there is no page in the
> + * pagecache for a region we're collapsing, the PTEs will be cleared.
> + *
> + * If high-granularity PTEs are uffd-wp markers, those markers will be dropped.
> + */
> +int hugetlb_collapse(struct mm_struct *mm, struct vm_area_struct *vma,
> +                     unsigned long start, unsigned long end)
> +{
> +        struct hstate *h = hstate_vma(vma);
> +        struct address_space *mapping = vma->vm_file->f_mapping;
> +        struct mmu_notifier_range range;
> +        struct mmu_gather tlb;
> +        unsigned long curr = start;
> +        int ret = 0;
> +        struct page *hpage, *subpage;
> +        pgoff_t idx;
> +        bool writable = vma->vm_flags & VM_WRITE;
> +        bool shared = vma->vm_flags & VM_SHARED;
> +        struct hugetlb_pte hpte;
> +        pte_t entry;
> +
> +        /*
> +         * This is only supported for shared VMAs, because we need to look up
> +         * the page to use for any PTEs we end up creating.
> +         */
> +        if (!shared)
> +                return -EINVAL;
> +
> +        /* If HGM is not enabled, there is nothing to collapse. */
> +        if (!hugetlb_hgm_enabled(vma))
> +                return 0;
> +
> +        /*
> +         * We lost the VMA lock after splitting, so we can't safely collapse.
> +         * We could improve this in the future (like take the mmap_lock for
> +         * writing and try again), but for now just fail with ENOMEM.
> +         */
> +        if (unlikely(!hugetlb_hgm_collapsable(vma)))
> +                return -ENOMEM;
> +
> +        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
> +                                start, end);
> +        mmu_notifier_invalidate_range_start(&range);
> +        tlb_gather_mmu(&tlb, mm);
> +
> +        /*
> +         * Grab the VMA lock and mapping sem for writing. This will prevent
> +         * concurrent high-granularity page table walks, so that we can safely
> +         * collapse and free page tables.
> +         *
> +         * This is the same locking that huge_pmd_unshare requires.
> +         */
> +        hugetlb_vma_lock_write(vma);
> +        i_mmap_lock_write(vma->vm_file->f_mapping);
> +
> +        while (curr < end) {
> +                ret = hugetlb_alloc_largest_pte(&hpte, mm, vma, curr, end);
> +                if (ret)
> +                        goto out;
> +
> +                entry = huge_ptep_get(hpte.ptep);
> +
> +                /*
> +                 * There is no work to do if the PTE doesn't point to page
> +                 * tables.
> +                 */
> +                if (!pte_present(entry))
> +                        goto next_hpte;
> +                if (hugetlb_pte_present_leaf(&hpte, entry))
> +                        goto next_hpte;
> +
> +                idx = vma_hugecache_offset(h, vma, curr);
> +                hpage = find_get_page(mapping, idx);
> +
> +                if (hpage && !HPageMigratable(hpage)) {
> +                        /*
> +                         * Don't collapse a mapping to a page that is pending
> +                         * a migration. Migration swap entries may have been
> +                         * placed in the page table.
> +                         */
> +                        ret = -EBUSY;
> +                        put_page(hpage);
> +                        goto out;
> +                }
> +
> +                if (hpage && PageHWPoison(hpage)) {
> +                        /*
> +                         * Don't collapse a mapping to a page that is
> +                         * hwpoisoned.
> +                         */
> +                        ret = -EHWPOISON;
> +                        put_page(hpage);
> +                        /*
> +                         * By setting ret to -EHWPOISON, if nothing else
> +                         * happens, we will tell userspace that we couldn't
> +                         * fully collapse everything due to poison.
> +                         *
> +                         * Skip this page, and continue to collapse the rest
> +                         * of the mapping.
> +                         */
> +                        curr = (curr & huge_page_mask(h)) + huge_page_size(h);
> +                        continue;
> +                }
> +
> +                /*
> +                 * Clear all the PTEs, and drop ref/mapcounts
> +                 * (on tlb_finish_mmu).
> +                 */
> +                __unmap_hugepage_range(&tlb, vma, curr,
> +                                       curr + hugetlb_pte_size(&hpte),
> +                                       NULL,
> +                                       ZAP_FLAG_DROP_MARKER);
> +                /* Free the PTEs. */
> +                hugetlb_free_pgd_range(&tlb,
> +                                       curr, curr + hugetlb_pte_size(&hpte),
> +                                       curr, curr + hugetlb_pte_size(&hpte));
> +                if (!hpage) {
> +                        huge_pte_clear(mm, curr, hpte.ptep,
> +                                       hugetlb_pte_size(&hpte));
> +                        goto next_hpte;
> +                }
> +
> +                page_dup_file_rmap(hpage, true);
> +
> +                subpage = hugetlb_find_subpage(h, hpage, curr);
> +                entry = make_huge_pte_with_shift(vma, subpage,
> +                                                 writable, hpte.shift);
> +                set_huge_pte_at(mm, curr, hpte.ptep, entry);
> +next_hpte:
> +                curr += hugetlb_pte_size(&hpte);
> +
> +                if (curr < end) {
> +                        /* Don't hold the VMA lock for too long. */
> +                        hugetlb_vma_unlock_write(vma);
> +                        cond_resched();
> +                        hugetlb_vma_lock_write(vma);

The intention is good here, but IIUC this will cause the vma lock to be
taken after the i_mmap_rwsem, which can cause circular deadlocks. To do
this properly we'd need to also release the i_mmap_rwsem. However that may
make the cond_resched() logic overly complicated. Meanwhile, for 2M huge
pages I think this will be done for each 2M range, which can be too fine
grained, so the "curr < end" check looks a bit too aggressive.

The other thing I noticed is that the long mmu notifier invalidation
window between start -> end will (in a real-life VM context) cause vcpu
threads to spin. I _think_ it's because is_page_fault_stale() (when
handling a vmexit following a kvm page fault) always reports true during
the long MADV_COLLAPSE procedure if it's called upon a large range, so
even if we release both locks here it may not help tremendously for the VM
migration use case, because of the long-standing mmu notifier invalidation.

To summarize.. I think a simpler starting version of hugetlb MADV_COLLAPSE
could drop this "if" block and let the userspace app decide the step size
of COLLAPSE?

> +                }
> +        }
> +out:
> +        i_mmap_unlock_write(vma->vm_file->f_mapping);
> +        hugetlb_vma_unlock_write(vma);
> +        tlb_finish_mmu(&tlb);
> +        mmu_notifier_invalidate_range_end(&range);
> +        return ret;
> +}
> +
>  #endif /* CONFIG_HUGETLB_HIGH_GRANULARITY_MAPPING */

-- 
Peter Xu
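
For context, a minimal userspace sketch of the flow under discussion:
UFFDIO_CONTINUE each 4K piece of a hugepage, then issue a single
MADV_COLLAPSE over it, with userspace free to pick a different collapse
step size. This is not part of the patch; it assumes the 4K-granularity
UFFDIO_CONTINUE that this series adds, a mapping already registered with
UFFDIO_REGISTER_MODE_MINOR whose backing page cache has been populated,
and a 2M hugepage size. The constants and the helper name are illustrative.

#include <errno.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/userfaultfd.h>

#ifndef MADV_COLLAPSE
#define MADV_COLLAPSE 25		/* provided by newer uapi headers */
#endif

#define PAGE_SIZE_4K	4096UL
#define HPAGE_SIZE	(2UL << 20)	/* illustrative: 2M hugepage */

/*
 * hpage_addr points at one hugepage-aligned, hugepage-sized region that is
 * registered with the userfaultfd in minor mode and whose page cache has
 * already been filled through a second mapping.
 */
static int continue_then_collapse(int uffd, char *hpage_addr)
{
	unsigned long off;

	/* Map the hugepage at 4K granularity, one UFFDIO_CONTINUE per piece. */
	for (off = 0; off < HPAGE_SIZE; off += PAGE_SIZE_4K) {
		struct uffdio_continue cont = {
			.range = {
				.start = (unsigned long)hpage_addr + off,
				.len = PAGE_SIZE_4K,
			},
			.mode = 0,
		};

		if (ioctl(uffd, UFFDIO_CONTINUE, &cont) && errno != EEXIST) {
			perror("UFFDIO_CONTINUE");
			return -1;
		}
	}

	/*
	 * Ask the kernel to collapse the high-granularity mapping; userspace
	 * chooses the step size, here the whole hugepage at once.
	 */
	if (madvise(hpage_addr, HPAGE_SIZE, MADV_COLLAPSE)) {
		perror("MADV_COLLAPSE");
		return -1;
	}
	return 0;
}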