Re: [PATCH 1/1] mm/migrate: Trylock device page in do_swap_page

Alistair Popple <apopple@xxxxxxxxxx> · Wed, 11 Sep 2024 14:53:31 +1000

Matthew Brost <matthew.brost@xxxxxxxxx> writes:

> Avoid multiple CPU page faults to the same device page racing by locking
> the page in do_swap_page before taking an additional reference to the
> page. This prevents scenarios where multiple CPU page faults each take
> an extra reference to a device page, which could abort migration in
> folio_migrate_mapping. With the device page locked in do_swap_page, the
> migrate_vma_* functions need to be updated to avoid locking the
> fault_page argument.

I added the get_page() and therefore the fault_page argument to deal
with another lifetime issue. Neither Felix nor I particularly liked the
solution though (see
https://lore.kernel.org/all/cover.60659b549d8509ddecafad4f498ee7f03bb23c69.1664366292.git-series.apopple@xxxxxxxxxx/T/#m715589d57716448386ff9af41da27a952d94615a)
and this seems to make it even uglier, so I have suggested an
alternative solution below.

> Prior to this change, a livelock scenario could occur in Xe's (Intel GPU
> DRM driver) SVM implementation if enough threads faulted the same device
> page.

I haven't seen the same in the NVIDIA UVM driver (out-of-tree, I know)
but theoretically it seems like it should be possible. However we
serialize migrations of the same virtual address range to avoid these
kind of issues as they can happen the other way too (ie. multiple
threads trying to migrate to GPU).

So I suspect what happens in UVM is that one thread wins and installs
the migration entry while the others fail to get the driver migration
lock and bail out sufficiently early in the fault path to avoid the
live-lock.

> Cc: Philip Yang <Philip.Yang@xxxxxxx>
> Cc: Felix Kuehling <felix.kuehling@xxxxxxx>
> Cc: Christian König <christian.koenig@xxxxxxx>
> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
> Suggessted-by: Simona Vetter <simona.vetter@xxxxxxxx>
> Signed-off-by: Matthew Brost <matthew.brost@xxxxxxxxx>
> ---
>  mm/memory.c         | 13 +++++++---
>  mm/migrate_device.c | 60 +++++++++++++++++++++++++++++++--------------
>  2 files changed, 50 insertions(+), 23 deletions(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 3c01d68065be..bbd97d16a96a 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4046,10 +4046,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  			 * Get a page reference while we know the page can't be
>  			 * freed.
>  			 */
> -			get_page(vmf->page);
> -			pte_unmap_unlock(vmf->pte, vmf->ptl);
> -			ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
> -			put_page(vmf->page);
> +			if (trylock_page(vmf->page)) {
> +				get_page(vmf->page);
> +				pte_unmap_unlock(vmf->pte, vmf->ptl);

This is all beginning to look a lot like migrate_vma_collect_pmd(). So
rather than do this and then have to pass all this context
(ie. fault_page) down to the migrate_vma_* functions could we instead
just do what migrate_vma_collect_pmd() does here? Ie. we already have
the PTL and the page lock so there's no reason we couldn't just setup
the migration entry prior to calling migrate_to_ram().

Obviously calling migrate_vma_setup() would show the page as not
migrating, but drivers could easily just fill in the src_pfn info after
calling migrate_vma_setup().

This would eliminate the whole fault_page ugliness.

> +				ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
> +				put_page(vmf->page);
> +				unlock_page(vmf->page);
> +			} else {
> +				pte_unmap_unlock(vmf->pte, vmf->ptl);
> +			}
>  		} else if (is_hwpoison_entry(entry)) {
>  			ret = VM_FAULT_HWPOISON;
>  		} else if (is_pte_marker_entry(entry)) {
> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> index 6d66dc1c6ffa..049893a5a179 100644
> --- a/mm/migrate_device.c
> +++ b/mm/migrate_device.c
> @@ -60,6 +60,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>  				   struct mm_walk *walk)
>  {
>  	struct migrate_vma *migrate = walk->private;
> +	struct folio *fault_folio = migrate->fault_page ?
> +		page_folio(migrate->fault_page) : NULL;
>  	struct vm_area_struct *vma = walk->vma;
>  	struct mm_struct *mm = vma->vm_mm;
>  	unsigned long addr = start, unmapped = 0;
> @@ -88,11 +90,13 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>  
>  			folio_get(folio);
>  			spin_unlock(ptl);
> -			if (unlikely(!folio_trylock(folio)))
> +			if (unlikely(fault_folio != folio &&
> +				     !folio_trylock(folio)))
>  				return migrate_vma_collect_skip(start, end,
>  								walk);
>  			ret = split_folio(folio);
> -			folio_unlock(folio);
> +			if (fault_folio != folio)
> +				folio_unlock(folio);
>  			folio_put(folio);
>  			if (ret)
>  				return migrate_vma_collect_skip(start, end,
> @@ -192,7 +196,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>  		 * optimisation to avoid walking the rmap later with
>  		 * try_to_migrate().
>  		 */
> -		if (folio_trylock(folio)) {
> +		if (fault_folio == folio || folio_trylock(folio)) {
>  			bool anon_exclusive;
>  			pte_t swp_pte;
>  
> @@ -204,7 +208,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>  
>  				if (folio_try_share_anon_rmap_pte(folio, page)) {
>  					set_pte_at(mm, addr, ptep, pte);
> -					folio_unlock(folio);
> +					if (fault_folio != folio)
> +						folio_unlock(folio);
>  					folio_put(folio);
>  					mpfn = 0;
>  					goto next;
> @@ -363,6 +368,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
>  					  unsigned long npages,
>  					  struct page *fault_page)
>  {
> +	struct folio *fault_folio = fault_page ?
> +		page_folio(fault_page) : NULL;
>  	unsigned long i, restore = 0;
>  	bool allow_drain = true;
>  	unsigned long unmapped = 0;
> @@ -427,7 +434,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
>  		remove_migration_ptes(folio, folio, false);
>  
>  		src_pfns[i] = 0;
> -		folio_unlock(folio);
> +		if (fault_folio != folio)
> +			folio_unlock(folio);
>  		folio_put(folio);
>  		restore--;
>  	}
> @@ -536,6 +544,8 @@ int migrate_vma_setup(struct migrate_vma *args)
>  		return -EINVAL;
>  	if (args->fault_page && !is_device_private_page(args->fault_page))
>  		return -EINVAL;
> +	if (args->fault_page && !PageLocked(args->fault_page))
> +		return -EINVAL;
>  
>  	memset(args->src, 0, sizeof(*args->src) * nr_pages);
>  	args->cpages = 0;
> @@ -799,19 +809,13 @@ void migrate_vma_pages(struct migrate_vma *migrate)
>  }
>  EXPORT_SYMBOL(migrate_vma_pages);
>  
> -/*
> - * migrate_device_finalize() - complete page migration
> - * @src_pfns: src_pfns returned from migrate_device_range()
> - * @dst_pfns: array of pfns allocated by the driver to migrate memory to
> - * @npages: number of pages in the range
> - *
> - * Completes migration of the page by removing special migration entries.
> - * Drivers must ensure copying of page data is complete and visible to the CPU
> - * before calling this.
> - */
> -void migrate_device_finalize(unsigned long *src_pfns,
> -			unsigned long *dst_pfns, unsigned long npages)
> +static void __migrate_device_finalize(unsigned long *src_pfns,
> +				      unsigned long *dst_pfns,
> +				      unsigned long npages,
> +				      struct page *fault_page)
>  {
> +	struct folio *fault_folio = fault_page ?
> +		page_folio(fault_page) : NULL;
>  	unsigned long i;
>  
>  	for (i = 0; i < npages; i++) {
> @@ -838,7 +842,8 @@ void migrate_device_finalize(unsigned long *src_pfns,
>  		src = page_folio(page);
>  		dst = page_folio(newpage);
>  		remove_migration_ptes(src, dst, false);
> -		folio_unlock(src);
> +		if (fault_folio != src)
> +			folio_unlock(src);
>  
>  		if (is_zone_device_page(page))
>  			put_page(page);
> @@ -854,6 +859,22 @@ void migrate_device_finalize(unsigned long *src_pfns,
>  		}
>  	}
>  }
> +
> +/*
> + * migrate_device_finalize() - complete page migration
> + * @src_pfns: src_pfns returned from migrate_device_range()
> + * @dst_pfns: array of pfns allocated by the driver to migrate memory to
> + * @npages: number of pages in the range
> + *
> + * Completes migration of the page by removing special migration entries.
> + * Drivers must ensure copying of page data is complete and visible to the CPU
> + * before calling this.
> + */
> +void migrate_device_finalize(unsigned long *src_pfns,
> +			unsigned long *dst_pfns, unsigned long npages)
> +{
> +	return __migrate_device_finalize(src_pfns, dst_pfns, npages, NULL);
> +}
>  EXPORT_SYMBOL(migrate_device_finalize);
>  
>  /**
> @@ -869,7 +890,8 @@ EXPORT_SYMBOL(migrate_device_finalize);
>   */
>  void migrate_vma_finalize(struct migrate_vma *migrate)
>  {
> -	migrate_device_finalize(migrate->src, migrate->dst, migrate->npages);
> +	__migrate_device_finalize(migrate->src, migrate->dst, migrate->npages,
> +				  migrate->fault_page);
>  }
>  EXPORT_SYMBOL(migrate_vma_finalize);