Matthew Brost <matthew.brost@xxxxxxxxx> writes:

> On Wed, Sep 11, 2024 at 02:53:31PM +1000, Alistair Popple wrote:
>>
>> Matthew Brost <matthew.brost@xxxxxxxxx> writes:
>>
>> I haven't seen the same in the NVIDIA UVM driver (out-of-tree, I know)
>
> Still a driver.

Indeed, and I'm happy to answer any questions about our implementation.

>> but theoretically it seems like it should be possible. However we
>> serialize migrations of the same virtual address range to avoid these
>> kind of issues as they can happen the other way too (ie. multiple
>> threads trying to migrate to GPU).
>>
>> So I suspect what happens in UVM is that one thread wins and installs
>> the migration entry while the others fail to get the driver migration
>> lock and bail out sufficiently early in the fault path to avoid the
>> live-lock.
>>
>
> I had to try hard to show this, doubt an actual user could trigger this.
>
> I wrote a test which kicked 8 threads, each thread did a pthread join,
> and then tried to read the same page. This repeats in loop for like 512
> pages or something. I needed an exclusive lock in migrate_to_ram vfunc
> for it to livelock. Without an exclusive lock I think on average I saw
> about 32k retries (i.e. migrate_to_ram calls on the same page) before a
> thread won this race.
>
> From reading UVM, pretty sure if you tried hard enough you could trigger
> a livelock given it appears you take excluvise locks in migrate_to_ram.

Yes, I suspect you're correct here and that we just haven't tried hard
enough to trigger it. (There's a rough sketch of that sort of reproducer
at the end of this mail.)

>> > Cc: Philip Yang <Philip.Yang@xxxxxxx>
>> > Cc: Felix Kuehling <felix.kuehling@xxxxxxx>
>> > Cc: Christian König <christian.koenig@xxxxxxx>
>> > Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
>> > Suggessted-by: Simona Vetter <simona.vetter@xxxxxxxx>
>> > Signed-off-by: Matthew Brost <matthew.brost@xxxxxxxxx>
>> > ---
>> >  mm/memory.c         | 13 +++++++---
>> >  mm/migrate_device.c | 60 +++++++++++++++++++++++++++++++--------------
>> >  2 files changed, 50 insertions(+), 23 deletions(-)
>> >
>> > diff --git a/mm/memory.c b/mm/memory.c
>> > index 3c01d68065be..bbd97d16a96a 100644
>> > --- a/mm/memory.c
>> > +++ b/mm/memory.c
>> > @@ -4046,10 +4046,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>> >  			 * Get a page reference while we know the page can't be
>> >  			 * freed.
>> >  			 */
>> > -			get_page(vmf->page);
>> > -			pte_unmap_unlock(vmf->pte, vmf->ptl);
>> > -			ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
>> > -			put_page(vmf->page);
>> > +			if (trylock_page(vmf->page)) {
>> > +				get_page(vmf->page);
>> > +				pte_unmap_unlock(vmf->pte, vmf->ptl);
>>
>> This is all beginning to look a lot like migrate_vma_collect_pmd(). So
>> rather than do this and then have to pass all this context
>> (ie. fault_page) down to the migrate_vma_* functions could we instead
>> just do what migrate_vma_collect_pmd() does here? Ie. we already have
>> the PTL and the page lock so there's no reason we couldn't just setup
>> the migration entry prior to calling migrate_to_ram().
>>
>> Obviously calling migrate_vma_setup() would show the page as not
>> migrating, but drivers could easily just fill in the src_pfn info after
>> calling migrate_vma_setup().
>>
>> This would eliminate the whole fault_page ugliness.
>>
>
> This seems like it would work and agree it likely be cleaner. Let me
> play around with this and see what I come up with. Multi-tasking a bit
> so expect a bit of delay here.
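No rush at all. In case it helps when you get to it, here is a very
rough, untested sketch of what I think the driver side could look like
if do_swap_page() installed the migration entry itself while it still
holds the PTL and the page lock. Everything below apart from the core
API (migrate_vma_setup()/migrate_vma_pages()/migrate_vma_finalize(),
migrate_pfn() and the MIGRATE_PFN_* flags) is made up for illustration,
and the device allocation/copy step is elided:

static vm_fault_t my_devmem_migrate_to_ram(struct vm_fault *vmf)
{
	unsigned long src_pfn = 0, dst_pfn = 0;
	struct migrate_vma migrate = {
		.vma		= vmf->vma,
		.start		= vmf->address & PAGE_MASK,
		.end		= (vmf->address & PAGE_MASK) + PAGE_SIZE,
		.src		= &src_pfn,
		.dst		= &dst_pfn,
		.pgmap_owner	= my_pgmap_owner,	/* made up */
		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
	};

	if (migrate_vma_setup(&migrate))
		return VM_FAULT_SIGBUS;

	/*
	 * The faulting page already has a migration entry installed by
	 * do_swap_page(), so migrate_vma_setup() reports it as not
	 * migrating. The driver fills in the src pfn by hand (write
	 * permission and cpages accounting glossed over here).
	 */
	if (!(src_pfn & MIGRATE_PFN_MIGRATE))
		src_pfn = migrate_pfn(page_to_pfn(vmf->page)) |
			  MIGRATE_PFN_MIGRATE;

	/* allocate a system page, copy the data back, set dst_pfn, then: */

	migrate_vma_pages(&migrate);
	migrate_vma_finalize(&migrate);

	return 0;
}

The nice part is that the core migrate_vma_* code then never needs to
know which folio is the faulting one.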
>
> Thanks for the input,
> Matt
>
>> > +				ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
>> > +				put_page(vmf->page);
>> > +				unlock_page(vmf->page);
>> > +			} else {
>> > +				pte_unmap_unlock(vmf->pte, vmf->ptl);
>> > +			}
>> >  		} else if (is_hwpoison_entry(entry)) {
>> >  			ret = VM_FAULT_HWPOISON;
>> >  		} else if (is_pte_marker_entry(entry)) {
>> > diff --git a/mm/migrate_device.c b/mm/migrate_device.c
>> > index 6d66dc1c6ffa..049893a5a179 100644
>> > --- a/mm/migrate_device.c
>> > +++ b/mm/migrate_device.c
>> > @@ -60,6 +60,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>> >  				   struct mm_walk *walk)
>> >  {
>> >  	struct migrate_vma *migrate = walk->private;
>> > +	struct folio *fault_folio = migrate->fault_page ?
>> > +		page_folio(migrate->fault_page) : NULL;
>> >  	struct vm_area_struct *vma = walk->vma;
>> >  	struct mm_struct *mm = vma->vm_mm;
>> >  	unsigned long addr = start, unmapped = 0;
>> > @@ -88,11 +90,13 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>> >
>> >  			folio_get(folio);
>> >  			spin_unlock(ptl);
>> > -			if (unlikely(!folio_trylock(folio)))
>> > +			if (unlikely(fault_folio != folio &&
>> > +				     !folio_trylock(folio)))
>> >  				return migrate_vma_collect_skip(start, end,
>> >  								walk);
>> >  			ret = split_folio(folio);
>> > -			folio_unlock(folio);
>> > +			if (fault_folio != folio)
>> > +				folio_unlock(folio);
>> >  			folio_put(folio);
>> >  			if (ret)
>> >  				return migrate_vma_collect_skip(start, end,
>> > @@ -192,7 +196,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>> >  		 * optimisation to avoid walking the rmap later with
>> >  		 * try_to_migrate().
>> >  		 */
>> > -		if (folio_trylock(folio)) {
>> > +		if (fault_folio == folio || folio_trylock(folio)) {
>> >  			bool anon_exclusive;
>> >  			pte_t swp_pte;
>> >
>> > @@ -204,7 +208,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>> >
>> >  			if (folio_try_share_anon_rmap_pte(folio, page)) {
>> >  				set_pte_at(mm, addr, ptep, pte);
>> > -				folio_unlock(folio);
>> > +				if (fault_folio != folio)
>> > +					folio_unlock(folio);
>> >  				folio_put(folio);
>> >  				mpfn = 0;
>> >  				goto next;
>> > @@ -363,6 +368,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
>> >  					  unsigned long npages,
>> >  					  struct page *fault_page)
>> >  {
>> > +	struct folio *fault_folio = fault_page ?
>> > +		page_folio(fault_page) : NULL;
>> >  	unsigned long i, restore = 0;
>> >  	bool allow_drain = true;
>> >  	unsigned long unmapped = 0;
>> > @@ -427,7 +434,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
>> >  			remove_migration_ptes(folio, folio, false);
>> >
>> >  			src_pfns[i] = 0;
>> > -			folio_unlock(folio);
>> > +			if (fault_folio != folio)
>> > +				folio_unlock(folio);
>> >  			folio_put(folio);
>> >  			restore--;
>> >  		}
>> > @@ -536,6 +544,8 @@ int migrate_vma_setup(struct migrate_vma *args)
>> >  		return -EINVAL;
>> >  	if (args->fault_page && !is_device_private_page(args->fault_page))
>> >  		return -EINVAL;
>> > +	if (args->fault_page && !PageLocked(args->fault_page))
>> > +		return -EINVAL;
>> >
>> >  	memset(args->src, 0, sizeof(*args->src) * nr_pages);
>> >  	args->cpages = 0;
>> > @@ -799,19 +809,13 @@ void migrate_vma_pages(struct migrate_vma *migrate)
>> >  }
>> >  EXPORT_SYMBOL(migrate_vma_pages);
>> >
>> > -/*
>> > - * migrate_device_finalize() - complete page migration
>> > - * @src_pfns: src_pfns returned from migrate_device_range()
>> > - * @dst_pfns: array of pfns allocated by the driver to migrate memory to
>> > - * @npages: number of pages in the range
>> > - *
>> > - * Completes migration of the page by removing special migration entries.
>> > - * Drivers must ensure copying of page data is complete and visible to the CPU
>> > - * before calling this.
>> > - */
>> > -void migrate_device_finalize(unsigned long *src_pfns,
>> > -			unsigned long *dst_pfns, unsigned long npages)
>> > +static void __migrate_device_finalize(unsigned long *src_pfns,
>> > +				      unsigned long *dst_pfns,
>> > +				      unsigned long npages,
>> > +				      struct page *fault_page)
>> >  {
>> > +	struct folio *fault_folio = fault_page ?
>> > +		page_folio(fault_page) : NULL;
>> >  	unsigned long i;
>> >
>> >  	for (i = 0; i < npages; i++) {
>> > @@ -838,7 +842,8 @@ void migrate_device_finalize(unsigned long *src_pfns,
>> >  			src = page_folio(page);
>> >  			dst = page_folio(newpage);
>> >  			remove_migration_ptes(src, dst, false);
>> > -			folio_unlock(src);
>> > +			if (fault_folio != src)
>> > +				folio_unlock(src);
>> >
>> >  			if (is_zone_device_page(page))
>> >  				put_page(page);
>> > @@ -854,6 +859,22 @@ void migrate_device_finalize(unsigned long *src_pfns,
>> >  		}
>> >  	}
>> >  }
>> > +
>> > +/*
>> > + * migrate_device_finalize() - complete page migration
>> > + * @src_pfns: src_pfns returned from migrate_device_range()
>> > + * @dst_pfns: array of pfns allocated by the driver to migrate memory to
>> > + * @npages: number of pages in the range
>> > + *
>> > + * Completes migration of the page by removing special migration entries.
>> > + * Drivers must ensure copying of page data is complete and visible to the CPU
>> > + * before calling this.
>> > + */
>> > +void migrate_device_finalize(unsigned long *src_pfns,
>> > +			unsigned long *dst_pfns, unsigned long npages)
>> > +{
>> > +	return __migrate_device_finalize(src_pfns, dst_pfns, npages, NULL);
>> > +}
>> >  EXPORT_SYMBOL(migrate_device_finalize);
>> >
>> >  /**
>> > @@ -869,7 +890,8 @@ EXPORT_SYMBOL(migrate_device_finalize);
>> >   */
>> >  void migrate_vma_finalize(struct migrate_vma *migrate)
>> >  {
>> > -	migrate_device_finalize(migrate->src, migrate->dst, migrate->npages);
>> > +	__migrate_device_finalize(migrate->src, migrate->dst, migrate->npages,
>> > +				  migrate->fault_page);
>> >  }
>> >  EXPORT_SYMBOL(migrate_vma_finalize);
>>
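Going back to the livelock test you describe above: for when I get a
chance to try this against UVM, below is roughly how I read your
description (8 threads lined up so they all read the same page at once,
repeated over 512 pages). It assumes the range has already been migrated
to device memory by some driver-specific means beforehand (not shown),
so on its own it only exercises ordinary anonymous memory, and the
thread/page counts are of course just guesses at your setup:

/* gcc -O2 -pthread repro.c -o repro */
#include <pthread.h>
#include <sys/mman.h>
#include <unistd.h>

#define NR_THREADS	8
#define NR_PAGES	512

static pthread_barrier_t barrier;
static volatile char *page;		/* page all threads fault on */

static void *reader(void *arg)
{
	char c;

	(void)arg;
	/* Line the readers up so the CPU faults land together. */
	pthread_barrier_wait(&barrier);
	c = *page;			/* concurrent fault -> migrate_to_ram() */
	(void)c;
	return NULL;
}

int main(void)
{
	long page_size = sysconf(_SC_PAGESIZE);
	char *buf = mmap(NULL, NR_PAGES * page_size, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	pthread_t threads[NR_THREADS];
	int i, p;

	if (buf == MAP_FAILED)
		return 1;

	/* <- migrate buf to device memory here via the driver's API */

	pthread_barrier_init(&barrier, NULL, NR_THREADS);
	for (p = 0; p < NR_PAGES; p++) {
		page = buf + p * page_size;
		for (i = 0; i < NR_THREADS; i++)
			pthread_create(&threads[i], NULL, reader, NULL);
		for (i = 0; i < NR_THREADS; i++)
			pthread_join(threads[i], NULL);
	}
	return 0;
}

The barrier stands in for the pthread join synchronisation you mention;
the intent is just to maximise the number of threads hitting
do_swap_page() on the same device-private PTE at the same time.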