On 2019-10-29 3:25 p.m., Jason Gunthorpe wrote: > On Tue, Oct 29, 2019 at 07:22:37PM +0000, Yang, Philip wrote: >> Hi Jason, >> >> I did a quick test after merging amd-staging-drm-next with the >> mmu_notifier branch, which includes this set of changes. The test results >> show different failures: app stuck intermittently, GUI not displaying, etc. I >> am studying the changes and will try to figure out the cause. > > Thanks! I'm not surprised by this given how difficult this patch was > to make. Let me know if I can assist in any way. > > Please make sure to run with lockdep enabled. Your symptoms sound sort > of like deadlocking? > Hi Jason, The attached patch fixes several issues in the amdgpu driver; maybe you can squash it into patch 14. With this done, patches 12, 13 and 14 are Reviewed-by and Tested-by: Philip Yang <philip.yang@xxxxxxx> Regards, Philip > Regards, > Jason >
From 5a0bd4d8cef8472fe2904550142d288feed8cd81 Mon Sep 17 00:00:00 2001 From: Philip Yang <Philip.Yang@xxxxxxx> Date: Thu, 31 Oct 2019 09:10:30 -0400 Subject: [PATCH] drm/amdgpu: issues with new mmu_range_notifier api Put mmu_range_set_seq under the same lock that is held when calling mmu_range_read_retry. Fix the amdgpu_ttm_tt_get_user_pages_done return value, because mmu_range_read_retry means !hmm_range_valid. Retry if hmm_range_fault returns -EBUSY. Fix a false WARN for a missing get_user_page_done: we should check all pages, not just the first page. I don't understand why this issue is triggered by this change. Signed-off-by: Philip Yang <Philip.Yang@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 32 +++++++-------------- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 37 +++++++++++++++++-------- 2 files changed, 36 insertions(+), 33 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c index cb718a064eb4..c8bbd06f1009 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c @@ -67,21 +67,15 @@ static bool amdgpu_mn_invalidate_gfx(struct mmu_range_notifier *mrn, struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); long r; - /* - * FIXME: Must hold some lock shared with - * amdgpu_ttm_tt_get_user_pages_done() - */ - mmu_range_set_seq(mrn, cur_seq); + mutex_lock(&adev->notifier_lock); - /* FIXME: Is this necessary? 
*/ - if (!amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm, range->start, - range->end)) - return true; + mmu_range_set_seq(mrn, cur_seq); - if (!mmu_notifier_range_blockable(range)) + if (!mmu_notifier_range_blockable(range)) { + mutex_unlock(&adev->notifier_lock); return false; + } - mutex_lock(&adev->notifier_lock); r = dma_resv_wait_timeout_rcu(bo->tbo.base.resv, true, false, MAX_SCHEDULE_TIMEOUT); mutex_unlock(&adev->notifier_lock); @@ -110,21 +104,15 @@ static bool amdgpu_mn_invalidate_hsa(struct mmu_range_notifier *mrn, struct amdgpu_bo *bo = container_of(mrn, struct amdgpu_bo, notifier); struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); - /* - * FIXME: Must hold some lock shared with - * amdgpu_ttm_tt_get_user_pages_done() - */ - mmu_range_set_seq(mrn, cur_seq); + mutex_lock(&adev->notifier_lock); - /* FIXME: Is this necessary? */ - if (!amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm, range->start, - range->end)) - return true; + mmu_range_set_seq(mrn, cur_seq); - if (!mmu_notifier_range_blockable(range)) + if (!mmu_notifier_range_blockable(range)) { + mutex_unlock(&adev->notifier_lock); return false; + } - mutex_lock(&adev->notifier_lock); amdgpu_amdkfd_evict_userptr(bo->kfd_bo, bo->notifier.mm); mutex_unlock(&adev->notifier_lock); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index a38437fd290a..56fde43d5efa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -799,10 +799,11 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages) { struct ttm_tt *ttm = bo->tbo.ttm; struct amdgpu_ttm_tt *gtt = (void *)ttm; - struct mm_struct *mm; - struct hmm_range *range; unsigned long start = gtt->userptr; struct vm_area_struct *vma; + struct hmm_range *range; + unsigned long timeout; + struct mm_struct *mm; unsigned long i; int r = 0; @@ -841,8 +842,6 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages) goto out_free_ranges; } - 
range->notifier_seq = mmu_range_read_begin(&bo->notifier); - down_read(&mm->mmap_sem); vma = find_vma(mm, start); if (unlikely(!vma || start < vma->vm_start)) { @@ -854,12 +853,20 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages) r = -EPERM; goto out_unlock; } + up_read(&mm->mmap_sem); + timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); + +retry: + range->notifier_seq = mmu_range_read_begin(&bo->notifier); + down_read(&mm->mmap_sem); r = hmm_range_fault(range, 0); up_read(&mm->mmap_sem); - - if (unlikely(r < 0)) + if (unlikely(r <= 0)) { + if ((r == 0 || r == -EBUSY) && !time_after(jiffies, timeout)) + goto retry; goto out_free_pfns; + } for (i = 0; i < ttm->num_pages; i++) { pages[i] = hmm_device_entry_to_page(range, range->pfns[i]); @@ -916,7 +923,7 @@ bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm) gtt->range = NULL; } - return r; + return !r; } #endif @@ -997,10 +1004,18 @@ static void amdgpu_ttm_tt_unpin_userptr(struct ttm_tt *ttm) sg_free_table(ttm->sg); #if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR) - if (gtt->range && - ttm->pages[0] == hmm_device_entry_to_page(gtt->range, - gtt->range->pfns[0])) - WARN_ONCE(1, "Missing get_user_page_done\n"); + if (gtt->range) { + unsigned long i; + + for (i = 0; i < ttm->num_pages; i++) { + if (ttm->pages[i] != + hmm_device_entry_to_page(gtt->range, + gtt->range->pfns[i])) + break; + } + + WARN((i == ttm->num_pages), "Missing get_user_page_done\n"); + } #endif } -- 2.17.1