Issue: In the scenario where svm_range_restore_pages is called, but svm->checkpoint_ts has not been set and the retry fault has not been drained, svm_range_unmap_from_cpu is triggered and calls svm_range_free. Meanwhile, svm_range_restore_pages continues execution and reaches svm_range_from_addr. This results in a "failed to find prange..." error, causing the page recovery to fail. How to fix: Move the timestamp check code under the protection of svm->lock. v2: Make sure all right locks are released before go out. v3: Directly goto out_unlock_svms, and return -EAGAIN. Signed-off-by: Emily Deng <Emily.Deng@xxxxxxx> --- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 34 ++++++++++++++++------------ 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index d04725583f19..42ee49d19ee9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -2917,10 +2917,13 @@ static bool svm_range_skip_recover(struct svm_range *prange) static void svm_range_count_fault(struct kfd_node *node, struct kfd_process *p, - int32_t gpuidx) + int32_t gpuidx, int r) { struct kfd_process_device *pdd; + if (r == -EAGAIN) + return; + /* fault is on different page of same range * or fault is skipped to recover later * or fault is on invalid virtual address @@ -3008,19 +3011,6 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, goto out; } - /* check if this page fault time stamp is before svms->checkpoint_ts */ - if (svms->checkpoint_ts[gpuidx] != 0) { - if (amdgpu_ih_ts_after_or_equal(ts, svms->checkpoint_ts[gpuidx])) { - pr_debug("draining retry fault, drop fault 0x%llx\n", addr); - r = 0; - goto out; - } else - /* ts is after svms->checkpoint_ts now, reset svms->checkpoint_ts - * to zero to avoid following ts wrap around give wrong comparing - */ - svms->checkpoint_ts[gpuidx] = 0; - } - if (!p->xnack_enabled) { pr_debug("XNACK not enabled for pasid 0x%x\n", pasid); r = -EFAULT; @@ -3040,6 +3030,20 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, mmap_read_lock(mm); retry_write_locked: mutex_lock(&svms->lock); + + /* check if this page fault time stamp is before svms->checkpoint_ts */ + if (svms->checkpoint_ts[gpuidx] != 0) { + if (amdgpu_ih_ts_after_or_equal(ts, svms->checkpoint_ts[gpuidx])) { + pr_debug("draining retry fault, drop fault 0x%llx\n", addr); + r = -EAGAIN; + goto out_unlock_svms; + } else + /* ts is after svms->checkpoint_ts now, reset svms->checkpoint_ts + * to zero to avoid following ts wrap around give wrong comparing + */ + svms->checkpoint_ts[gpuidx] = 0; + } + prange = svm_range_from_addr(svms, addr, NULL); if (!prange) { pr_debug("failed to find prange svms 0x%p address [0x%llx]\n", @@ -3165,7 +3169,7 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, mutex_unlock(&svms->lock); mmap_read_unlock(mm); - svm_range_count_fault(node, p, gpuidx); + svm_range_count_fault(node, p, gpuidx, r); mmput(mm); out: -- 2.34.1