A deadlock randomly occurs when testing TDR: 1. amdgpu_device_handle_vram_lost takes the shadow_list_lock 2. amdgpu_bo_create takes the bo's resv lock 3. amdgpu_bo_create_shadow waits for the shadow_list_lock 4. amdgpu_device_recover_vram_from_shadow waits for the bo's resv lock. v2: Make a local copy of the list, so that the recovery loop no longer runs with shadow_list_lock held. Signed-off-by: Emily Deng <Emily.Deng at amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 1 + 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index acfc63e..2b9f597 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3006,6 +3006,9 @@ static int amdgpu_device_handle_vram_lost(struct amdgpu_device *adev) long r = 1; int i = 0; long tmo; + struct list_head local_shadow_list; + + INIT_LIST_HEAD(&local_shadow_list); if (amdgpu_sriov_runtime(adev)) tmo = msecs_to_jiffies(8000); @@ -3013,8 +3016,15 @@ static int amdgpu_device_handle_vram_lost(struct amdgpu_device *adev) tmo = msecs_to_jiffies(100); DRM_INFO("recover vram bo from shadow start\n"); + mutex_lock(&adev->shadow_list_lock); list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) { + amdgpu_bo_ref(bo); + list_add_tail(&bo->copy_shadow_list, &local_shadow_list); + } + mutex_unlock(&adev->shadow_list_lock); + + list_for_each_entry_safe(bo, tmp, &local_shadow_list, copy_shadow_list) { next = NULL; amdgpu_device_recover_vram_from_shadow(adev, ring, bo, &next); if (fence) { @@ -3033,8 +3043,8 @@ static int amdgpu_device_handle_vram_lost(struct amdgpu_device *adev) dma_fence_put(fence); fence = next; + amdgpu_bo_unref(&bo); } - mutex_unlock(&adev->shadow_list_lock); if (fence) { r = dma_fence_wait_timeout(fence, false, tmo); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h index 907fdf4..cfee16c 
100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h @@ -103,6 +103,7 @@ struct amdgpu_bo { struct list_head mn_list; struct list_head shadow_list; }; + struct list_head copy_shadow_list; struct kgd_mem *kfd_bo; }; -- 2.7.4