ã??As I wrote in the other thread as well calling amdgpu_resume() without proper suspend will just mess up a whole bunch of internal structures. please name at least one, I'll check and see how to improve and like I said, this approach is correct and verified by hang test, if internal structures messed up I don't think the test will easy pass. hw_init() just call resume per engine. you can take a deep look into sriov_gpu_reset and judge later ________________________________ å??件人: Christian König <deathsimple at vodafone.de> å??é??æ?¶é?´: 2017å¹´2æ??8æ?¥ 18:49:57 æ?¶ä»¶äºº: Liu, Monk; amd-gfx at lists.freedesktop.org 主é¢?: Re: SPAM //ç?å¤?: [PATCH 09/20] drm/amdgpu:implement SRIOV gpu_reset + /* now we are okay to resume SMC/CP/SDMA */ + amdgpu_resume_late(adev); As I wrote in the other thread as well calling amdgpu_resume() without proper suspend will just mess up a whole bunch of internal structures. So a clear NAK on that approach. If you don't need the hw stop which amdgpu_suspend() does for SRIOV then please try to just use the hw_init() callback and not the resume() callback. Regards, Christian. Am 07.02.2017 um 07:26 schrieb Liu, Monk: patch 1-8 are some fixes for sriov gpu reset feature patch 9 -20 are for sriov gpu reset BR Monk ________________________________ å??件人: amd-gfx <amd-gfx-bounces at lists.freedesktop.org><mailto:amd-gfx-bounces at lists.freedesktop.org> 代表 Monk Liu <Monk.Liu at amd.com><mailto:Monk.Liu at amd.com> å??é??æ?¶é?´: 2017å¹´2æ??7æ?¥ 14:11:07 æ?¶ä»¶äºº: amd-gfx at lists.freedesktop.org<mailto:amd-gfx at lists.freedesktop.org> æ??é??: Liu, Monk 主é¢?: [PATCH 09/20] drm/amdgpu:implement SRIOV gpu_reset Signed-off-by: Monk Liu <Monk.Liu at amd.com><mailto:Monk.Liu at amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 158 ++++++++++++++++++++++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 1 + 2 files changed, 158 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index e926f84..2b404ca 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1604,6 +1604,53 @@ int amdgpu_suspend(struct amdgpu_device *adev) return 0; } +static int amdgpu_resume_early(struct amdgpu_device *adev) +{ + int i, r; + + for (i = 0; i < adev->num_ip_blocks; i++) { + if (!adev->ip_blocks[i].status.valid) + continue; + + if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) + r = adev->ip_blocks[i].version->funcs->resume(adev); + + if (r) { + DRM_ERROR("resume of IP block <%s> failed %d\n", + adev->ip_blocks[i].version->funcs->name, r); + return r; + } + } + + return 0; +} + +static int amdgpu_resume_late(struct amdgpu_device *adev) +{ + int i, r; + + for (i = 0; i < adev->num_ip_blocks; i++) { + if (!adev->ip_blocks[i].status.valid) + continue; + + if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ) + continue; + + r = adev->ip_blocks[i].version->funcs->resume(adev); + if (r) { + DRM_ERROR("resume of IP block <%s> failed %d\n", + adev->ip_blocks[i].version->funcs->name, r); + return r; + } + } + + return 0; +} + static int amdgpu_resume(struct amdgpu_device *adev) { int i, r; @@ -2343,6 +2390,115 @@ static int amdgpu_recover_vram_from_shadow(struct amdgpu_device *adev, } /** + * amdgpu_sriov_gpu_reset - reset the asic + * + * @adev: amdgpu device pointer + * @voluntary: if this reset is requested by guest. + * (true means by guest and false means by HYPERVISOR ) + * + * Attempt the reset the GPU if it has hung (all asics). + * for SRIOV case. + * Returns 0 for success or an error on failure. + */ +int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary) +{ + int i, r = 0; + int resched; + struct amdgpu_bo *bo, *tmp; + struct amdgpu_ring *ring; + struct fence *fence = NULL, *next = NULL; + + mutex_lock(&adev->virt.lock_reset); + atomic_inc(&adev->gpu_reset_counter); + + /* block TTM */ + resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); + + /* block scheduler */ + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { + ring = adev->rings[i]; + + if (!ring || !ring->sched.thread) + continue; + + kthread_park(ring->sched.thread); + amd_sched_hw_job_reset(&ring->sched); + } + + /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ + amdgpu_fence_driver_force_completion(adev); + + /* request to take full control of GPU before re-initialization */ + if (voluntary) + amdgpu_virt_reset_gpu(adev); + else + amdgpu_virt_request_full_gpu(adev, true); + + + /* Resume IP prior to SMC */ + amdgpu_resume_early(adev); + + /* we need recover gart prior to run SMC/CP/SDMA resume */ + amdgpu_ttm_recover_gart(adev); + + /* now we are okay to resume SMC/CP/SDMA */ + amdgpu_resume_late(adev); + + amdgpu_irq_gpu_reset_resume_helper(adev); + + if (amdgpu_ib_ring_tests(adev)) + dev_err(adev->dev, "[GPU_RESET] ib ring test failed (%d).\n", r); + + /* rellease full control of GPU after ib test */ + amdgpu_virt_release_full_gpu(adev, true); + + DRM_INFO("recover vram bo from shadow\n"); + + ring = adev->mman.buffer_funcs_ring; + mutex_lock(&adev->shadow_list_lock); + list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) { + amdgpu_recover_vram_from_shadow(adev, ring, bo, &next); + if (fence) { + r = fence_wait(fence, false); + if (r) { + WARN(r, "recovery from shadow isn't completed\n"); + break; + } + } + + fence_put(fence); + fence = next; + } + mutex_unlock(&adev->shadow_list_lock); + + if (fence) { + r = fence_wait(fence, false); + if (r) + WARN(r, "recovery from shadow isn't completed\n"); + } + fence_put(fence); + + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { + struct amdgpu_ring *ring = adev->rings[i]; + if (!ring || !ring->sched.thread) + continue; + + amd_sched_job_recovery(&ring->sched); + kthread_unpark(ring->sched.thread); + } + + drm_helper_resume_force_mode(adev->ddev); + ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched); + if (r) { + /* bad news, how to tell it to userspace ? */ + dev_info(adev->dev, "GPU reset failed\n"); + } + + mutex_unlock(&adev->virt.lock_reset); + return r; +} + +/** * amdgpu_gpu_reset - reset the asic * * @adev: amdgpu device pointer @@ -2358,7 +2514,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev) bool need_full_reset; if (amdgpu_sriov_vf(adev)) - return 0; + return amdgpu_sriov_gpu_reset(adev, true); if (!amdgpu_check_soft_reset(adev)) { DRM_INFO("No hardware hang detected. Did some blocks stall?\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h index 675e12c..73d24df 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h @@ -89,5 +89,6 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v); int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init); int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init); int amdgpu_virt_reset_gpu(struct amdgpu_device *adev); +int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary); #endif -- 2.7.4 _______________________________________________ amd-gfx mailing list amd-gfx at lists.freedesktop.org<mailto:amd-gfx at lists.freedesktop.org> https://lists.freedesktop.org/mailman/listinfo/amd-gfx _______________________________________________ amd-gfx mailing list amd-gfx at lists.freedesktop.org<mailto:amd-gfx at lists.freedesktop.org> https://lists.freedesktop.org/mailman/listinfo/amd-gfx -------------- next part -------------- An HTML attachment was scrubbed... URL: <https://lists.freedesktop.org/archives/amd-gfx/attachments/20170208/941e5b37/attachment-0001.html>