Patch look good to me then. Acked-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com> Andrey On 02/28/2018 08:36 AM, Liu, Monk wrote: >> goto error; >> >> amdgpu_irq_gpu_reset_resume_helper(adev); >> r = amdgpu_ib_ring_tests(adev); >> - if (r) >> - dev_err(adev->dev, "[GPU_RESET] ib ring test failed (%d).\n", r); > This 2 lines deletion seems unintentional > > No, intentional, because if IB TEST failed the error will be reported by the fault IP > > >> + if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { >> + atomic_inc(&adev->vram_lost_counter); >> + r = amdgpu_handle_vram_lost(adev); >> } >> >> +error: >> + >> return r; > Not doing incrementation of VRAM lost counter in case of error anymore is intentional ? > > > [ML] yeah, no need to increase the counter because GPU reset is failed so everything is meaningless, driver cannot used anymore > > > > -----Original Message----- > From: Grodzovsky, Andrey > Sent: 2018å¹´2æ??28æ?¥ 21:29 > To: Liu, Monk <Monk.Liu at amd.com>; amd-gfx at lists.freedesktop.org > Subject: Re: [PATCH 2/4] drm/amdgpu: cleanups for vram lost handling > > > > On 02/28/2018 02:21 AM, Monk Liu wrote: >> 1)create a routine "handle_vram_lost" to do the vram recovery, and put >> it into amdgpu_device_reset/reset_sriov, this way no need of the extra >> paramter to hold the VRAM LOST information and the related macros can >> be removed. >> >> 3)show vram_recover failure if time out, and set TMO equal to >> lockup_timeout if vram_recover is under SRIOV runtime mode. >> >> 4)report error if any ip reset failed for SR-IOV >> >> Change-Id: I686e2b6133844c14948c206a2315c064a78c1d9c >> Signed-off-by: Monk Liu <Monk.Liu at amd.com> >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 4 - >> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 136 +++++++++++++++-------------- >> 2 files changed, 72 insertions(+), 68 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> index 5bddfc1..abbc3f1 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> @@ -181,10 +181,6 @@ extern int amdgpu_cik_support; >> #define CIK_CURSOR_WIDTH 128 >> #define CIK_CURSOR_HEIGHT 128 >> >> -/* GPU RESET flags */ >> -#define AMDGPU_RESET_INFO_VRAM_LOST (1 << 0) -#define >> AMDGPU_RESET_INFO_FULLRESET (1 << 1) >> - >> struct amdgpu_device; >> struct amdgpu_ib; >> struct amdgpu_cs_parser; >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> index e9d81a8..39ece7f 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> @@ -1591,6 +1591,8 @@ static int >> amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) >> >> r = block->version->funcs->hw_init(adev); >> DRM_INFO("RE-INIT: %s %s\n", block->version->funcs->name, >> r?"failed":"successed"); >> + if (r) >> + return r; >> } >> } >> >> @@ -1624,6 +1626,8 @@ static int >> amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) >> >> r = block->version->funcs->hw_init(adev); >> DRM_INFO("RE-INIT: %s %s\n", block->version->funcs->name, >> r?"failed":"successed"); >> + if (r) >> + return r; >> } >> } >> >> @@ -2471,17 +2475,71 @@ static int amdgpu_device_recover_vram_from_shadow(struct amdgpu_device *adev, >> return r; >> } >> >> +static int amdgpu_handle_vram_lost(struct amdgpu_device *adev) { >> + struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring; >> + struct amdgpu_bo *bo, *tmp; >> + struct dma_fence *fence = NULL, *next = NULL; >> + long r = 1; >> + int i = 0; >> + long tmo; >> + >> + if (amdgpu_sriov_runtime(adev)) >> + tmo = msecs_to_jiffies(amdgpu_lockup_timeout); >> + else >> + tmo = msecs_to_jiffies(100); >> + >> + DRM_INFO("recover vram bo from shadow start\n"); >> + mutex_lock(&adev->shadow_list_lock); >> + list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) { >> + next = NULL; >> + amdgpu_device_recover_vram_from_shadow(adev, ring, bo, &next); >> + if (fence) { >> + r = dma_fence_wait_timeout(fence, false, tmo); >> + if (r == 0) >> + pr_err("wait fence %p[%d] timeout\n", fence, i); >> + else if (r < 0) >> + pr_err("wait fence %p[%d] interrupted\n", fence, i); >> + if (r < 1) { >> + dma_fence_put(fence); >> + fence = next; >> + break; >> + } >> + i++; >> + } >> + >> + dma_fence_put(fence); >> + fence = next; >> + } >> + mutex_unlock(&adev->shadow_list_lock); >> + >> + if (fence) { >> + r = dma_fence_wait_timeout(fence, false, tmo); >> + if (r == 0) >> + pr_err("wait fence %p[%d] timeout\n", fence, i); >> + else if (r < 0) >> + pr_err("wait fence %p[%d] interrupted\n", fence, i); >> + >> + } >> + dma_fence_put(fence); >> + >> + if (r > 0) >> + DRM_INFO("recover vram bo from shadow done\n"); >> + else >> + DRM_ERROR("recover vram bo from shadow failed\n"); >> + >> + return (r > 0?0:1); >> +} >> + >> /* >> * amdgpu_device_reset - reset ASIC/GPU for bare-metal or passthrough >> * >> * @adev: amdgpu device pointer >> - * @reset_flags: output param tells caller the reset result >> * >> * attempt to do soft-reset or full-reset and reinitialize Asic >> * return 0 means successed otherwise failed >> */ >> -static int amdgpu_device_reset(struct amdgpu_device *adev, >> - uint64_t* reset_flags) >> +static int amdgpu_device_reset(struct amdgpu_device *adev) >> { >> bool need_full_reset, vram_lost = 0; >> int r; >> @@ -2496,7 +2554,6 @@ static int amdgpu_device_reset(struct amdgpu_device *adev, >> DRM_INFO("soft reset failed, will fallback to full reset!\n"); >> need_full_reset = true; >> } >> - >> } >> >> if (need_full_reset) { >> @@ -2545,13 +2602,8 @@ static int amdgpu_device_reset(struct amdgpu_device *adev, >> } >> } >> >> - if (reset_flags) { >> - if (vram_lost) >> - (*reset_flags) |= AMDGPU_RESET_INFO_VRAM_LOST; >> - >> - if (need_full_reset) >> - (*reset_flags) |= AMDGPU_RESET_INFO_FULLRESET; >> - } >> + if (!r && ((need_full_reset && !(adev->flags & AMD_IS_APU)) || vram_lost)) >> + r = amdgpu_handle_vram_lost(adev); >> >> return r; >> } >> @@ -2560,14 +2612,11 @@ static int amdgpu_device_reset(struct amdgpu_device *adev, >> * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf >> * >> * @adev: amdgpu device pointer >> - * @reset_flags: output param tells caller the reset result >> * >> * do VF FLR and reinitialize Asic >> * return 0 means successed otherwise failed >> */ >> -static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, >> - uint64_t *reset_flags, >> - bool from_hypervisor) >> +static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, bool >> +from_hypervisor) >> { >> int r; >> >> @@ -2588,28 +2637,20 @@ static int amdgpu_device_reset_sriov(struct >> amdgpu_device *adev, >> >> /* now we are okay to resume SMC/CP/SDMA */ >> r = amdgpu_device_ip_reinit_late_sriov(adev); >> + amdgpu_virt_release_full_gpu(adev, true); >> if (r) >> goto error; >> >> amdgpu_irq_gpu_reset_resume_helper(adev); >> r = amdgpu_ib_ring_tests(adev); >> - if (r) >> - dev_err(adev->dev, "[GPU_RESET] ib ring test failed (%d).\n", r); > This 2 lines deletion seems unintentional > >> >> -error: >> - /* release full control of GPU after ib test */ >> - amdgpu_virt_release_full_gpu(adev, true); >> - >> - if (reset_flags) { >> - if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { >> - (*reset_flags) |= AMDGPU_RESET_INFO_VRAM_LOST; >> - atomic_inc(&adev->vram_lost_counter); >> - } >> - >> - /* VF FLR or hotlink reset is always full-reset */ >> - (*reset_flags) |= AMDGPU_RESET_INFO_FULLRESET; >> + if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { >> + atomic_inc(&adev->vram_lost_counter); >> + r = amdgpu_handle_vram_lost(adev); >> } >> >> +error: >> + >> return r; > Not doing incrementation of VRAM lost counter in case of error anymore is intentional ? > > Andrey >> } >> >> @@ -2673,42 +2714,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, >> } >> >> if (amdgpu_sriov_vf(adev)) >> - r = amdgpu_device_reset_sriov(adev, &reset_flags, job ? false : true); >> + r = amdgpu_device_reset_sriov(adev, job ? false : true); >> else >> - r = amdgpu_device_reset(adev, &reset_flags); >> - >> - if (!r) { >> - if (((reset_flags & AMDGPU_RESET_INFO_FULLRESET) && !(adev->flags & AMD_IS_APU)) || >> - (reset_flags & AMDGPU_RESET_INFO_VRAM_LOST)) { >> - struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring; >> - struct amdgpu_bo *bo, *tmp; >> - struct dma_fence *fence = NULL, *next = NULL; >> - >> - DRM_INFO("recover vram bo from shadow\n"); >> - mutex_lock(&adev->shadow_list_lock); >> - list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) { >> - next = NULL; >> - amdgpu_device_recover_vram_from_shadow(adev, ring, bo, &next); >> - if (fence) { >> - r = dma_fence_wait(fence, false); >> - if (r) { >> - WARN(r, "recovery from shadow isn't completed\n"); >> - break; >> - } >> - } >> - >> - dma_fence_put(fence); >> - fence = next; >> - } >> - mutex_unlock(&adev->shadow_list_lock); >> - if (fence) { >> - r = dma_fence_wait(fence, false); >> - if (r) >> - WARN(r, "recovery from shadow isn't completed\n"); >> - } >> - dma_fence_put(fence); >> - } >> - } >> + r = amdgpu_device_reset(adev); >> >> for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { >> struct amdgpu_ring *ring = adev->rings[i];