On 02/28/2018 02:21 AM, Monk Liu wrote: > 1)create a routine "handle_vram_lost" to do the vram > recovery, and put it into amdgpu_device_reset/reset_sriov, > this way no need of the extra paramter to hold the > VRAM LOST information and the related macros can be removed. > > 3)show vram_recover failure if time out, and set TMO equal to > lockup_timeout if vram_recover is under SRIOV runtime mode. > > 4)report error if any ip reset failed for SR-IOV > > Change-Id: I686e2b6133844c14948c206a2315c064a78c1d9c > Signed-off-by: Monk Liu <Monk.Liu at amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 4 - > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 136 +++++++++++++++-------------- > 2 files changed, 72 insertions(+), 68 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index 5bddfc1..abbc3f1 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -181,10 +181,6 @@ extern int amdgpu_cik_support; > #define CIK_CURSOR_WIDTH 128 > #define CIK_CURSOR_HEIGHT 128 > > -/* GPU RESET flags */ > -#define AMDGPU_RESET_INFO_VRAM_LOST (1 << 0) > -#define AMDGPU_RESET_INFO_FULLRESET (1 << 1) > - > struct amdgpu_device; > struct amdgpu_ib; > struct amdgpu_cs_parser; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index e9d81a8..39ece7f 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -1591,6 +1591,8 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) > > r = block->version->funcs->hw_init(adev); > DRM_INFO("RE-INIT: %s %s\n", block->version->funcs->name, r?"failed":"successed"); > + if (r) > + return r; > } > } > > @@ -1624,6 +1626,8 @@ static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) > > r = block->version->funcs->hw_init(adev); > DRM_INFO("RE-INIT: %s %s\n", block->version->funcs->name, r?"failed":"successed"); > + if (r) > + return r; > } > } > > @@ -2471,17 +2475,71 @@ static int amdgpu_device_recover_vram_from_shadow(struct amdgpu_device *adev, > return r; > } > > +static int amdgpu_handle_vram_lost(struct amdgpu_device *adev) > +{ > + struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring; > + struct amdgpu_bo *bo, *tmp; > + struct dma_fence *fence = NULL, *next = NULL; > + long r = 1; > + int i = 0; > + long tmo; > + > + if (amdgpu_sriov_runtime(adev)) > + tmo = msecs_to_jiffies(amdgpu_lockup_timeout); > + else > + tmo = msecs_to_jiffies(100); > + > + DRM_INFO("recover vram bo from shadow start\n"); > + mutex_lock(&adev->shadow_list_lock); > + list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) { > + next = NULL; > + amdgpu_device_recover_vram_from_shadow(adev, ring, bo, &next); > + if (fence) { > + r = dma_fence_wait_timeout(fence, false, tmo); > + if (r == 0) > + pr_err("wait fence %p[%d] timeout\n", fence, i); > + else if (r < 0) > + pr_err("wait fence %p[%d] interrupted\n", fence, i); > + if (r < 1) { > + dma_fence_put(fence); > + fence = next; > + break; > + } > + i++; > + } > + > + dma_fence_put(fence); > + fence = next; > + } > + mutex_unlock(&adev->shadow_list_lock); > + > + if (fence) { > + r = dma_fence_wait_timeout(fence, false, tmo); > + if (r == 0) > + pr_err("wait fence %p[%d] timeout\n", fence, i); > + else if (r < 0) > + pr_err("wait fence %p[%d] interrupted\n", fence, i); > + > + } > + dma_fence_put(fence); > + > + if (r > 0) > + DRM_INFO("recover vram bo from shadow done\n"); > + else > + DRM_ERROR("recover vram bo from shadow failed\n"); > + > + return (r > 0?0:1); > +} > + > /* > * amdgpu_device_reset - reset ASIC/GPU for bare-metal or passthrough > * > * @adev: amdgpu device pointer > - * @reset_flags: output param tells caller the reset result > * > * attempt to do soft-reset or full-reset and reinitialize Asic > * return 0 means successed otherwise failed > */ > -static int amdgpu_device_reset(struct amdgpu_device *adev, > - uint64_t* reset_flags) > +static int amdgpu_device_reset(struct amdgpu_device *adev) > { > bool need_full_reset, vram_lost = 0; > int r; > @@ -2496,7 +2554,6 @@ static int amdgpu_device_reset(struct amdgpu_device *adev, > DRM_INFO("soft reset failed, will fallback to full reset!\n"); > need_full_reset = true; > } > - > } > > if (need_full_reset) { > @@ -2545,13 +2602,8 @@ static int amdgpu_device_reset(struct amdgpu_device *adev, > } > } > > - if (reset_flags) { > - if (vram_lost) > - (*reset_flags) |= AMDGPU_RESET_INFO_VRAM_LOST; > - > - if (need_full_reset) > - (*reset_flags) |= AMDGPU_RESET_INFO_FULLRESET; > - } > + if (!r && ((need_full_reset && !(adev->flags & AMD_IS_APU)) || vram_lost)) > + r = amdgpu_handle_vram_lost(adev); > > return r; > } > @@ -2560,14 +2612,11 @@ static int amdgpu_device_reset(struct amdgpu_device *adev, > * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf > * > * @adev: amdgpu device pointer > - * @reset_flags: output param tells caller the reset result > * > * do VF FLR and reinitialize Asic > * return 0 means successed otherwise failed > */ > -static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, > - uint64_t *reset_flags, > - bool from_hypervisor) > +static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, bool from_hypervisor) > { > int r; > > @@ -2588,28 +2637,20 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, > > /* now we are okay to resume SMC/CP/SDMA */ > r = amdgpu_device_ip_reinit_late_sriov(adev); > + amdgpu_virt_release_full_gpu(adev, true); > if (r) > goto error; > > amdgpu_irq_gpu_reset_resume_helper(adev); > r = amdgpu_ib_ring_tests(adev); > - if (r) > - dev_err(adev->dev, "[GPU_RESET] ib ring test failed (%d).\n", r); This 2 lines deletion seems unintentional > > -error: > - /* release full control of GPU after ib test */ > - amdgpu_virt_release_full_gpu(adev, true); > - > - if (reset_flags) { > - if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { > - (*reset_flags) |= AMDGPU_RESET_INFO_VRAM_LOST; > - atomic_inc(&adev->vram_lost_counter); > - } > - > - /* VF FLR or hotlink reset is always full-reset */ > - (*reset_flags) |= AMDGPU_RESET_INFO_FULLRESET; > + if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { > + atomic_inc(&adev->vram_lost_counter); > + r = amdgpu_handle_vram_lost(adev); > } > > +error: > + > return r; Not doing incrementation of VRAM lost counter in case of error anymore is intentional ? Andrey > } > > @@ -2673,42 +2714,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, > } > > if (amdgpu_sriov_vf(adev)) > - r = amdgpu_device_reset_sriov(adev, &reset_flags, job ? false : true); > + r = amdgpu_device_reset_sriov(adev, job ? false : true); > else > - r = amdgpu_device_reset(adev, &reset_flags); > - > - if (!r) { > - if (((reset_flags & AMDGPU_RESET_INFO_FULLRESET) && !(adev->flags & AMD_IS_APU)) || > - (reset_flags & AMDGPU_RESET_INFO_VRAM_LOST)) { > - struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring; > - struct amdgpu_bo *bo, *tmp; > - struct dma_fence *fence = NULL, *next = NULL; > - > - DRM_INFO("recover vram bo from shadow\n"); > - mutex_lock(&adev->shadow_list_lock); > - list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) { > - next = NULL; > - amdgpu_device_recover_vram_from_shadow(adev, ring, bo, &next); > - if (fence) { > - r = dma_fence_wait(fence, false); > - if (r) { > - WARN(r, "recovery from shadow isn't completed\n"); > - break; > - } > - } > - > - dma_fence_put(fence); > - fence = next; > - } > - mutex_unlock(&adev->shadow_list_lock); > - if (fence) { > - r = dma_fence_wait(fence, false); > - if (r) > - WARN(r, "recovery from shadow isn't completed\n"); > - } > - dma_fence_put(fence); > - } > - } > + r = amdgpu_device_reset(adev); > > for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { > struct amdgpu_ring *ring = adev->rings[i];