On Wed, Feb 28, 2018 at 2:21 AM, Monk Liu <Monk.Liu at amd.com> wrote:
> 1)create a routine "handle_vram_lost" to do the vram
> recovery, and put it into amdgpu_device_reset/reset_sriov,
> this way no need of the extra paramter to hold the
> VRAM LOST information and the related macros can be removed.
>
> 3)show vram_recover failure if time out, and set TMO equal to
> lockup_timeout if vram_recover is under SRIOV runtime mode.
>
> 4)report error if any ip reset failed for SR-IOV
>
> Change-Id: I686e2b6133844c14948c206a2315c064a78c1d9c
> Signed-off-by: Monk Liu <Monk.Liu at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |   4 -
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 136 +++++++++++++++--------------
>  2 files changed, 72 insertions(+), 68 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 5bddfc1..abbc3f1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -181,10 +181,6 @@ extern int amdgpu_cik_support;
>  #define CIK_CURSOR_WIDTH 128
>  #define CIK_CURSOR_HEIGHT 128
>
> -/* GPU RESET flags */
> -#define AMDGPU_RESET_INFO_VRAM_LOST (1 << 0)
> -#define AMDGPU_RESET_INFO_FULLRESET (1 << 1)
> -
>  struct amdgpu_device;
>  struct amdgpu_ib;
>  struct amdgpu_cs_parser;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index e9d81a8..39ece7f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -1591,6 +1591,8 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
>
>                          r = block->version->funcs->hw_init(adev);
>                          DRM_INFO("RE-INIT: %s %s\n", block->version->funcs->name, r?"failed":"successed");
> +                        if (r)
> +                                return r;
>                  }
>          }
>
> @@ -1624,6 +1626,8 @@ static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
>
>                          r = block->version->funcs->hw_init(adev);
>                          DRM_INFO("RE-INIT: %s %s\n", block->version->funcs->name, r?"failed":"successed");
> +                        if (r)
> +                                return r;
>                  }
>          }
>
> @@ -2471,17 +2475,71 @@ static int amdgpu_device_recover_vram_from_shadow(struct amdgpu_device *adev,
>          return r;
>  }
>
> +static int amdgpu_handle_vram_lost(struct amdgpu_device *adev)

Please rename this function to amdgpu_device_handle_vram_lost() for
consistency with other functions in the file.
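E.g. something like this on top of your patch (untested, just to show the naming; the two callers you add in amdgpu_device_reset() and amdgpu_device_reset_sriov() would pick up the same rename):

-static int amdgpu_handle_vram_lost(struct amdgpu_device *adev)
+static int amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)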
Alex

> +{
> +        struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
> +        struct amdgpu_bo *bo, *tmp;
> +        struct dma_fence *fence = NULL, *next = NULL;
> +        long r = 1;
> +        int i = 0;
> +        long tmo;
> +
> +        if (amdgpu_sriov_runtime(adev))
> +                tmo = msecs_to_jiffies(amdgpu_lockup_timeout);
> +        else
> +                tmo = msecs_to_jiffies(100);
> +
> +        DRM_INFO("recover vram bo from shadow start\n");
> +        mutex_lock(&adev->shadow_list_lock);
> +        list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
> +                next = NULL;
> +                amdgpu_device_recover_vram_from_shadow(adev, ring, bo, &next);
> +                if (fence) {
> +                        r = dma_fence_wait_timeout(fence, false, tmo);
> +                        if (r == 0)
> +                                pr_err("wait fence %p[%d] timeout\n", fence, i);
> +                        else if (r < 0)
> +                                pr_err("wait fence %p[%d] interrupted\n", fence, i);
> +                        if (r < 1) {
> +                                dma_fence_put(fence);
> +                                fence = next;
> +                                break;
> +                        }
> +                        i++;
> +                }
> +
> +                dma_fence_put(fence);
> +                fence = next;
> +        }
> +        mutex_unlock(&adev->shadow_list_lock);
> +
> +        if (fence) {
> +                r = dma_fence_wait_timeout(fence, false, tmo);
> +                if (r == 0)
> +                        pr_err("wait fence %p[%d] timeout\n", fence, i);
> +                else if (r < 0)
> +                        pr_err("wait fence %p[%d] interrupted\n", fence, i);
> +
> +        }
> +        dma_fence_put(fence);
> +
> +        if (r > 0)
> +                DRM_INFO("recover vram bo from shadow done\n");
> +        else
> +                DRM_ERROR("recover vram bo from shadow failed\n");
> +
> +        return (r > 0?0:1);
> +}
> +
>  /*
>   * amdgpu_device_reset - reset ASIC/GPU for bare-metal or passthrough
>   *
>   * @adev: amdgpu device pointer
> - * @reset_flags: output param tells caller the reset result
>   *
>   * attempt to do soft-reset or full-reset and reinitialize Asic
>   * return 0 means successed otherwise failed
>   */
> -static int amdgpu_device_reset(struct amdgpu_device *adev,
> -                               uint64_t* reset_flags)
> +static int amdgpu_device_reset(struct amdgpu_device *adev)
>  {
>          bool need_full_reset, vram_lost = 0;
>          int r;
> @@ -2496,7 +2554,6 @@ static int amdgpu_device_reset(struct amdgpu_device *adev,
>                          DRM_INFO("soft reset failed, will fallback to full reset!\n");
>                          need_full_reset = true;
>                  }
> -
>          }
>
>          if (need_full_reset) {
> @@ -2545,13 +2602,8 @@ static int amdgpu_device_reset(struct amdgpu_device *adev,
>                  }
>          }
>
> -        if (reset_flags) {
> -                if (vram_lost)
> -                        (*reset_flags) |= AMDGPU_RESET_INFO_VRAM_LOST;
> -
> -                if (need_full_reset)
> -                        (*reset_flags) |= AMDGPU_RESET_INFO_FULLRESET;
> -        }
> +        if (!r && ((need_full_reset && !(adev->flags & AMD_IS_APU)) || vram_lost))
> +                r = amdgpu_handle_vram_lost(adev);
>
>          return r;
>  }
> @@ -2560,14 +2612,11 @@ static int amdgpu_device_reset(struct amdgpu_device *adev,
>   * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
>   *
>   * @adev: amdgpu device pointer
> - * @reset_flags: output param tells caller the reset result
>   *
>   * do VF FLR and reinitialize Asic
>   * return 0 means successed otherwise failed
>   */
> -static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
> -                                     uint64_t *reset_flags,
> -                                     bool from_hypervisor)
> +static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, bool from_hypervisor)
>  {
>          int r;
>
> @@ -2588,28 +2637,20 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
>
>          /* now we are okay to resume SMC/CP/SDMA */
>          r = amdgpu_device_ip_reinit_late_sriov(adev);
> +        amdgpu_virt_release_full_gpu(adev, true);
>          if (r)
>                  goto error;
>
>          amdgpu_irq_gpu_reset_resume_helper(adev);
>          r = amdgpu_ib_ring_tests(adev);
> -        if (r)
> -                dev_err(adev->dev, "[GPU_RESET] ib ring test failed (%d).\n", r);
>
> -error:
> -        /* release full control of GPU after ib test */
> -        amdgpu_virt_release_full_gpu(adev, true);
> -
> -        if (reset_flags) {
> -                if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
> -                        (*reset_flags) |= AMDGPU_RESET_INFO_VRAM_LOST;
> -                        atomic_inc(&adev->vram_lost_counter);
> -                }
> -
> -                /* VF FLR or hotlink reset is always full-reset */
> -                (*reset_flags) |= AMDGPU_RESET_INFO_FULLRESET;
> +        if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
> +                atomic_inc(&adev->vram_lost_counter);
> +                r = amdgpu_handle_vram_lost(adev);
>          }
>
> +error:
> +
>          return r;
>  }
>
> @@ -2673,42 +2714,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>          }
>
>          if (amdgpu_sriov_vf(adev))
> -                r = amdgpu_device_reset_sriov(adev, &reset_flags, job ? false : true);
> +                r = amdgpu_device_reset_sriov(adev, job ? false : true);
>          else
> -                r = amdgpu_device_reset(adev, &reset_flags);
> -
> -        if (!r) {
> -                if (((reset_flags & AMDGPU_RESET_INFO_FULLRESET) && !(adev->flags & AMD_IS_APU)) ||
> -                        (reset_flags & AMDGPU_RESET_INFO_VRAM_LOST)) {
> -                        struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
> -                        struct amdgpu_bo *bo, *tmp;
> -                        struct dma_fence *fence = NULL, *next = NULL;
> -
> -                        DRM_INFO("recover vram bo from shadow\n");
> -                        mutex_lock(&adev->shadow_list_lock);
> -                        list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
> -                                next = NULL;
> -                                amdgpu_device_recover_vram_from_shadow(adev, ring, bo, &next);
> -                                if (fence) {
> -                                        r = dma_fence_wait(fence, false);
> -                                        if (r) {
> -                                                WARN(r, "recovery from shadow isn't completed\n");
> -                                                break;
> -                                        }
> -                                }
> -
> -                                dma_fence_put(fence);
> -                                fence = next;
> -                        }
> -                        mutex_unlock(&adev->shadow_list_lock);
> -                        if (fence) {
> -                                r = dma_fence_wait(fence, false);
> -                                if (r)
> -                                        WARN(r, "recovery from shadow isn't completed\n");
> -                        }
> -                        dma_fence_put(fence);
> -                }
> -        }
> +                r = amdgpu_device_reset(adev);
>
>          for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>                  struct amdgpu_ring *ring = adev->rings[i];
> --
> 2.7.4
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx