RE: [PATCH Review 1/1] drm/amdgpu: Fix ecc irq enable/disable unpaired

"Yang, Stanley" <Stanley.Yang@xxxxxxx> · Mon, 18 Dec 2023 09:30:10 +0000

[AMD Official Use Only - General]

For mode2 reset, only call SDMA/GFX suspend to disable SDMA/GFX ecc_irq, driver just need enable SDMA/GFX ecc_irq during resume process.
Think about below scenario on aqua vanjaram, user modprobe amdgpu with reset_method=3, driver will do GPU recovery if the SDMA uncorrectable error is triggered,
It's difficult to distinguish whether need resume gmc ecc_irq, nbio ras_controller_irq, nbio ras_err_event_athub_irq since driver do full gpu reset.

Regards,
Stanley
> -----Original Message-----
> From: Zhang, Hawking <Hawking.Zhang@xxxxxxx>
> Sent: Monday, December 18, 2023 3:03 PM
> To: Yang, Stanley <Stanley.Yang@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
> Cc: Yang, Stanley <Stanley.Yang@xxxxxxx>
> Subject: RE: [PATCH Review 1/1] drm/amdgpu: Fix ecc irq enable/disable
> unpaired
>
> [AMD Official Use Only - General]
>
> Can we put the irq resume in amdgpu_ras_resume?
>
> Regards,
> Hawking
>
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of
> Stanley.Yang
> Sent: Saturday, December 16, 2023 00:50
> To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx
> Cc: Yang, Stanley <Stanley.Yang@xxxxxxx>
> Subject: [PATCH Review 1/1] drm/amdgpu: Fix ecc irq enable/disable unpaired
>
> The ecc_irq is disabled while GPU mode2 reset suspending process, but not be
> enabled during GPU mode2 reset resume process.
>
> Signed-off-by: Stanley.Yang <Stanley.Yang@xxxxxxx>
> ---
>  drivers/gpu/drm/amd/amdgpu/aldebaran.c  |  6 ++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 37
> +++++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  1 +
> drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  3 ++
> drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c  |  4 +++
>  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  3 ++
>  6 files changed, 54 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> index 02f4c6f9d4f6..ba9238a93064 100644
> --- a/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> +++ b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> @@ -358,6 +358,12 @@ aldebaran_mode2_restore_hwcontext(struct
> amdgpu_reset_control *reset_ctl,
>                 /* Resume RAS */
>                 amdgpu_ras_resume(tmp_adev);
>
> +               r = amdgpu_ras_late_resume(tmp_adev);
> +               if (r) {
> +                       dev_err(tmp_adev->dev, "amdgpu_ras_late_resume
> failed %d\n", r);
> +                       goto end;
> +               }
> +
>                 /* Update PSP FW topology after reset */
>                 if (reset_context->hive &&
>                     tmp_adev->gmc.xgmi.num_physical_nodes > 1) diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 8a04fb6c7c1f..318e77c493f2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -3164,6 +3164,43 @@ int amdgpu_ras_late_init(struct amdgpu_device
> *adev)
>         return 0;
>  }
>
> +/* Handle mode 2 reset, resume ecc irq state */ int
> +amdgpu_ras_late_resume(struct amdgpu_device *adev) {
> +       struct amdgpu_ras_block_list *node, *tmp;
> +       struct amdgpu_ras_block_object *obj;
> +       int r;
> +
> +       /* Guest side doesn't need init ras feature */
> +       if (amdgpu_sriov_vf(adev))
> +               return 0;
> +
> +       list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
> +               if (!node->ras_obj) {
> +                       dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
> +                       continue;
> +               }
> +
> +               obj = node->ras_obj;
> +
> +               if (!(obj->ras_comm.block == AMDGPU_RAS_BLOCK__SDMA ||
> +                         obj->ras_comm.block == AMDGPU_RAS_BLOCK__GFX))
> +                       continue;
> +
> +               if (obj->ras_late_init) {
> +                       r = obj->ras_late_init(adev, &obj->ras_comm);
> +                       if (r) {
> +                               dev_err(adev->dev, "%s failed to execute ras_late_init!
> ret:%d\n",
> +                                       obj->ras_comm.name, r);
> +                               return r;
> +                       }
> +               } else
> +                       amdgpu_ras_block_late_init_default(adev, &obj->ras_comm);
> +       }
> +
> +       return 0;
> +}
> +
>  /* do some fini work before IP fini as dependence */  int
> amdgpu_ras_pre_fini(struct amdgpu_device *adev)  { diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 6a941eb8fb8f..5c1ffc5a6899 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -693,6 +693,7 @@ amdgpu_ras_error_to_ta(enum
> amdgpu_ras_error_type error) {
>  /* called in ip_init and ip_fini */
>  int amdgpu_ras_init(struct amdgpu_device *adev);  int
> amdgpu_ras_late_init(struct amdgpu_device *adev);
> +int amdgpu_ras_late_resume(struct amdgpu_device *adev);
>  int amdgpu_ras_fini(struct amdgpu_device *adev);  int
> amdgpu_ras_pre_fini(struct amdgpu_device *adev);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 09cbca596bb5..b93a0baeb2d3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -1043,6 +1043,9 @@ static int gmc_v10_0_hw_fini(void *handle)
>
>         amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
>
> +       if (adev->gmc.ecc_irq.funcs)
> +               amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
> +
>         return 0;
>  }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> index 416f3e4f0438..e633e60850b3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> @@ -941,6 +941,10 @@ static int gmc_v11_0_hw_fini(void *handle)
>         }
>
>         amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
> +
> +       if (adev->gmc.ecc_irq.funcs)
> +               amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
> +
>         gmc_v11_0_gart_disable(adev);
>
>         return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 205db28a9803..8ac4d5b7fb37 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -2388,6 +2388,9 @@ static int gmc_v9_0_hw_fini(void *handle)
>
>         amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
>
> +       if (adev->gmc.ecc_irq.funcs)
> +               amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
> +
>         return 0;
>  }
>
> --
> 2.25.1
>