[AMD Official Use Only - General] Yes, a RAS capability check should be added before putting gmc.ecc_irq, thanks. Regards, Stanley > -----Original Message----- > From: Zhang, Hawking <Hawking.Zhang@xxxxxxx> > Sent: Wednesday, December 20, 2023 4:12 PM > To: Yang, Stanley <Stanley.Yang@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Subject: RE: [PATCH Review V2 1/1] drm/amdgpu: Fix ecc irq enable/disable > unpaired > > [AMD Official Use Only - General] > > + if (adev->gmc.ecc_irq.funcs) > + amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0); > + > > This doesn't match the amdgpu_irq_get call for gmc.ecc_irq, where the driver > checks the RAS capability to decide whether to enable the interrupt or not (see > amdgpu_umc_ras_late_init). We should do the same check for the amdgpu_irq_put call. > > Regards, > Hawking > > -----Original Message----- > From: Yang, Stanley <Stanley.Yang@xxxxxxx> > Sent: Tuesday, December 19, 2023 20:48 > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Zhang, Hawking > <Hawking.Zhang@xxxxxxx> > Cc: Yang, Stanley <Stanley.Yang@xxxxxxx> > Subject: [PATCH Review V2 1/1] drm/amdgpu: Fix ecc irq enable/disable > unpaired > > The ecc_irq is disabled during the GPU mode2 reset suspend process, but is not > re-enabled during the GPU mode2 reset resume process. > > Changed from V1: > only do sdma/gfx ras_late_init in aldebaran_mode2_restore_ip, > delete amdgpu_ras_late_resume function. 
> > Signed-off-by: Stanley.Yang <Stanley.Yang@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/aldebaran.c | 28 > +++++++++++++++++++++++++- drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c > | 3 +++ drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 4 ++++ > drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 3 +++ > 4 files changed, 37 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/aldebaran.c > b/drivers/gpu/drm/amd/amdgpu/aldebaran.c > index 02f4c6f9d4f6..b60a3c1bd0f2 100644 > --- a/drivers/gpu/drm/amd/amdgpu/aldebaran.c > +++ b/drivers/gpu/drm/amd/amdgpu/aldebaran.c > @@ -330,6 +330,7 @@ aldebaran_mode2_restore_hwcontext(struct > amdgpu_reset_control *reset_ctl, { > struct list_head *reset_device_list = reset_context->reset_device_list; > struct amdgpu_device *tmp_adev = NULL; > + struct amdgpu_ras *con; > int r; > > if (reset_device_list == NULL) > @@ -355,7 +356,32 @@ aldebaran_mode2_restore_hwcontext(struct > amdgpu_reset_control *reset_ctl, > */ > amdgpu_register_gpu_instance(tmp_adev); > > - /* Resume RAS */ > + /* Resume RAS, ecc_irq */ > + con = amdgpu_ras_get_context(tmp_adev); > + if (!amdgpu_sriov_vf(tmp_adev) && con) { > + if (tmp_adev->sdma.ras && > + amdgpu_ras_is_supported(tmp_adev, > AMDGPU_RAS_BLOCK__SDMA) && > + tmp_adev->sdma.ras->ras_block.ras_late_init) { > + r = tmp_adev->sdma.ras- > >ras_block.ras_late_init(tmp_adev, > + &tmp_adev->sdma.ras->ras_block.ras_comm); > + if (r) { > + dev_err(tmp_adev->dev, "SDMA failed to execute > ras_late_init! ret:%d\n", r); > + goto end; > + } > + } > + > + if (tmp_adev->gfx.ras && > + amdgpu_ras_is_supported(tmp_adev, > AMDGPU_RAS_BLOCK__GFX) && > + tmp_adev->gfx.ras->ras_block.ras_late_init) { > + r = tmp_adev->gfx.ras->ras_block.ras_late_init(tmp_adev, > + &tmp_adev->gfx.ras->ras_block.ras_comm); > + if (r) { > + dev_err(tmp_adev->dev, "GFX failed to execute > ras_late_init! 
ret:%d\n", r); > + goto end; > + } > + } > + } > + > amdgpu_ras_resume(tmp_adev); > > /* Update PSP FW topology after reset */ diff --git > a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c > b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c > index 09cbca596bb5..b93a0baeb2d3 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c > @@ -1043,6 +1043,9 @@ static int gmc_v10_0_hw_fini(void *handle) > > amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0); > > + if (adev->gmc.ecc_irq.funcs) > + amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0); > + > return 0; > } > > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c > b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c > index 416f3e4f0438..e633e60850b3 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c > @@ -941,6 +941,10 @@ static int gmc_v11_0_hw_fini(void *handle) > } > > amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0); > + > + if (adev->gmc.ecc_irq.funcs) > + amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0); > + > gmc_v11_0_gart_disable(adev); > > return 0; > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > index 205db28a9803..8ac4d5b7fb37 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > @@ -2388,6 +2388,9 @@ static int gmc_v9_0_hw_fini(void *handle) > > amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0); > > + if (adev->gmc.ecc_irq.funcs) > + amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0); > + > return 0; > } > > -- > 2.25.1 >