RE: [PATCH 6/6] drm/amdgpu: drop status reset for GCEA 9.4.3 and MMEA 1.8

"Yang, Stanley" <Stanley.Yang@xxxxxxx> · Wed, 18 Oct 2023 07:43:41 +0000

[AMD Official Use Only - General]

PMFW doesn't reset any CE/UE status or count in debug mode, so who takes responsibility for resetting them when in debug mode?

Regards,
Stanley
> -----Original Message-----
> From: Zhou1, Tao <Tao.Zhou1@xxxxxxx>
> Sent: Tuesday, October 17, 2023 8:46 PM
> To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Zhang, Hawking
> <Hawking.Zhang@xxxxxxx>; Yang, Stanley <Stanley.Yang@xxxxxxx>; Li,
> Candice <Candice.Li@xxxxxxx>; Chai, Thomas <YiPeng.Chai@xxxxxxx>;
> Lazar, Lijo <Lijo.Lazar@xxxxxxx>; Wang, Yang(Kevin)
> <KevinYang.Wang@xxxxxxx>
> Cc: Zhou1, Tao <Tao.Zhou1@xxxxxxx>
> Subject: [PATCH 6/6] drm/amdgpu: drop status reset for GCEA 9.4.3 and
> MMEA 1.8
>
> PMFW will be responsible for it.
>
> Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx>
> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 22 -------
> drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c | 86 -------------------------
>  2 files changed, 108 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> index a1c2c952d882..65da72735e52 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> @@ -3996,27 +3996,6 @@ static void
> gfx_v9_4_3_inst_reset_utc_err_status(struct amdgpu_device *adev,
>       WREG32_SOC15(GC, GET_INST(GC, xcc_id),
> regVML2_WALKER_MEM_ECC_STATUS, 0x3);  }
>
> -static void gfx_v9_4_3_inst_reset_ea_err_status(struct amdgpu_device
> *adev,
> -                                     int xcc_id)
> -{
> -     uint32_t i, j;
> -     uint32_t value;
> -
> -     mutex_lock(&adev->grbm_idx_mutex);
> -     for (i = 0; i < gfx_v9_4_3_ea_err_status_regs.se_num; i++) {
> -             for (j = 0; j < gfx_v9_4_3_ea_err_status_regs.instance; j++) {
> -                     gfx_v9_4_3_xcc_select_se_sh(adev, i, 0, j, xcc_id);
> -                     value = RREG32_SOC15(GC, GET_INST(GC, xcc_id),
> regGCEA_ERR_STATUS);
> -                     value = REG_SET_FIELD(value, GCEA_ERR_STATUS,
> -                                             CLEAR_ERROR_STATUS, 0x1);
> -                     WREG32_SOC15(GC, GET_INST(GC, xcc_id),
> regGCEA_ERR_STATUS, value);
> -             }
> -     }
> -     gfx_v9_4_3_xcc_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff,
> -                     xcc_id);
> -     mutex_unlock(&adev->grbm_idx_mutex);
> -}
> -
>  static void gfx_v9_4_3_inst_reset_sq_timeout_status(struct amdgpu_device
> *adev,
>                                       int xcc_id)
>  {
> @@ -4042,7 +4021,6 @@ static void
> gfx_v9_4_3_inst_reset_ras_err_status(struct amdgpu_device *adev,
>                                       void *ras_error_status, int xcc_id)  {
>       gfx_v9_4_3_inst_reset_utc_err_status(adev, xcc_id);
> -     gfx_v9_4_3_inst_reset_ea_err_status(adev, xcc_id);
>       gfx_v9_4_3_inst_reset_sq_timeout_status(adev, xcc_id);  }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
> b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
> index aa00483e7b37..616d75add087 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
> @@ -756,96 +756,10 @@ static void
> mmhub_v1_8_query_ras_error_status(struct amdgpu_device *adev)
>               mmhub_v1_8_inst_query_ras_err_status(adev, i);  }
>
> -static void mmhub_v1_8_inst_reset_ras_err_status(struct amdgpu_device
> *adev,
> -                                              uint32_t mmhub_inst)
> -{
> -     uint32_t mmea_cgtt_clk_cntl_addr_dist;
> -     uint32_t mmea_err_status_addr_dist;
> -     uint32_t reg_value;
> -     uint32_t i;
> -
> -     /* reset mmea ras err status */
> -     mmea_cgtt_clk_cntl_addr_dist = regMMEA1_CGTT_CLK_CTRL -
> regMMEA0_CGTT_CLK_CTRL;
> -     mmea_err_status_addr_dist = regMMEA1_ERR_STATUS -
> regMMEA0_ERR_STATUS;
> -     for (i = 0; i < ARRAY_SIZE(mmhub_v1_8_mmea_err_status_reg); i++) {
> -             /* force clk branch on for response path
> -              * set MMEA0_CGTT_CLK_CTRL.SOFT_OVERRIDE_RETURN = 1
> -              */
> -             reg_value = RREG32_SOC15_OFFSET(MMHUB, mmhub_inst,
> -                                             regMMEA0_CGTT_CLK_CTRL,
> -                                             i *
> mmea_cgtt_clk_cntl_addr_dist);
> -             reg_value = REG_SET_FIELD(reg_value,
> MMEA0_CGTT_CLK_CTRL,
> -                                       SOFT_OVERRIDE_RETURN, 1);
> -             WREG32_SOC15_OFFSET(MMHUB, mmhub_inst,
> -                                 regMMEA0_CGTT_CLK_CTRL,
> -                                 i * mmea_cgtt_clk_cntl_addr_dist,
> -                                 reg_value);
> -
> -             /* set MMEA0_ERR_STATUS.CLEAR_ERROR_STATUS = 1 */
> -             reg_value = RREG32_SOC15_OFFSET(MMHUB, mmhub_inst,
> -                                             regMMEA0_ERR_STATUS,
> -                                             i *
> mmea_err_status_addr_dist);
> -             reg_value = REG_SET_FIELD(reg_value, MMEA0_ERR_STATUS,
> -                                       CLEAR_ERROR_STATUS, 1);
> -             WREG32_SOC15_OFFSET(MMHUB, mmhub_inst,
> -                                 regMMEA0_ERR_STATUS,
> -                                 i * mmea_err_status_addr_dist,
> -                                 reg_value);
> -
> -             /* set MMEA0_CGTT_CLK_CTRL.SOFT_OVERRIDE_RETURN = 0
> */
> -             reg_value = RREG32_SOC15_OFFSET(MMHUB, mmhub_inst,
> -                                             regMMEA0_CGTT_CLK_CTRL,
> -                                             i *
> mmea_cgtt_clk_cntl_addr_dist);
> -             reg_value = REG_SET_FIELD(reg_value,
> MMEA0_CGTT_CLK_CTRL,
> -                                       SOFT_OVERRIDE_RETURN, 0);
> -             WREG32_SOC15_OFFSET(MMHUB, mmhub_inst,
> -                                 regMMEA0_CGTT_CLK_CTRL,
> -                                 i * mmea_cgtt_clk_cntl_addr_dist,
> -                                 reg_value);
> -     }
> -
> -     /* reset mm_cane ras err status
> -      * force clk branch on for response path
> -      * set MM_CANE_ICG_CTRL.SOFT_OVERRIDE_ATRET = 1
> -      */
> -     reg_value = RREG32_SOC15(MMHUB, mmhub_inst,
> regMM_CANE_ICG_CTRL);
> -     reg_value = REG_SET_FIELD(reg_value, MM_CANE_ICG_CTRL,
> -                               SOFT_OVERRIDE_ATRET, 1);
> -     WREG32_SOC15(MMHUB, mmhub_inst, regMM_CANE_ICG_CTRL,
> reg_value);
> -
> -     /* set MM_CANE_ERR_STATUS.CLEAR_ERROR_STATUS = 1 */
> -     reg_value = RREG32_SOC15(MMHUB, mmhub_inst,
> regMM_CANE_ERR_STATUS);
> -     reg_value = REG_SET_FIELD(reg_value, MM_CANE_ERR_STATUS,
> -                               CLEAR_ERROR_STATUS, 1);
> -     WREG32_SOC15(MMHUB, mmhub_inst, regMM_CANE_ERR_STATUS,
> reg_value);
> -
> -     /* set MM_CANE_ICG_CTRL.SOFT_OVERRIDE_ATRET = 0 */
> -     reg_value = RREG32_SOC15(MMHUB, mmhub_inst,
> regMM_CANE_ICG_CTRL);
> -     reg_value = REG_SET_FIELD(reg_value, MM_CANE_ICG_CTRL,
> -                               SOFT_OVERRIDE_ATRET, 0);
> -     WREG32_SOC15(MMHUB, mmhub_inst, regMM_CANE_ICG_CTRL,
> reg_value);
> -}
> -
> -static void mmhub_v1_8_reset_ras_error_status(struct amdgpu_device
> *adev) -{
> -     uint32_t inst_mask;
> -     uint32_t i;
> -
> -     if (!amdgpu_ras_is_supported(adev,
> AMDGPU_RAS_BLOCK__MMHUB)) {
> -             dev_warn(adev->dev, "MMHUB RAS is not supported\n");
> -             return;
> -     }
> -
> -     inst_mask = adev->aid_mask;
> -     for_each_inst(i, inst_mask)
> -             mmhub_v1_8_inst_reset_ras_err_status(adev, i);
> -}
> -
>  static const struct amdgpu_ras_block_hw_ops mmhub_v1_8_ras_hw_ops = {
>       .query_ras_error_count = mmhub_v1_8_query_ras_error_count,
>       .reset_ras_error_count = mmhub_v1_8_reset_ras_error_count,
>       .query_ras_error_status = mmhub_v1_8_query_ras_error_status,
> -     .reset_ras_error_status = mmhub_v1_8_reset_ras_error_status,
>  };
>
>  struct amdgpu_mmhub_ras mmhub_v1_8_ras = {
> --
> 2.35.1