[AMD Official Use Only - General]

+static int nbio_v7_9_set_ras_err_event_athub_irq_state(struct amdgpu_device *adev,
+static int nbio_v7_9_set_ras_controller_irq_state(struct amdgpu_device *adev,

Both functions could be left as dummy ones, since by default vector #1 is selected in the bare-metal environment. Only the SRIOV PF driver needs to select vector #4.

Others look good to me. The patch is

Reviewed-by: Hawking Zhang <Hawking.Zhang@xxxxxxx>

Regards,
Hawking

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@xxxxxxx>
Sent: Monday, August 7, 2023 11:06
To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Yang, Stanley <Stanley.Yang@xxxxxxx>; Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Li, Candice <Candice.Li@xxxxxxx>; Chai, Thomas <YiPeng.Chai@xxxxxxx>
Cc: Zhou1, Tao <Tao.Zhou1@xxxxxxx>
Subject: [PATCH] drm/amdgpu: add RAS fatal error handler for NBIO v7.9

Register RAS fatal error interrupt and add handler.

Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c |   4 +
 drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c  | 219 ++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/nbio_v7_9.h  |   1 +
 3 files changed, 224 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 832fa646b38f..bef0f9264b4f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -35,6 +35,7 @@
 #include "amdgpu_xgmi.h"
 #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
 #include "nbio_v4_3.h"
+#include "nbio_v7_9.h"
 #include "atom.h"
 #include "amdgpu_reset.h"
 
@@ -2663,6 +2664,9 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 		 * check DF RAS */
 		adev->nbio.ras = &nbio_v4_3_ras;
 		break;
+	case IP_VERSION(7, 9, 0):
+		adev->nbio.ras = &nbio_v7_9_ras;
+		break;
 	default:
 		/* nbio ras is not available */
 		break;
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
index cd1a02d30420..cc2268b871e0 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
@@ -451,3 +451,222 @@ const struct amdgpu_nbio_funcs nbio_v7_9_funcs = {
 	.get_memory_partition_mode = nbio_v7_9_get_memory_partition_mode,
 	.init_registers = nbio_v7_9_init_registers,
 };
+
+static void nbio_v7_9_query_ras_error_count(struct amdgpu_device *adev,
+					void *ras_error_status)
+{
+	return;
+}
+
+static void nbio_v7_9_handle_ras_controller_intr_no_bifring(struct amdgpu_device *adev)
+{
+	uint32_t bif_doorbell_intr_cntl;
+	struct ras_manager *obj = amdgpu_ras_find_obj(adev, adev->nbio.ras_if);
+	struct ras_err_data err_data = {0, 0, 0, NULL};
+	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+	bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL);
+
+	if (REG_GET_FIELD(bif_doorbell_intr_cntl,
+		BIF_BX0_BIF_DOORBELL_INT_CNTL, RAS_CNTLR_INTERRUPT_STATUS)) {
+		/* driver has to clear the interrupt status when bif ring is disabled */
+		bif_doorbell_intr_cntl = REG_SET_FIELD(bif_doorbell_intr_cntl,
+						BIF_BX0_BIF_DOORBELL_INT_CNTL,
+						RAS_CNTLR_INTERRUPT_CLEAR, 1);
+		WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl);
+
+		if (!ras->disable_ras_err_cnt_harvest) {
+			/*
+			 * clear error status after ras_controller_intr
+			 * according to hw team and count ue number
+			 * for query
+			 */
+			nbio_v7_9_query_ras_error_count(adev, &err_data);
+
+			/* logging on error cnt and printing for awareness */
+			obj->err_data.ue_count += err_data.ue_count;
+			obj->err_data.ce_count += err_data.ce_count;
+
+			if (err_data.ce_count)
+				dev_info(adev->dev, "%ld correctable hardware "
+					"errors detected in %s block, "
+					"no user action is needed.\n",
+					obj->err_data.ce_count,
+					get_ras_block_str(adev->nbio.ras_if));
+
+			if (err_data.ue_count)
+				dev_info(adev->dev, "%ld uncorrectable hardware "
+					"errors detected in %s block\n",
+					obj->err_data.ue_count,
+					get_ras_block_str(adev->nbio.ras_if));
+		}
+
+		dev_info(adev->dev, "RAS controller interrupt triggered "
+			"by NBIF error\n");
+
+		/* ras_controller_int is dedicated for nbif ras error,
+		 * not the global interrupt for sync flood
+		 */
+		amdgpu_ras_reset_gpu(adev);
+	}
+}
+
+static void nbio_v7_9_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_device *adev)
+{
+	uint32_t bif_doorbell_intr_cntl;
+
+	bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL);
+
+	if (REG_GET_FIELD(bif_doorbell_intr_cntl,
+		BIF_BX0_BIF_DOORBELL_INT_CNTL, RAS_ATHUB_ERR_EVENT_INTERRUPT_STATUS)) {
+		/* driver has to clear the interrupt status when bif ring is disabled */
+		bif_doorbell_intr_cntl = REG_SET_FIELD(bif_doorbell_intr_cntl,
+						BIF_BX0_BIF_DOORBELL_INT_CNTL,
+						RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR, 1);
+
+		WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl);
+
+		amdgpu_ras_global_ras_isr(adev);
+	}
+}
+
+static int nbio_v7_9_set_ras_controller_irq_state(struct amdgpu_device *adev,
+						struct amdgpu_irq_src *src,
+						unsigned type,
+						enum amdgpu_interrupt_state state)
+{
+	/* The ras_controller_irq enablement should be done in psp bl when it
+	 * tries to enable ras feature. Driver only needs to set the correct interrupt
+	 * vector for bare-metal and sriov use cases respectively
+	 */
+	uint32_t bif_intr_cntl;
+
+	bif_intr_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_INTR_CNTL);
+
+	if (state == AMDGPU_IRQ_STATE_ENABLE) {
+		/* set interrupt vector select bit to 0 to select
+		 * vector 1 for bare metal case */
+		bif_intr_cntl = REG_SET_FIELD(bif_intr_cntl,
+					      BIF_BX0_BIF_INTR_CNTL,
+					      RAS_INTR_VEC_SEL, 0);
+
+		WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_INTR_CNTL, bif_intr_cntl);
+	}
+
+	return 0;
+}
+
+static int nbio_v7_9_process_ras_controller_irq(struct amdgpu_device *adev,
+					struct amdgpu_irq_src *source,
+					struct amdgpu_iv_entry *entry)
+{
+	/* By design, the ih cookie for ras_controller_irq should be written
+	 * to the bif ring instead of the general iv ring. However, due to a known
+	 * bif ring hw bug, it has to be disabled. There is no chance the process
+	 * function will be invoked, so just leave it as a dummy one.
+	 */
+	return 0;
+}
+
+static int nbio_v7_9_set_ras_err_event_athub_irq_state(struct amdgpu_device *adev,
+						struct amdgpu_irq_src *src,
+						unsigned type,
+						enum amdgpu_interrupt_state state)
+{
+	/* The ras_err_event_athub_irq enablement should be done in psp bl when it
+	 * tries to enable ras feature. Driver only needs to set the correct interrupt
+	 * vector for bare-metal and sriov use cases respectively
+	 */
+	uint32_t bif_intr_cntl;
+
+	bif_intr_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_INTR_CNTL);
+
+	if (state == AMDGPU_IRQ_STATE_ENABLE) {
+		/* set interrupt vector select bit to 0 to select
+		 * vector 1 for bare metal case */
+		bif_intr_cntl = REG_SET_FIELD(bif_intr_cntl,
+					      BIF_BX0_BIF_INTR_CNTL,
+					      RAS_INTR_VEC_SEL, 0);
+
+		WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_INTR_CNTL, bif_intr_cntl);
+	}
+
+	return 0;
+}
+
+static int nbio_v7_9_process_err_event_athub_irq(struct amdgpu_device *adev,
+					struct amdgpu_irq_src *source,
+					struct amdgpu_iv_entry *entry)
+{
+	/* By design, the ih cookie for err_event_athub_irq should be written
+	 * to the bif ring instead of the general iv ring. However, due to a known
+	 * bif ring hw bug, it has to be disabled. There is no chance the process
+	 * function will be invoked, so just leave it as a dummy one.
+	 */
+	return 0;
+}
+
+static const struct amdgpu_irq_src_funcs nbio_v7_9_ras_controller_irq_funcs = {
+	.set = nbio_v7_9_set_ras_controller_irq_state,
+	.process = nbio_v7_9_process_ras_controller_irq,
+};
+
+static const struct amdgpu_irq_src_funcs nbio_v7_9_ras_err_event_athub_irq_funcs = {
+	.set = nbio_v7_9_set_ras_err_event_athub_irq_state,
+	.process = nbio_v7_9_process_err_event_athub_irq,
+};
+
+static int nbio_v7_9_init_ras_controller_interrupt(struct amdgpu_device *adev)
+{
+	int r;
+
+	/* init the irq funcs */
+	adev->nbio.ras_controller_irq.funcs =
+		&nbio_v7_9_ras_controller_irq_funcs;
+	adev->nbio.ras_controller_irq.num_types = 1;
+
+	/* register ras controller interrupt */
+	r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_BIF,
+			      NBIF_7_4__SRCID__RAS_CONTROLLER_INTERRUPT,
+			      &adev->nbio.ras_controller_irq);
+
+	return r;
+}
+
+static int nbio_v7_9_init_ras_err_event_athub_interrupt(struct amdgpu_device *adev)
+{
+	int r;
+
+	/* init the irq funcs */
+	adev->nbio.ras_err_event_athub_irq.funcs =
+		&nbio_v7_9_ras_err_event_athub_irq_funcs;
+	adev->nbio.ras_err_event_athub_irq.num_types = 1;
+
+	/* register ras err event athub interrupt */
+	r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_BIF,
+			      NBIF_7_4__SRCID__ERREVENT_ATHUB_INTERRUPT,
+			      &adev->nbio.ras_err_event_athub_irq);
+
+	return r;
+}
+
+const struct amdgpu_ras_block_hw_ops nbio_v7_9_ras_hw_ops = {
+	.query_ras_error_count = nbio_v7_9_query_ras_error_count,
+};
+
+struct amdgpu_nbio_ras nbio_v7_9_ras = {
+	.ras_block = {
+		.ras_comm = {
+			.name = "pcie_bif",
+			.block = AMDGPU_RAS_BLOCK__PCIE_BIF,
+			.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
+		},
+		.hw_ops = &nbio_v7_9_ras_hw_ops,
+		.ras_late_init = amdgpu_nbio_ras_late_init,
+	},
+	.handle_ras_controller_intr_no_bifring = nbio_v7_9_handle_ras_controller_intr_no_bifring,
+	.handle_ras_err_event_athub_intr_no_bifring = nbio_v7_9_handle_ras_err_event_athub_intr_no_bifring,
+	.init_ras_controller_interrupt = nbio_v7_9_init_ras_controller_interrupt,
+	.init_ras_err_event_athub_interrupt = nbio_v7_9_init_ras_err_event_athub_interrupt,
+};
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.h b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.h
index 8e04eb484328..73709771950d 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.h
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.h
@@ -28,5 +28,6 @@
 extern const struct nbio_hdp_flush_reg nbio_v7_9_hdp_flush_reg;
 
 extern const struct amdgpu_nbio_funcs nbio_v7_9_funcs;
+extern struct amdgpu_nbio_ras nbio_v7_9_ras;
 
 #endif
-- 
2.35.1
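A note on the interrupt vector selection discussed in the review comments above: the patch only ever programs RAS_INTR_VEC_SEL to 0, which selects vector #1 and is all the bare-metal driver needs. The sketch below is purely illustrative of how a single .set callback could also cover the SRIOV PF case Hawking mentions; the is_sriov_pf() helper and the field value 1 assumed for vector #4 are hypothetical and are not part of this patch or of the amdgpu code.

/* Illustrative sketch only -- not part of the patch. */
static int example_set_ras_irq_vector(struct amdgpu_device *adev,
				      enum amdgpu_interrupt_state state)
{
	uint32_t bif_intr_cntl;

	if (state != AMDGPU_IRQ_STATE_ENABLE)
		return 0;

	bif_intr_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_INTR_CNTL);

	if (is_sriov_pf(adev))
		/* hypothetical PF-side path: assumed field encoding for vector #4 */
		bif_intr_cntl = REG_SET_FIELD(bif_intr_cntl,
					      BIF_BX0_BIF_INTR_CNTL,
					      RAS_INTR_VEC_SEL, 1);
	else
		/* bare metal: select vector #1, exactly as the patch does */
		bif_intr_cntl = REG_SET_FIELD(bif_intr_cntl,
					      BIF_BX0_BIF_INTR_CNTL,
					      RAS_INTR_VEC_SEL, 0);

	WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_INTR_CNTL, bif_intr_cntl);

	return 0;
}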