[AMD Official Use Only - General]

+static int nbio_v7_9_set_ras_err_event_athub_irq_state(struct amdgpu_device *adev,
+static int nbio_v7_9_set_ras_controller_irq_state(struct amdgpu_device *adev,

Both functions could be left as dummy ones, since by default vector #1 is selected in the bare-metal environment. Only the SRIOV PF driver needs to select vector #4.

Others look good to me. The patch is

Reviewed-by: Hawking Zhang <Hawking.Zhang@xxxxxxx>

Regards,
Hawking

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@xxxxxxx>
Sent: Monday, August 7, 2023 11:06
To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Yang, Stanley <Stanley.Yang@xxxxxxx>; Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Li, Candice <Candice.Li@xxxxxxx>; Chai, Thomas <YiPeng.Chai@xxxxxxx>
Cc: Zhou1, Tao <Tao.Zhou1@xxxxxxx>
Subject: [PATCH] drm/amdgpu: add RAS fatal error handler for NBIO v7.9

Register RAS fatal error interrupt and add handler.

Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c |   4 +
 drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c  | 219 ++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/nbio_v7_9.h  |   1 +
 3 files changed, 224 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 832fa646b38f..bef0f9264b4f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -35,6 +35,7 @@
 #include "amdgpu_xgmi.h"
 #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
 #include "nbio_v4_3.h"
+#include "nbio_v7_9.h"
 #include "atom.h"
 #include "amdgpu_reset.h"
 
@@ -2663,6 +2664,9 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 		 * check DF RAS */
 		adev->nbio.ras = &nbio_v4_3_ras;
 		break;
+	case IP_VERSION(7, 9, 0):
+		adev->nbio.ras = &nbio_v7_9_ras;
+		break;
 	default:
 		/* nbio ras is not available */
 		break;
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
index cd1a02d30420..cc2268b871e0 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
@@ -451,3 +451,222 @@ const struct amdgpu_nbio_funcs nbio_v7_9_funcs = {
 	.get_memory_partition_mode = nbio_v7_9_get_memory_partition_mode,
 	.init_registers = nbio_v7_9_init_registers,
 };
+
+static void nbio_v7_9_query_ras_error_count(struct amdgpu_device *adev,
+					void *ras_error_status)
+{
+	return;
+}
+
+static void nbio_v7_9_handle_ras_controller_intr_no_bifring(struct amdgpu_device *adev)
+{
+	uint32_t bif_doorbell_intr_cntl;
+	struct ras_manager *obj = amdgpu_ras_find_obj(adev, adev->nbio.ras_if);
+	struct ras_err_data err_data = {0, 0, 0, NULL};
+	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+	bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL);
+
+	if (REG_GET_FIELD(bif_doorbell_intr_cntl,
+		BIF_BX0_BIF_DOORBELL_INT_CNTL, RAS_CNTLR_INTERRUPT_STATUS)) {
+		/* driver has to clear the interrupt status when bif ring is disabled */
+		bif_doorbell_intr_cntl = REG_SET_FIELD(bif_doorbell_intr_cntl,
+						BIF_BX0_BIF_DOORBELL_INT_CNTL,
+						RAS_CNTLR_INTERRUPT_CLEAR, 1);
+		WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl);
+
+		if (!ras->disable_ras_err_cnt_harvest) {
+			/*
+			 * clear error status after ras_controller_intr
+			 * according to hw team and count ue number
+			 * for query
+			 */
+			nbio_v7_9_query_ras_error_count(adev, &err_data);
+
+			/* logging on error cnt and printing for awareness */
+			obj->err_data.ue_count += err_data.ue_count;
+			obj->err_data.ce_count += err_data.ce_count;
+
+			if (err_data.ce_count)
+				dev_info(adev->dev, "%ld correctable hardware "
+					"errors detected in %s block, "
+					"no user action is needed.\n",
+					obj->err_data.ce_count,
+					get_ras_block_str(adev->nbio.ras_if));
+
+			if (err_data.ue_count)
+				dev_info(adev->dev, "%ld uncorrectable hardware "
+					"errors detected in %s block\n",
+					obj->err_data.ue_count,
+					get_ras_block_str(adev->nbio.ras_if));
+		}
+
+		dev_info(adev->dev, "RAS controller interrupt triggered "
+			"by NBIF error\n");
+
+		/* ras_controller_int is dedicated for nbif ras error,
+		 * not the global interrupt for sync flood
+		 */
+		amdgpu_ras_reset_gpu(adev);
+	}
+}
+
+static void nbio_v7_9_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_device *adev)
+{
+	uint32_t bif_doorbell_intr_cntl;
+
+	bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL);
+
+	if (REG_GET_FIELD(bif_doorbell_intr_cntl,
+		BIF_BX0_BIF_DOORBELL_INT_CNTL, RAS_ATHUB_ERR_EVENT_INTERRUPT_STATUS)) {
+		/* driver has to clear the interrupt status when bif ring is disabled */
+		bif_doorbell_intr_cntl = REG_SET_FIELD(bif_doorbell_intr_cntl,
+						BIF_BX0_BIF_DOORBELL_INT_CNTL,
+						RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR, 1);
+
+		WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl);
+
+		amdgpu_ras_global_ras_isr(adev);
+	}
+}
+
+static int nbio_v7_9_set_ras_controller_irq_state(struct amdgpu_device *adev,
+						struct amdgpu_irq_src *src,
+						unsigned type,
+						enum amdgpu_interrupt_state state)
+{
+	/* The ras_controller_irq enablement should be done in psp bl when it
+	 * tries to enable ras feature. Driver only needs to set the correct interrupt
+	 * vector for bare-metal and sriov use cases respectively
+	 */
+	uint32_t bif_intr_cntl;
+
+	bif_intr_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_INTR_CNTL);
+
+	if (state == AMDGPU_IRQ_STATE_ENABLE) {
+		/* set interrupt vector select bit to 0 to select
+		 * vector 1 for bare metal case */
+		bif_intr_cntl = REG_SET_FIELD(bif_intr_cntl,
+					      BIF_BX0_BIF_INTR_CNTL,
+					      RAS_INTR_VEC_SEL, 0);
+
+		WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_INTR_CNTL, bif_intr_cntl);
+	}
+
+	return 0;
+}
+
+static int nbio_v7_9_process_ras_controller_irq(struct amdgpu_device *adev,
+					struct amdgpu_irq_src *source,
+					struct amdgpu_iv_entry *entry)
+{
+	/* By design, the ih cookie for ras_controller_irq should be written
+	 * to the bif ring instead of the general iv ring. However, due to a known
+	 * bif ring hw bug, it has to be disabled. There is no chance the process
+	 * function will be invoked, so just leave it as a dummy one.
+	 */
+	return 0;
+}
+
+static int nbio_v7_9_set_ras_err_event_athub_irq_state(struct amdgpu_device *adev,
+						struct amdgpu_irq_src *src,
+						unsigned type,
+						enum amdgpu_interrupt_state state)
+{
+	/* The ras_err_event_athub_irq enablement should be done in psp bl when it
+	 * tries to enable ras feature. Driver only needs to set the correct interrupt
+	 * vector for bare-metal and sriov use cases respectively
+	 */
+	uint32_t bif_intr_cntl;
+
+	bif_intr_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_INTR_CNTL);
+
+	if (state == AMDGPU_IRQ_STATE_ENABLE) {
+		/* set interrupt vector select bit to 0 to select
+		 * vector 1 for bare metal case */
+		bif_intr_cntl = REG_SET_FIELD(bif_intr_cntl,
+					      BIF_BX0_BIF_INTR_CNTL,
+					      RAS_INTR_VEC_SEL, 0);
+
+		WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_INTR_CNTL, bif_intr_cntl);
+	}
+
+	return 0;
+}
+
+static int nbio_v7_9_process_err_event_athub_irq(struct amdgpu_device *adev,
+					struct amdgpu_irq_src *source,
+					struct amdgpu_iv_entry *entry)
+{
+	/* By design, the ih cookie for err_event_athub_irq should be written
+	 * to the bif ring instead of the general iv ring. However, due to a known
+	 * bif ring hw bug, it has to be disabled. There is no chance the process
+	 * function will be invoked, so just leave it as a dummy one.
+	 */
+	return 0;
+}
+
+static const struct amdgpu_irq_src_funcs nbio_v7_9_ras_controller_irq_funcs = {
+	.set = nbio_v7_9_set_ras_controller_irq_state,
+	.process = nbio_v7_9_process_ras_controller_irq,
+};
+
+static const struct amdgpu_irq_src_funcs nbio_v7_9_ras_err_event_athub_irq_funcs = {
+	.set = nbio_v7_9_set_ras_err_event_athub_irq_state,
+	.process = nbio_v7_9_process_err_event_athub_irq,
+};
+
+static int nbio_v7_9_init_ras_controller_interrupt(struct amdgpu_device *adev)
+{
+	int r;
+
+	/* init the irq funcs */
+	adev->nbio.ras_controller_irq.funcs =
+		&nbio_v7_9_ras_controller_irq_funcs;
+	adev->nbio.ras_controller_irq.num_types = 1;
+
+	/* register ras controller interrupt */
+	r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_BIF,
+			      NBIF_7_4__SRCID__RAS_CONTROLLER_INTERRUPT,
+			      &adev->nbio.ras_controller_irq);
+
+	return r;
+}
+
+static int nbio_v7_9_init_ras_err_event_athub_interrupt(struct amdgpu_device *adev)
+{
+	int r;
+
+	/* init the irq funcs */
+	adev->nbio.ras_err_event_athub_irq.funcs =
+		&nbio_v7_9_ras_err_event_athub_irq_funcs;
+	adev->nbio.ras_err_event_athub_irq.num_types = 1;
+
+	/* register ras err event athub interrupt */
+	r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_BIF,
+			      NBIF_7_4__SRCID__ERREVENT_ATHUB_INTERRUPT,
+			      &adev->nbio.ras_err_event_athub_irq);
+
+	return r;
+}
+
+const struct amdgpu_ras_block_hw_ops nbio_v7_9_ras_hw_ops = {
+	.query_ras_error_count = nbio_v7_9_query_ras_error_count,
+};
+
+struct amdgpu_nbio_ras nbio_v7_9_ras = {
+	.ras_block = {
+		.ras_comm = {
+			.name = "pcie_bif",
+			.block = AMDGPU_RAS_BLOCK__PCIE_BIF,
+			.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
+		},
+		.hw_ops = &nbio_v7_9_ras_hw_ops,
+		.ras_late_init = amdgpu_nbio_ras_late_init,
+	},
+	.handle_ras_controller_intr_no_bifring = nbio_v7_9_handle_ras_controller_intr_no_bifring,
+	.handle_ras_err_event_athub_intr_no_bifring = nbio_v7_9_handle_ras_err_event_athub_intr_no_bifring,
+	.init_ras_controller_interrupt = nbio_v7_9_init_ras_controller_interrupt,
+	.init_ras_err_event_athub_interrupt = nbio_v7_9_init_ras_err_event_athub_interrupt,
+};
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.h b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.h
index 8e04eb484328..73709771950d 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.h
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.h
@@ -28,5 +28,6 @@
 extern const struct nbio_hdp_flush_reg nbio_v7_9_hdp_flush_reg;
 
 extern const struct amdgpu_nbio_funcs nbio_v7_9_funcs;
+extern struct amdgpu_nbio_ras nbio_v7_9_ras;
 
 #endif
-- 
2.35.1
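A note on the interrupt vector selection discussed in the review comments above: the patch only ever programs RAS_INTR_VEC_SEL to 0, which selects vector #1 and is all the bare-metal driver needs. The sketch below is purely illustrative of how a single .set callback could also cover the SRIOV PF case Hawking mentions; the is_sriov_pf() helper and the field value 1 assumed for vector #4 are hypothetical and are not part of this patch or of the amdgpu code.

/* Illustrative sketch only -- not part of the patch. */
static int example_set_ras_irq_vector(struct amdgpu_device *adev,
				      enum amdgpu_interrupt_state state)
{
	uint32_t bif_intr_cntl;

	if (state != AMDGPU_IRQ_STATE_ENABLE)
		return 0;

	bif_intr_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_INTR_CNTL);

	if (is_sriov_pf(adev))
		/* hypothetical PF-side path: assumed field encoding for vector #4 */
		bif_intr_cntl = REG_SET_FIELD(bif_intr_cntl,
					      BIF_BX0_BIF_INTR_CNTL,
					      RAS_INTR_VEC_SEL, 1);
	else
		/* bare metal: select vector #1, exactly as the patch does */
		bif_intr_cntl = REG_SET_FIELD(bif_intr_cntl,
					      BIF_BX0_BIF_INTR_CNTL,
					      RAS_INTR_VEC_SEL, 0);

	WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_INTR_CNTL, bif_intr_cntl);

	return 0;
}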