GPU will stop working once fatal error is detected. it will inform driver to do reset to recover from the fatal error. Signed-off-by: Hawking Zhang <Hawking.Zhang@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 11 ++++ drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c | 79 +++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/nbio_v4_3.h | 1 + drivers/gpu/drm/amd/amdgpu/soc21.c | 15 ++++- 4 files changed, 105 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index c6dc3cd2a9de..5b1779021881 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -34,6 +34,7 @@ #include "amdgpu_atomfirmware.h" #include "amdgpu_xgmi.h" #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" +#include "nbio_v4_3.h" #include "atom.h" #include "amdgpu_reset.h" @@ -2562,6 +2563,16 @@ int amdgpu_ras_init(struct amdgpu_device *adev) if (!adev->gmc.xgmi.connected_to_cpu) adev->nbio.ras = &nbio_v7_4_ras; break; + case IP_VERSION(4, 3, 0): + if (adev->ras_hw_enabled | AMDGPU_RAS_BLOCK__DF) + /* unlike other generation of nbio ras, + * nbio v4_3 only support fatal error interrupt + * to inform software that DF is freezed due to + * system fatal error event. driver should not + * enable nbio ras in such case. Instead, + * check DF RAS */ + adev->nbio.ras = &nbio_v4_3_ras; + break; default: /* nbio ras is not available */ break; diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c index 09fdcd20cb91..d5ed9e0e1a5f 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c @@ -26,6 +26,7 @@ #include "nbio/nbio_4_3_0_offset.h" #include "nbio/nbio_4_3_0_sh_mask.h" +#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" #include <uapi/linux/kfd_ioctl.h> static void nbio_v4_3_remap_hdp_registers(struct amdgpu_device *adev) @@ -538,3 +539,81 @@ const struct amdgpu_nbio_funcs nbio_v4_3_sriov_funcs = { .remap_hdp_registers = nbio_v4_3_remap_hdp_registers, .get_rom_offset = nbio_v4_3_get_rom_offset, }; + +static int nbio_v4_3_set_ras_err_event_athub_irq_state(struct amdgpu_device *adev, + struct amdgpu_irq_src *src, + unsigned type, + enum amdgpu_interrupt_state state) +{ + /* The ras_controller_irq enablement should be done in psp bl when it + * tries to enable ras feature. Driver only need to set the correct interrupt + * vector for bare-metal and sriov use case respectively + */ + uint32_t bif_doorbell_int_cntl; + + bif_doorbell_int_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL); + bif_doorbell_int_cntl = REG_SET_FIELD(bif_doorbell_int_cntl, + BIF_BX0_BIF_DOORBELL_INT_CNTL, + RAS_ATHUB_ERR_EVENT_INTERRUPT_DISABLE, + (state == AMDGPU_IRQ_STATE_ENABLE) ? 0 : 1); + WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, bif_doorbell_int_cntl); + + return 0; +} + +static int nbio_v4_3_process_err_event_athub_irq(struct amdgpu_device *adev, + struct amdgpu_irq_src *source, + struct amdgpu_iv_entry *entry) +{ + /* By design, the ih cookie for err_event_athub_irq should be written + * to bif ring. since bif ring is not enabled, just leave process callback + * as a dummy one. + */ + return 0; +} + +static const struct amdgpu_irq_src_funcs nbio_v4_3_ras_err_event_athub_irq_funcs = { + .set = nbio_v4_3_set_ras_err_event_athub_irq_state, + .process = nbio_v4_3_process_err_event_athub_irq, +}; + +static void nbio_v4_3_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_device *adev) +{ + uint32_t bif_doorbell_int_cntl; + + bif_doorbell_int_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL); + if (REG_GET_FIELD(bif_doorbell_int_cntl, + BIF_DOORBELL_INT_CNTL, + RAS_ATHUB_ERR_EVENT_INTERRUPT_STATUS)) { + /* driver has to clear the interrupt status when bif ring is disabled */ + bif_doorbell_int_cntl = REG_SET_FIELD(bif_doorbell_int_cntl, + BIF_DOORBELL_INT_CNTL, + RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR, 1); + WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, bif_doorbell_int_cntl); + amdgpu_ras_global_ras_isr(adev); + } +} + +static int nbio_v4_3_init_ras_err_event_athub_interrupt(struct amdgpu_device *adev) +{ + + int r; + + /* init the irq funcs */ + adev->nbio.ras_err_event_athub_irq.funcs = + &nbio_v4_3_ras_err_event_athub_irq_funcs; + adev->nbio.ras_err_event_athub_irq.num_types = 1; + + /* register ras err event athub interrupt + * nbio v4_3 uses the same irq source as nbio v7_4 */ + r = amdgpu_irq_add_id(adev, SOC21_IH_CLIENTID_BIF, + NBIF_7_4__SRCID__ERREVENT_ATHUB_INTERRUPT, + &adev->nbio.ras_err_event_athub_irq); + + return r; +} + +struct amdgpu_nbio_ras nbio_v4_3_ras = { + .handle_ras_err_event_athub_intr_no_bifring = nbio_v4_3_handle_ras_err_event_athub_intr_no_bifring, + .init_ras_err_event_athub_interrupt = nbio_v4_3_init_ras_err_event_athub_interrupt, +}; diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.h b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.h index 711999ceedf4..399037cdf4fb 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.h +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.h @@ -29,5 +29,6 @@ extern const struct nbio_hdp_flush_reg nbio_v4_3_hdp_flush_reg; extern const struct amdgpu_nbio_funcs nbio_v4_3_funcs; extern const struct amdgpu_nbio_funcs nbio_v4_3_sriov_funcs; +extern struct amdgpu_nbio_ras nbio_v4_3_ras; #endif diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c index 67580761b44d..514bfc705d5a 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc21.c +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c @@ -754,6 +754,14 @@ static int soc21_common_late_init(void *handle) sriov_vcn_4_0_0_video_codecs_decode_array_vcn0, ARRAY_SIZE(sriov_vcn_4_0_0_video_codecs_decode_array_vcn0)); } + } else { + if (adev->nbio.ras && + adev->nbio.ras_err_event_athub_irq.funcs) + /* don't need to fail gpu late init + * if enabling athub_err_event interrupt failed + * nbio v4_3 only support fatal error hanlding + * just enable the interrupt directly */ + amdgpu_irq_get(adev, &adev->nbio.ras_err_event_athub_irq, 0); } return 0; @@ -801,8 +809,13 @@ static int soc21_common_hw_fini(void *handle) /* disable the doorbell aperture */ soc21_enable_doorbell_aperture(adev, false); - if (amdgpu_sriov_vf(adev)) + if (amdgpu_sriov_vf(adev)) { xgpu_nv_mailbox_put_irq(adev); + } else { + if (adev->nbio.ras && + adev->nbio.ras_err_event_athub_irq.funcs) + amdgpu_irq_put(adev, &adev->nbio.ras_err_event_athub_irq, 0); + } return 0; } -- 2.17.1