nbio ras is not managed by gpu driver when gpu is connected to cpu through xgmi. split nbio callbacks into ras and non-ras ones so gpu driver only initializes nbio ras callbacks when it manages nbio ras. Signed-off-by: Hawking Zhang <Hawking.Zhang@xxxxxxx> Reivewed-by: Dennis Li <Dennis.Li@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c | 12 +++++----- drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h | 19 +++++++++------ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 30 +++++++++++++++++++----- drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 16 ++++++++----- drivers/gpu/drm/amd/amdgpu/nbio_v7_4.h | 1 + drivers/gpu/drm/amd/amdgpu/soc15.c | 15 ++++++++---- 6 files changed, 63 insertions(+), 30 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c index 03412543427a..90f50561b43a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c @@ -199,13 +199,13 @@ irqreturn_t amdgpu_irq_handler(int irq, void *arg) * ack the interrupt if it is there */ if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__PCIE_BIF)) { - if (adev->nbio.funcs && - adev->nbio.funcs->handle_ras_controller_intr_no_bifring) - adev->nbio.funcs->handle_ras_controller_intr_no_bifring(adev); + if (adev->nbio.ras_funcs && + adev->nbio.ras_funcs->handle_ras_controller_intr_no_bifring) + adev->nbio.ras_funcs->handle_ras_controller_intr_no_bifring(adev); - if (adev->nbio.funcs && - adev->nbio.funcs->handle_ras_err_event_athub_intr_no_bifring) - adev->nbio.funcs->handle_ras_err_event_athub_intr_no_bifring(adev); + if (adev->nbio.ras_funcs && + adev->nbio.ras_funcs->handle_ras_err_event_athub_intr_no_bifring) + adev->nbio.ras_funcs->handle_ras_err_event_athub_intr_no_bifring(adev); } return ret; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h index 7c11bce4514b..25ee53545837 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h @@ -47,6 +47,17 @@ struct nbio_hdp_flush_reg { u32 ref_and_mask_sdma7; }; +struct amdgpu_nbio_ras_funcs { + void (*handle_ras_controller_intr_no_bifring)(struct amdgpu_device *adev); + void (*handle_ras_err_event_athub_intr_no_bifring)(struct amdgpu_device *adev); + int (*init_ras_controller_interrupt)(struct amdgpu_device *adev); + int (*init_ras_err_event_athub_interrupt)(struct amdgpu_device *adev); + void (*query_ras_error_count)(struct amdgpu_device *adev, + void *ras_error_status); + int (*ras_late_init)(struct amdgpu_device *adev); + void (*ras_fini)(struct amdgpu_device *adev); +}; + struct amdgpu_nbio_funcs { const struct nbio_hdp_flush_reg *hdp_flush_reg; u32 (*get_hdp_flush_req_offset)(struct amdgpu_device *adev); @@ -79,13 +90,6 @@ struct amdgpu_nbio_funcs { void (*ih_control)(struct amdgpu_device *adev); void (*init_registers)(struct amdgpu_device *adev); void (*remap_hdp_registers)(struct amdgpu_device *adev); - void (*handle_ras_controller_intr_no_bifring)(struct amdgpu_device *adev); - void (*handle_ras_err_event_athub_intr_no_bifring)(struct amdgpu_device *adev); - int (*init_ras_controller_interrupt)(struct amdgpu_device *adev); - int (*init_ras_err_event_athub_interrupt)(struct amdgpu_device *adev); - void (*query_ras_error_count)(struct amdgpu_device *adev, - void *ras_error_status); - int (*ras_late_init)(struct amdgpu_device *adev); void (*enable_aspm)(struct amdgpu_device *adev, bool enable); void (*program_aspm)(struct amdgpu_device *adev); @@ -97,6 +101,7 @@ struct amdgpu_nbio { struct amdgpu_irq_src ras_err_event_athub_irq; struct ras_common_if *ras_if; const struct amdgpu_nbio_funcs *funcs; + const struct amdgpu_nbio_ras_funcs *ras_funcs; }; int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 1708045e2a0d..ac3f4c3266bc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -804,8 +804,9 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, adev->mmhub.funcs->query_ras_error_status(adev); break; case AMDGPU_RAS_BLOCK__PCIE_BIF: - if (adev->nbio.funcs->query_ras_error_count) - adev->nbio.funcs->query_ras_error_count(adev, &err_data); + if (adev->nbio.ras_funcs && + adev->nbio.ras_funcs->query_ras_error_count) + adev->nbio.ras_funcs->query_ras_error_count(adev, &err_data); break; case AMDGPU_RAS_BLOCK__XGMI_WAFL: amdgpu_xgmi_query_ras_error_count(adev, &err_data); @@ -2030,14 +2031,31 @@ int amdgpu_ras_init(struct amdgpu_device *adev) /* Might need get this flag from vbios. */ con->flags = RAS_DEFAULT_FLAGS; - if (adev->nbio.funcs->init_ras_controller_interrupt) { - r = adev->nbio.funcs->init_ras_controller_interrupt(adev); + /* initialize nbio ras function ahead of any other + * ras functions so hardware fatal error interrupt + * can be enabled as early as possible */ + switch (adev->asic_type) { + case CHIP_VEGA20: + case CHIP_ARCTURUS: + case CHIP_ALDEBARAN: + if (!adev->gmc.xgmi.connected_to_cpu) + adev->nbio.ras_funcs = &nbio_v7_4_ras_funcs; + break; + default: + /* nbio ras is not available */ + break; + } + + if (adev->nbio.ras_funcs && + adev->nbio.ras_funcs->init_ras_controller_interrupt) { + r = adev->nbio.ras_funcs->init_ras_controller_interrupt(adev); if (r) goto release_con; } - if (adev->nbio.funcs->init_ras_err_event_athub_interrupt) { - r = adev->nbio.funcs->init_ras_err_event_athub_interrupt(adev); + if (adev->nbio.ras_funcs && + adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt) { + r = adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt(adev); if (r) goto release_con; } diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c index c477f8972d5d..af44aad78171 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c @@ -557,6 +557,16 @@ static void nbio_v7_4_enable_doorbell_interrupt(struct amdgpu_device *adev, DOORBELL_INTERRUPT_DISABLE, enable ? 0 : 1); } +const struct amdgpu_nbio_ras_funcs nbio_v7_4_ras_funcs = { + .handle_ras_controller_intr_no_bifring = nbio_v7_4_handle_ras_controller_intr_no_bifring, + .handle_ras_err_event_athub_intr_no_bifring = nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring, + .init_ras_controller_interrupt = nbio_v7_4_init_ras_controller_interrupt, + .init_ras_err_event_athub_interrupt = nbio_v7_4_init_ras_err_event_athub_interrupt, + .query_ras_error_count = nbio_v7_4_query_ras_error_count, + .ras_late_init = amdgpu_nbio_ras_late_init, + .ras_fini = amdgpu_nbio_ras_fini, +}; + const struct amdgpu_nbio_funcs nbio_v7_4_funcs = { .get_hdp_flush_req_offset = nbio_v7_4_get_hdp_flush_req_offset, .get_hdp_flush_done_offset = nbio_v7_4_get_hdp_flush_done_offset, @@ -577,10 +587,4 @@ const struct amdgpu_nbio_funcs nbio_v7_4_funcs = { .ih_control = nbio_v7_4_ih_control, .init_registers = nbio_v7_4_init_registers, .remap_hdp_registers = nbio_v7_4_remap_hdp_registers, - .handle_ras_controller_intr_no_bifring = nbio_v7_4_handle_ras_controller_intr_no_bifring, - .handle_ras_err_event_athub_intr_no_bifring = nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring, - .init_ras_controller_interrupt = nbio_v7_4_init_ras_controller_interrupt, - .init_ras_err_event_athub_interrupt = nbio_v7_4_init_ras_err_event_athub_interrupt, - .query_ras_error_count = nbio_v7_4_query_ras_error_count, - .ras_late_init = amdgpu_nbio_ras_late_init, }; diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.h b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.h index b1ac82872752..b8216581ec8d 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.h +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.h @@ -28,5 +28,6 @@ extern const struct nbio_hdp_flush_reg nbio_v7_4_hdp_flush_reg; extern const struct amdgpu_nbio_funcs nbio_v7_4_funcs; +extern const struct amdgpu_nbio_ras_funcs nbio_v7_4_ras_funcs; #endif diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index 64c98b9c7fe8..5c5eb3aed1b3 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -1523,8 +1523,9 @@ static int soc15_common_late_init(void *handle) if (adev->hdp.funcs->reset_ras_error_count) adev->hdp.funcs->reset_ras_error_count(adev); - if (adev->nbio.funcs->ras_late_init) - r = adev->nbio.funcs->ras_late_init(adev); + if (adev->nbio.ras_funcs && + adev->nbio.ras_funcs->ras_late_init) + r = adev->nbio.ras_funcs->ras_late_init(adev); return r; } @@ -1545,7 +1546,9 @@ static int soc15_common_sw_fini(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; - amdgpu_nbio_ras_fini(adev); + if (adev->nbio.ras_funcs && + adev->nbio.ras_funcs->ras_fini) + adev->nbio.ras_funcs->ras_fini(adev); adev->df.funcs->sw_fini(adev); return 0; } @@ -1609,9 +1612,11 @@ static int soc15_common_hw_fini(void *handle) if (adev->nbio.ras_if && amdgpu_ras_is_supported(adev, adev->nbio.ras_if->block)) { - if (adev->nbio.funcs->init_ras_controller_interrupt) + if (adev->nbio.ras_funcs && + adev->nbio.ras_funcs->init_ras_controller_interrupt) amdgpu_irq_put(adev, &adev->nbio.ras_controller_irq, 0); - if (adev->nbio.funcs->init_ras_err_event_athub_interrupt) + if (adev->nbio.ras_funcs && + adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt) amdgpu_irq_put(adev, &adev->nbio.ras_err_event_athub_irq, 0); } -- 2.17.1 _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx