Refactor the nbio-related code to improve how the irq reference count is managed. Originally amdgpu_irq_get() is called from ip_blocks[].late_init and amdgpu_irq_put() is called from ip_blocks[].hw_fini. This asymmetric design may cause issues under certain conditions. So 1) introduce amdgpu_nbio_ras_early_fini() to undo the work done by amdgpu_nbio_ras_late_init(). 2) remove the calls to amdgpu_irq_put() in xxxx_hw_fini(). 3) record whether a reference is currently held for each specific irq. Signed-off-by: Jiang Liu <gerry@xxxxxxxxxxxxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c | 16 +++++++++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h | 1 + drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 1 + drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c | 1 + drivers/gpu/drm/amd/amdgpu/soc15.c | 16 ---------------- 5 files changed, 18 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c index c75ce91f94ab..b8a69ceec2e4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c @@ -64,13 +64,27 @@ int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev, struct ras_common_if * r = amdgpu_irq_get(adev, &adev->nbio.ras_controller_irq, 0); if (r) goto late_fini; + amdgpu_ras_set_marker(adev, ras_block, AMDGPU_MARKER_IRQ0); r = amdgpu_irq_get(adev, &adev->nbio.ras_err_event_athub_irq, 0); if (r) goto late_fini; + amdgpu_ras_set_marker(adev, ras_block, AMDGPU_MARKER_IRQ1); } return 0; late_fini: - amdgpu_ras_block_early_fini(adev, ras_block); + amdgpu_nbio_ras_early_fini(adev, ras_block); return r; } + +void amdgpu_nbio_ras_early_fini(struct amdgpu_device *adev, struct ras_common_if *ras_block) +{ + if (amdgpu_ras_is_supported(adev, adev->nbio.ras_if->block)) { + if (amdgpu_ras_test_and_clear_marker(adev, ras_block, AMDGPU_MARKER_IRQ0)) + amdgpu_irq_put(adev, &adev->nbio.ras_controller_irq, 0); + if (amdgpu_ras_test_and_clear_marker(adev, ras_block, AMDGPU_MARKER_IRQ1)) + amdgpu_irq_put(adev, 
&adev->nbio.ras_err_event_athub_irq, 0); + } + + amdgpu_ras_block_early_fini(adev, ras_block); +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h index 79c2f807b9fe..e1edf75602c3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h @@ -117,6 +117,7 @@ struct amdgpu_nbio { int amdgpu_nbio_ras_sw_init(struct amdgpu_device *adev); int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block); +void amdgpu_nbio_ras_early_fini(struct amdgpu_device *adev, struct ras_common_if *ras_block); u64 amdgpu_nbio_get_pcie_replay_count(struct amdgpu_device *adev); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c index 97782a73f4b0..6c727b77bb3c 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c @@ -665,6 +665,7 @@ struct amdgpu_nbio_ras nbio_v7_4_ras = { }, .hw_ops = &nbio_v7_4_ras_hw_ops, .ras_late_init = amdgpu_nbio_ras_late_init, + .ras_early_fini = amdgpu_nbio_ras_early_fini, }, .handle_ras_controller_intr_no_bifring = nbio_v7_4_handle_ras_controller_intr_no_bifring, .handle_ras_err_event_athub_intr_no_bifring = nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring, diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c index 8a0a63ac88d2..684a38a16247 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c @@ -703,6 +703,7 @@ struct amdgpu_nbio_ras nbio_v7_9_ras = { }, .hw_ops = &nbio_v7_9_ras_hw_ops, .ras_late_init = amdgpu_nbio_ras_late_init, + .ras_early_fini = amdgpu_nbio_ras_early_fini, }, .handle_ras_controller_intr_no_bifring = nbio_v7_9_handle_ras_controller_intr_no_bifring, .handle_ras_err_event_athub_intr_no_bifring = nbio_v7_9_handle_ras_err_event_athub_intr_no_bifring, diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index 
6fcdeb265a22..1dca7d7c813c 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -1299,22 +1299,6 @@ static int soc15_common_hw_fini(struct amdgpu_ip_block *ip_block) if (amdgpu_sriov_vf(adev)) xgpu_ai_mailbox_put_irq(adev); - /* - * For minimal init, late_init is not called, hence RAS irqs are not - * enabled. - */ - if ((!amdgpu_sriov_vf(adev)) && - (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) && - adev->nbio.ras_if && - amdgpu_ras_is_supported(adev, adev->nbio.ras_if->block)) { - if (adev->nbio.ras && - adev->nbio.ras->init_ras_controller_interrupt) - amdgpu_irq_put(adev, &adev->nbio.ras_controller_irq, 0); - if (adev->nbio.ras && - adev->nbio.ras->init_ras_err_event_athub_interrupt) - amdgpu_irq_put(adev, &adev->nbio.ras_err_event_athub_irq, 0); - } - return 0; } -- 2.43.5