Refactor sdma related code to improve the way to manage irq reference count. Originally amdgpu_irq_get() is called from ip_blocks[].late_init and amdgpu_irq_put is called from ip_blocks[].hw_fini. The asymmetric design may cause issue under certain conditions. So 1) introduce amdgpu_sdma_ras_early_fini() to undo work done by amdgpu_sdma_ras_late_init(). 2) remove call of amdgpu_irq_put in xxxx_hw_fini(). 3) call amdgpu_irq_get() in function sdma_v4_4_2_xcp_resume() to keep irq reference count balanced. Currently sdma_v4_4_2_xcp_resume() doesn't invoke ip_blocks[].late_init(amdgpu_irq_get), but sdma_v4_4_2_xcp_suspend() invokes amdgpu_irq_put(), thus causes unbalanced irq reference count. Fix it by calling amdgpu_irq_get() in function sdma_v4_4_2_xcp_resume(). Signed-off-by: Jiang Liu <gerry@xxxxxxxxxxxxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 26 ++++++++++++++++++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h | 2 ++ drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 8 -------- drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 23 ++++++++++++--------- 5 files changed, 40 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index fa19c5391d8c..ff5907f2c544 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -383,7 +383,7 @@ enum amdgpu_marker { AMDGPU_MARKER_RAS_DEBUGFS = 63, }; -#define AMDGPU_MARKER_INDEX_IRQ(idx) (AMDGPU_MARKER_INDEX_IRQ0 + (idx)) +#define AMDGPU_MARKER_IRQ(idx) (AMDGPU_MARKER_IRQ0 + (idx)) struct amdgpu_ip_block_status { bool valid; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c index 21938e858d55..799bcd9978da 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c @@ -110,16 +110,35 @@ int amdgpu_sdma_ras_late_init(struct amdgpu_device *adev, AMDGPU_SDMA_IRQ_INSTANCE0 + i); if (r) goto late_fini; + amdgpu_ras_set_marker(adev, ras_block, AMDGPU_MARKER_IRQ(i)); } } return 0; late_fini: - amdgpu_ras_block_early_fini(adev, ras_block); + amdgpu_sdma_ras_early_fini(adev, ras_block); return r; } +void amdgpu_sdma_ras_early_fini(struct amdgpu_device *adev, + struct ras_common_if *ras_block) +{ + int i; + + if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) { + for (i = 0; i < adev->sdma.num_instances; i++) { + if (amdgpu_ras_test_and_clear_marker(adev, ras_block, + AMDGPU_MARKER_IRQ(i))) { + amdgpu_irq_put(adev, &adev->sdma.ecc_irq, + AMDGPU_SDMA_IRQ_INSTANCE0 + i); + } + } + } + + amdgpu_ras_block_early_fini(adev, ras_block); +} + int amdgpu_sdma_process_ras_data_cb(struct amdgpu_device *adev, void *err_data, struct amdgpu_iv_entry *entry) @@ -334,8 +353,11 @@ int amdgpu_sdma_ras_sw_init(struct amdgpu_device *adev) adev->sdma.ras_if = &ras->ras_block.ras_comm; /* If not define special ras_late_init function, use default ras_late_init */ - if (!ras->ras_block.ras_late_init) + if (!ras->ras_block.ras_late_init) { + WARN_ON(ras->ras_block.ras_early_fini); ras->ras_block.ras_late_init = amdgpu_sdma_ras_late_init; + ras->ras_block.ras_early_fini = amdgpu_sdma_ras_early_fini; + } /* If not defined special ras_cb function, use default ras_cb */ if (!ras->ras_block.ras_cb) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h index 087ce0f6fa07..1915e6c9be63 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h @@ -164,6 +164,8 @@ int amdgpu_sdma_get_index_from_ring(struct amdgpu_ring *ring, uint32_t *index); uint64_t amdgpu_sdma_get_csa_mc_addr(struct amdgpu_ring *ring, unsigned vmid); int amdgpu_sdma_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block); +void amdgpu_sdma_ras_early_fini(struct amdgpu_device *adev, + struct ras_common_if *ras_block); int amdgpu_sdma_process_ras_data_cb(struct amdgpu_device *adev, void *err_data, struct amdgpu_iv_entry *entry); diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c index ccf0d531776d..369d7094a3ab 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c @@ -1968,18 +1968,10 @@ static int sdma_v4_0_hw_init(struct amdgpu_ip_block *ip_block) static int sdma_v4_0_hw_fini(struct amdgpu_ip_block *ip_block) { struct amdgpu_device *adev = ip_block->adev; - int i; if (amdgpu_sriov_vf(adev)) return 0; - if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) { - for (i = 0; i < adev->sdma.num_instances; i++) { - amdgpu_irq_put(adev, &adev->sdma.ecc_irq, - AMDGPU_SDMA_IRQ_INSTANCE0 + i); - } - } - sdma_v4_0_ctx_switch_enable(adev, false); sdma_v4_0_enable(adev, false); diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c index 9c7cea0890c9..744569bbc1e6 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c @@ -1486,19 +1486,11 @@ static int sdma_v4_4_2_hw_fini(struct amdgpu_ip_block *ip_block) { struct amdgpu_device *adev = ip_block->adev; uint32_t inst_mask; - int i; if (amdgpu_sriov_vf(adev)) return 0; inst_mask = GENMASK(adev->sdma.num_instances - 1, 0); - if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) { - for (i = 0; i < adev->sdma.num_instances; i++) { - amdgpu_irq_put(adev, &adev->sdma.ecc_irq, - AMDGPU_SDMA_IRQ_INSTANCE0 + i); - } - } - sdma_v4_4_2_inst_ctx_switch_enable(adev, false, inst_mask); sdma_v4_4_2_inst_enable(adev, false, inst_mask); @@ -2153,14 +2145,24 @@ const struct amdgpu_ip_block_version sdma_v4_4_2_ip_block = { static int sdma_v4_4_2_xcp_resume(void *handle, uint32_t inst_mask) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; - int r; + uint32_t tmp_mask = inst_mask; + int r, i; if (!amdgpu_sriov_vf(adev)) sdma_v4_4_2_inst_init_golden_registers(adev, inst_mask); r = sdma_v4_4_2_inst_start(adev, inst_mask); + if (r) + return r; - return r; + if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) { + for_each_inst(i, tmp_mask) { + amdgpu_irq_get(adev, &adev->sdma.ecc_irq, + AMDGPU_SDMA_IRQ_INSTANCE0 + i); + } + } + + return 0; } static int sdma_v4_4_2_xcp_suspend(void *handle, uint32_t inst_mask) @@ -2366,6 +2368,7 @@ static struct amdgpu_sdma_ras sdma_v4_4_2_ras = { .ras_block = { .hw_ops = &sdma_v4_4_2_ras_hw_ops, .ras_late_init = sdma_v4_4_2_ras_late_init, + .ras_early_fini = amdgpu_sdma_ras_early_fini, }, }; -- 2.43.5