Separate RAS poison consumption handling from the instance irq, and register dedicated ras_poison_irq src and funcs for UVD_POISON. Fix the amdgpu_irq_put call trace in vcn_v4_0_hw_fini. [ 44.563572] RIP: 0010:amdgpu_irq_put+0xa4/0xc0 [amdgpu] [ 44.563629] RSP: 0018:ffffb36740edfc90 EFLAGS: 00010246 [ 44.563630] RAX: 0000000000000000 RBX: 0000000000000001 RCX: 0000000000000000 [ 44.563630] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 [ 44.563631] RBP: ffffb36740edfcb0 R08: 0000000000000000 R09: 0000000000000000 [ 44.563631] R10: 0000000000000000 R11: 0000000000000000 R12: ffff954c568e2ea8 [ 44.563631] R13: 0000000000000000 R14: ffff954c568c0000 R15: ffff954c568e2ea8 [ 44.563632] FS: 0000000000000000(0000) GS:ffff954f584c0000(0000) knlGS:0000000000000000 [ 44.563632] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 44.563633] CR2: 00007f028741ba70 CR3: 000000026ca10000 CR4: 0000000000750ee0 [ 44.563633] PKRU: 55555554 [ 44.563633] Call Trace: [ 44.563634] <TASK> [ 44.563634] vcn_v4_0_hw_fini+0x62/0x160 [amdgpu] [ 44.563700] vcn_v4_0_suspend+0x13/0x30 [amdgpu] [ 44.563755] amdgpu_device_ip_suspend_phase2+0x240/0x470 [amdgpu] [ 44.563806] amdgpu_device_ip_suspend+0x41/0x80 [amdgpu] [ 44.563858] amdgpu_device_pre_asic_reset+0xd9/0x4a0 [amdgpu] [ 44.563909] amdgpu_device_gpu_recover.cold+0x548/0xcf1 [amdgpu] [ 44.564006] amdgpu_debugfs_reset_work+0x4c/0x80 [amdgpu] [ 44.564061] process_one_work+0x21f/0x400 [ 44.564062] worker_thread+0x200/0x3f0 [ 44.564063] ? process_one_work+0x400/0x400 [ 44.564064] kthread+0xee/0x120 [ 44.564065] ? kthread_complete_and_exit+0x20/0x20 [ 44.564066] ret_from_fork+0x22/0x30 Suggested-by: Hawking Zhang <Hawking.Zhang@xxxxxxx> Signed-off-by: Horatio Zhang <Hongkun.Zhang@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 27 ++++++++++++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h | 3 +++ drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c | 24 ++++++++++++++--- drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c | 35 ++++++++++++++++++++----- 4 files changed, 78 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c index e63fcc58e8e0..f53c22db8d25 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c @@ -1181,6 +1181,31 @@ int amdgpu_vcn_process_poison_irq(struct amdgpu_device *adev, return 0; } +int amdgpu_vcn_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block) +{ + int r, i; + + r = amdgpu_ras_block_late_init(adev, ras_block); + if (r) + return r; + + if (amdgpu_ras_is_supported(adev, ras_block->block)) { + for (i = 0; i < adev->vcn.num_vcn_inst; i++) { + if (adev->vcn.harvest_config & (1 << i)) + continue; + + r = amdgpu_irq_get(adev, &adev->vcn.inst[i].ras_poison_irq, 0); + if (r) + goto late_fini; + } + } + return 0; + +late_fini: + amdgpu_ras_block_late_fini(adev, ras_block); + return r; +} + int amdgpu_vcn_ras_sw_init(struct amdgpu_device *adev) { int err; @@ -1202,7 +1227,7 @@ int amdgpu_vcn_ras_sw_init(struct amdgpu_device *adev) adev->vcn.ras_if = &ras->ras_block.ras_comm; if (!ras->ras_block.ras_late_init) - ras->ras_block.ras_late_init = amdgpu_ras_block_late_init; + ras->ras_block.ras_late_init = amdgpu_vcn_ras_late_init; return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h index c730949ece7d..802d4c2edb41 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h @@ -234,6 +234,7 @@ struct amdgpu_vcn_inst { struct amdgpu_ring ring_enc[AMDGPU_VCN_MAX_ENC_RINGS]; atomic_t sched_score; struct amdgpu_irq_src irq; + struct amdgpu_irq_src ras_poison_irq; struct amdgpu_vcn_reg external; struct amdgpu_bo *dpg_sram_bo; struct dpg_pause_state pause_state; @@ -400,6 +401,8 @@ void amdgpu_debugfs_vcn_fwlog_init(struct amdgpu_device *adev, int amdgpu_vcn_process_poison_irq(struct amdgpu_device *adev, struct amdgpu_irq_src *source, struct amdgpu_iv_entry *entry); +int amdgpu_vcn_ras_late_init(struct amdgpu_device *adev, + struct ras_common_if *ras_block); int amdgpu_vcn_ras_sw_init(struct amdgpu_device *adev); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c index ab0b45d0ead1..44262d67b3ff 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c @@ -143,7 +143,7 @@ static int vcn_v2_5_sw_init(void *handle) /* VCN POISON TRAP */ r = amdgpu_irq_add_id(adev, amdgpu_ih_clientid_vcns[j], - VCN_2_6__SRCID_UVD_POISON, &adev->vcn.inst[j].irq); + VCN_2_6__SRCID_UVD_POISON, &adev->vcn.inst[j].ras_poison_irq); if (r) return r; } @@ -354,6 +354,9 @@ static int vcn_v2_5_hw_fini(void *handle) (adev->vcn.cur_state != AMD_PG_STATE_GATE && RREG32_SOC15(VCN, i, mmUVD_STATUS))) vcn_v2_5_set_powergating_state(adev, AMD_PG_STATE_GATE); + + if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__VCN)) + amdgpu_irq_put(adev, &adev->vcn.inst[i].ras_poison_irq, 0); } return 0; @@ -1807,6 +1810,14 @@ static int vcn_v2_5_set_interrupt_state(struct amdgpu_device *adev, return 0; } +static int vcn_v2_6_set_ras_interrupt_state(struct amdgpu_device *adev, + struct amdgpu_irq_src *source, + unsigned int type, + enum amdgpu_interrupt_state state) +{ + return 0; +} + static int vcn_v2_5_process_interrupt(struct amdgpu_device *adev, struct amdgpu_irq_src *source, struct amdgpu_iv_entry *entry) @@ -1837,9 +1848,6 @@ static int vcn_v2_5_process_interrupt(struct amdgpu_device *adev, case VCN_2_0__SRCID__UVD_ENC_LOW_LATENCY: amdgpu_fence_process(&adev->vcn.inst[ip_instance].ring_enc[1]); break; - case VCN_2_6__SRCID_UVD_POISON: - amdgpu_vcn_process_poison_irq(adev, source, entry); - break; default: DRM_ERROR("Unhandled interrupt: %d %d\n", entry->src_id, entry->src_data[0]); @@ -1854,6 +1862,11 @@ static const struct amdgpu_irq_src_funcs vcn_v2_5_irq_funcs = { .process = vcn_v2_5_process_interrupt, }; +static const struct amdgpu_irq_src_funcs vcn_v2_6_ras_irq_funcs = { + .set = vcn_v2_6_set_ras_interrupt_state, + .process = amdgpu_vcn_process_poison_irq, +}; + static void vcn_v2_5_set_irq_funcs(struct amdgpu_device *adev) { int i; @@ -1863,6 +1876,9 @@ static void vcn_v2_5_set_irq_funcs(struct amdgpu_device *adev) continue; adev->vcn.inst[i].irq.num_types = adev->vcn.num_enc_rings + 1; adev->vcn.inst[i].irq.funcs = &vcn_v2_5_irq_funcs; + + adev->vcn.inst[i].ras_poison_irq.num_types = adev->vcn.num_enc_rings + 1; + adev->vcn.inst[i].ras_poison_irq.funcs = &vcn_v2_6_ras_irq_funcs; } } diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c index bf0674039598..1dfc7cee6d59 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c @@ -139,7 +139,7 @@ static int vcn_v4_0_sw_init(void *handle) /* VCN POISON TRAP */ r = amdgpu_irq_add_id(adev, amdgpu_ih_clientid_vcns[i], - VCN_4_0__SRCID_UVD_POISON, &adev->vcn.inst[i].irq); + VCN_4_0__SRCID_UVD_POISON, &adev->vcn.inst[i].ras_poison_irq); if (r) return r; @@ -305,8 +305,8 @@ static int vcn_v4_0_hw_fini(void *handle) vcn_v4_0_set_powergating_state(adev, AMD_PG_STATE_GATE); } } - - amdgpu_irq_put(adev, &adev->vcn.inst[i].irq, 0); + if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__VCN)) + amdgpu_irq_put(adev, &adev->vcn.inst[i].ras_poison_irq, 0); } return 0; @@ -1975,6 +1975,24 @@ static int vcn_v4_0_set_interrupt_state(struct amdgpu_device *adev, struct amdgp return 0; } +/** + * vcn_v4_0_set_ras_interrupt_state - set VCN block RAS interrupt state + * + * @adev: amdgpu_device pointer + * @source: interrupt sources + * @type: interrupt types + * @state: interrupt states + * + * Set VCN block RAS interrupt state + */ +static int vcn_v4_0_set_ras_interrupt_state(struct amdgpu_device *adev, + struct amdgpu_irq_src *source, + unsigned int type, + enum amdgpu_interrupt_state state) +{ + return 0; +} + /** * vcn_v4_0_process_interrupt - process VCN block interrupt * @@ -2007,9 +2025,6 @@ static int vcn_v4_0_process_interrupt(struct amdgpu_device *adev, struct amdgpu_ case VCN_4_0__SRCID__UVD_ENC_GENERAL_PURPOSE: amdgpu_fence_process(&adev->vcn.inst[ip_instance].ring_enc[0]); break; - case VCN_4_0__SRCID_UVD_POISON: - amdgpu_vcn_process_poison_irq(adev, source, entry); - break; default: DRM_ERROR("Unhandled interrupt: %d %d\n", entry->src_id, entry->src_data[0]); @@ -2024,6 +2039,11 @@ static const struct amdgpu_irq_src_funcs vcn_v4_0_irq_funcs = { .process = vcn_v4_0_process_interrupt, }; +static const struct amdgpu_irq_src_funcs vcn_v4_0_ras_irq_funcs = { + .set = vcn_v4_0_set_ras_interrupt_state, + .process = amdgpu_vcn_process_poison_irq, +}; + /** * vcn_v4_0_set_irq_funcs - set VCN block interrupt irq functions * @@ -2041,6 +2061,9 @@ static void vcn_v4_0_set_irq_funcs(struct amdgpu_device *adev) adev->vcn.inst[i].irq.num_types = adev->vcn.num_enc_rings + 1; adev->vcn.inst[i].irq.funcs = &vcn_v4_0_irq_funcs; + + adev->vcn.inst[i].ras_poison_irq.num_types = adev->vcn.num_enc_rings + 1; + adev->vcn.inst[i].ras_poison_irq.funcs = &vcn_v4_0_ras_irq_funcs; } } -- 2.34.1