[AMD Official Use Only - General] Thank you for your suggestion, I will split these two patches in the next version. Regards, Horatio -----Original Message----- From: Zhou1, Tao <Tao.Zhou1@xxxxxxx> Sent: Monday, May 15, 2023 2:38 PM To: Zhang, Horatio <Hongkun.Zhang@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Xu, Feifei <Feifei.Xu@xxxxxxx>; Liu, Leo <Leo.Liu@xxxxxxx>; Jiang, Sonny <Sonny.Jiang@xxxxxxx>; Limonciello, Mario <Mario.Limonciello@xxxxxxx>; Liu, HaoPing (Alan) <HaoPing.Liu@xxxxxxx>; Zhou, Bob <Bob.Zhou@xxxxxxx>; Zhang, Horatio <Hongkun.Zhang@xxxxxxx>; Zhang, Hawking <Hawking.Zhang@xxxxxxx> Subject: RE: [PATCH v2 1/2] drm/amdgpu: separate ras irq from vcn instance irq for UVD_POISON [AMD Official Use Only - General] The code is fine with me, but it's better to split the patch into three parts, one is for common vcn code, one is for vcn 2.6 and the third one is for vcn 4.0. Regards, Tao > -----Original Message----- > From: Horatio Zhang <Hongkun.Zhang@xxxxxxx> > Sent: Monday, May 15, 2023 10:28 AM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Zhou1, Tao > <Tao.Zhou1@xxxxxxx>; Xu, Feifei <Feifei.Xu@xxxxxxx>; Liu, Leo > <Leo.Liu@xxxxxxx>; Jiang, Sonny <Sonny.Jiang@xxxxxxx>; Limonciello, > Mario <Mario.Limonciello@xxxxxxx>; Liu, HaoPing (Alan) > <HaoPing.Liu@xxxxxxx>; Zhou, Bob <Bob.Zhou@xxxxxxx>; Zhang, Horatio > <Hongkun.Zhang@xxxxxxx>; Zhang, Hawking <Hawking.Zhang@xxxxxxx> > Subject: [PATCH v2 1/2] drm/amdgpu: separate ras irq from vcn instance > irq for UVD_POISON > > Separate RAS poison consumption handling from the instance irq, and > register dedicated ras_poison_irq src and funcs for UVD_POISON. Fix > the amdgpu_irq_put call trace in vcn_v4_0_hw_fini. > > [ 44.563572] RIP: 0010:amdgpu_irq_put+0xa4/0xc0 [amdgpu] > [ 44.563629] RSP: 0018:ffffb36740edfc90 EFLAGS: 00010246 > [ 44.563630] RAX: 0000000000000000 RBX: 0000000000000001 RCX: > 0000000000000000 > [ 44.563630] RDX: 0000000000000000 RSI: 0000000000000000 RDI: > 0000000000000000 > [ 44.563631] RBP: ffffb36740edfcb0 R08: 0000000000000000 R09: > 0000000000000000 > [ 44.563631] R10: 0000000000000000 R11: 0000000000000000 R12: > ffff954c568e2ea8 > [ 44.563631] R13: 0000000000000000 R14: ffff954c568c0000 R15: > ffff954c568e2ea8 > [ 44.563632] FS: 0000000000000000(0000) GS:ffff954f584c0000(0000) > knlGS:0000000000000000 > [ 44.563632] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > [ 44.563633] CR2: 00007f028741ba70 CR3: 000000026ca10000 CR4: > 0000000000750ee0 > [ 44.563633] PKRU: 55555554 > [ 44.563633] Call Trace: > [ 44.563634] <TASK> > [ 44.563634] vcn_v4_0_hw_fini+0x62/0x160 [amdgpu] > [ 44.563700] vcn_v4_0_suspend+0x13/0x30 [amdgpu] > [ 44.563755] amdgpu_device_ip_suspend_phase2+0x240/0x470 [amdgpu] > [ 44.563806] amdgpu_device_ip_suspend+0x41/0x80 [amdgpu] > [ 44.563858] amdgpu_device_pre_asic_reset+0xd9/0x4a0 [amdgpu] > [ 44.563909] amdgpu_device_gpu_recover.cold+0x548/0xcf1 [amdgpu] > [ 44.564006] amdgpu_debugfs_reset_work+0x4c/0x80 [amdgpu] > [ 44.564061] process_one_work+0x21f/0x400 > [ 44.564062] worker_thread+0x200/0x3f0 > [ 44.564063] ? process_one_work+0x400/0x400 > [ 44.564064] kthread+0xee/0x120 > [ 44.564065] ? kthread_complete_and_exit+0x20/0x20 > [ 44.564066] ret_from_fork+0x22/0x30 > > Suggested-by: Hawking Zhang <Hawking.Zhang@xxxxxxx> > Signed-off-by: Horatio Zhang <Hongkun.Zhang@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 27 ++++++++++++++++++- > drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h | 3 +++ > drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c | 24 ++++++++++++++--- > drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c | 35 ++++++++++++++++++++----- > 4 files changed, 78 insertions(+), 11 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c > index e63fcc58e8e0..f53c22db8d25 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c > @@ -1181,6 +1181,31 @@ int amdgpu_vcn_process_poison_irq(struct > amdgpu_device *adev, > return 0; > } > > +int amdgpu_vcn_ras_late_init(struct amdgpu_device *adev, struct > +ras_common_if *ras_block) { > + int r, i; > + > + r = amdgpu_ras_block_late_init(adev, ras_block); > + if (r) > + return r; > + > + if (amdgpu_ras_is_supported(adev, ras_block->block)) { > + for (i = 0; i < adev->vcn.num_vcn_inst; i++) { > + if (adev->vcn.harvest_config & (1 << i)) > + continue; > + > + r = amdgpu_irq_get(adev, &adev- > >vcn.inst[i].ras_poison_irq, 0); > + if (r) > + goto late_fini; > + } > + } > + return 0; > + > +late_fini: > + amdgpu_ras_block_late_fini(adev, ras_block); > + return r; > +} > + > int amdgpu_vcn_ras_sw_init(struct amdgpu_device *adev) { > int err; > @@ -1202,7 +1227,7 @@ int amdgpu_vcn_ras_sw_init(struct amdgpu_device > *adev) > adev->vcn.ras_if = &ras->ras_block.ras_comm; > > if (!ras->ras_block.ras_late_init) > - ras->ras_block.ras_late_init = amdgpu_ras_block_late_init; > + ras->ras_block.ras_late_init = amdgpu_vcn_ras_late_init; > > return 0; > } > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h > index c730949ece7d..802d4c2edb41 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h > @@ -234,6 +234,7 @@ struct amdgpu_vcn_inst { > struct amdgpu_ring ring_enc[AMDGPU_VCN_MAX_ENC_RINGS]; > atomic_t sched_score; > struct amdgpu_irq_src irq; > + struct amdgpu_irq_src ras_poison_irq; > struct amdgpu_vcn_reg external; > struct amdgpu_bo *dpg_sram_bo; > struct dpg_pause_state pause_state; > @@ -400,6 +401,8 @@ void amdgpu_debugfs_vcn_fwlog_init(struct > amdgpu_device *adev, int amdgpu_vcn_process_poison_irq(struct > amdgpu_device *adev, > struct amdgpu_irq_src *source, > struct amdgpu_iv_entry *entry); > +int amdgpu_vcn_ras_late_init(struct amdgpu_device *adev, > + struct ras_common_if *ras_block); > int amdgpu_vcn_ras_sw_init(struct amdgpu_device *adev); > > #endif > diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c > b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c > index ab0b45d0ead1..44262d67b3ff 100644 > --- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c > +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c > @@ -143,7 +143,7 @@ static int vcn_v2_5_sw_init(void *handle) > > /* VCN POISON TRAP */ > r = amdgpu_irq_add_id(adev, amdgpu_ih_clientid_vcns[j], > - VCN_2_6__SRCID_UVD_POISON, &adev- > >vcn.inst[j].irq); > + VCN_2_6__SRCID_UVD_POISON, &adev- > >vcn.inst[j].ras_poison_irq); > if (r) > return r; > } > @@ -354,6 +354,9 @@ static int vcn_v2_5_hw_fini(void *handle) > (adev->vcn.cur_state != AMD_PG_STATE_GATE && > RREG32_SOC15(VCN, i, mmUVD_STATUS))) > vcn_v2_5_set_powergating_state(adev, > AMD_PG_STATE_GATE); > + > + if (amdgpu_ras_is_supported(adev, > AMDGPU_RAS_BLOCK__VCN)) > + amdgpu_irq_put(adev, &adev- > >vcn.inst[i].ras_poison_irq, 0); > } > > return 0; > @@ -1807,6 +1810,14 @@ static int vcn_v2_5_set_interrupt_state(struct > amdgpu_device *adev, > return 0; > } > > +static int vcn_v2_6_set_ras_interrupt_state(struct amdgpu_device *adev, > + struct amdgpu_irq_src *source, > + unsigned int type, > + enum amdgpu_interrupt_state state) { > + return 0; > +} > + > static int vcn_v2_5_process_interrupt(struct amdgpu_device *adev, > struct amdgpu_irq_src *source, > struct amdgpu_iv_entry *entry) @@ -1837,9 > +1848,6 @@ static int vcn_v2_5_process_interrupt(struct amdgpu_device > +*adev, > case VCN_2_0__SRCID__UVD_ENC_LOW_LATENCY: > amdgpu_fence_process(&adev- > >vcn.inst[ip_instance].ring_enc[1]); > break; > - case VCN_2_6__SRCID_UVD_POISON: > - amdgpu_vcn_process_poison_irq(adev, source, entry); > - break; > default: > DRM_ERROR("Unhandled interrupt: %d %d\n", > entry->src_id, entry->src_data[0]); @@ -1854,6 > +1862,11 @@ static const struct amdgpu_irq_src_funcs > +vcn_v2_5_irq_funcs = { > .process = vcn_v2_5_process_interrupt, }; > > +static const struct amdgpu_irq_src_funcs vcn_v2_6_ras_irq_funcs = { > + .set = vcn_v2_6_set_ras_interrupt_state, > + .process = amdgpu_vcn_process_poison_irq, }; > + > static void vcn_v2_5_set_irq_funcs(struct amdgpu_device *adev) { > int i; > @@ -1863,6 +1876,9 @@ static void vcn_v2_5_set_irq_funcs(struct > amdgpu_device *adev) > continue; > adev->vcn.inst[i].irq.num_types = adev->vcn.num_enc_rings + 1; > adev->vcn.inst[i].irq.funcs = &vcn_v2_5_irq_funcs; > + > + adev->vcn.inst[i].ras_poison_irq.num_types = adev- > >vcn.num_enc_rings + 1; > + adev->vcn.inst[i].ras_poison_irq.funcs = > &vcn_v2_6_ras_irq_funcs; > } > } > > diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c > b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c > index bf0674039598..1dfc7cee6d59 100644 > --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c > @@ -139,7 +139,7 @@ static int vcn_v4_0_sw_init(void *handle) > > /* VCN POISON TRAP */ > r = amdgpu_irq_add_id(adev, amdgpu_ih_clientid_vcns[i], > - VCN_4_0__SRCID_UVD_POISON, &adev- > >vcn.inst[i].irq); > + VCN_4_0__SRCID_UVD_POISON, &adev- > >vcn.inst[i].ras_poison_irq); > if (r) > return r; > > @@ -305,8 +305,8 @@ static int vcn_v4_0_hw_fini(void *handle) > vcn_v4_0_set_powergating_state(adev, AMD_PG_STATE_GATE); > } > } > - > - amdgpu_irq_put(adev, &adev->vcn.inst[i].irq, 0); > + if (amdgpu_ras_is_supported(adev, > AMDGPU_RAS_BLOCK__VCN)) > + amdgpu_irq_put(adev, &adev- > >vcn.inst[i].ras_poison_irq, 0); > } > > return 0; > @@ -1975,6 +1975,24 @@ static int vcn_v4_0_set_interrupt_state(struct > amdgpu_device *adev, struct amdgp > return 0; > } > > +/** > + * vcn_v4_0_set_ras_interrupt_state - set VCN block RAS interrupt > +state > + * > + * @adev: amdgpu_device pointer > + * @source: interrupt sources > + * @type: interrupt types > + * @state: interrupt states > + * > + * Set VCN block RAS interrupt state > + */ > +static int vcn_v4_0_set_ras_interrupt_state(struct amdgpu_device *adev, > + struct amdgpu_irq_src *source, > + unsigned int type, > + enum amdgpu_interrupt_state state) > +{ > + return 0; > +} > + > /** > * vcn_v4_0_process_interrupt - process VCN block interrupt > * > @@ -2007,9 +2025,6 @@ static int vcn_v4_0_process_interrupt(struct > amdgpu_device *adev, struct amdgpu_ > case VCN_4_0__SRCID__UVD_ENC_GENERAL_PURPOSE: > amdgpu_fence_process(&adev- > >vcn.inst[ip_instance].ring_enc[0]); > break; > - case VCN_4_0__SRCID_UVD_POISON: > - amdgpu_vcn_process_poison_irq(adev, source, entry); > - break; > default: > DRM_ERROR("Unhandled interrupt: %d %d\n", > entry->src_id, entry->src_data[0]); @@ -2024,6 > +2039,11 @@ static const struct amdgpu_irq_src_funcs > +vcn_v4_0_irq_funcs = { > .process = vcn_v4_0_process_interrupt, }; > > +static const struct amdgpu_irq_src_funcs vcn_v4_0_ras_irq_funcs = { > + .set = vcn_v4_0_set_ras_interrupt_state, > + .process = amdgpu_vcn_process_poison_irq, }; > + > /** > * vcn_v4_0_set_irq_funcs - set VCN block interrupt irq functions > * > @@ -2041,6 +2061,9 @@ static void vcn_v4_0_set_irq_funcs(struct > amdgpu_device *adev) > > adev->vcn.inst[i].irq.num_types = adev->vcn.num_enc_rings + 1; > adev->vcn.inst[i].irq.funcs = &vcn_v4_0_irq_funcs; > + > + adev->vcn.inst[i].ras_poison_irq.num_types = adev- > >vcn.num_enc_rings + 1; > + adev->vcn.inst[i].ras_poison_irq.funcs = > &vcn_v4_0_ras_irq_funcs; > } > } > > -- > 2.34.1