On Mon, Jul 22, 2024 at 9:55 AM Christian König <ckoenig.leichtzumerken@xxxxxxxxx> wrote: > > Am 17.07.24 um 22:40 schrieb Alex Deucher: > > From: Jesse Zhang <jesse.zhang@xxxxxxx> > > > > For the bad opcode case, it will cause CP/ME hang. > > The firmware will prevent the ME side from hanging by raising a bad opcode interrupt. > > And the driver needs to perform a vmid reset when receiving the interrupt. > > > > v2: update irq naming (drop priv) (Alex) > > > > Signed-off-by: Jesse Zhang <Jesse.Zhang@xxxxxxx> > > Reviewed-by: Prike Liang <Prike.Liang@xxxxxxx> > > Reviewed-by: Alex Deucher <alexander.deucher@xxxxxxx> > > Signed-off-by: Alex Deucher <alexander.deucher@xxxxxxx> > > --- > > drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 73 ++++++++++++++++++++++++++ > > 1 file changed, 73 insertions(+) > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > > index 02efa475eb7e..ce5cb60b8628 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > > @@ -1569,6 +1569,13 @@ static int gfx_v11_0_sw_init(void *handle) > > if (r) > > return r; > > > > + /* Bad opcode Event */ > > + r = amdgpu_irq_add_id(adev, SOC21_IH_CLIENTID_GRBM_CP, > > + GFX_11_0_0__SRCID__CP_BAD_OPCODE_ERROR, > > + &adev->gfx.bad_op_irq); > > + if (r) > > + return r; > > + > > /* Privileged reg */ > > r = amdgpu_irq_add_id(adev, SOC21_IH_CLIENTID_GRBM_CP, > > GFX_11_0_0__SRCID__CP_PRIV_REG_FAULT, > > @@ -4646,6 +4653,7 @@ static int gfx_v11_0_hw_fini(void *handle) > > > > amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0); > > amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0); > > + amdgpu_irq_put(adev, &adev->gfx.bad_op_irq, 0); > > > > if (!adev->no_hw_access) { > > if (amdgpu_async_gfx_ring) { > > @@ -5002,6 +5010,9 @@ static int gfx_v11_0_late_init(void *handle) > > if (r) > > return r; > > > > + r = amdgpu_irq_get(adev, &adev->gfx.bad_op_irq, 0); > > + if (r) > > + return r; > > return 0; > > } > > > > @@ 
-6293,6 +6304,51 @@ static int gfx_v11_0_set_priv_reg_fault_state(struct amdgpu_device *adev, > > return 0; > > } > > > > +static int gfx_v11_0_set_bad_op_fault_state(struct amdgpu_device *adev, > > + struct amdgpu_irq_src *source, > > + unsigned type, > > + enum amdgpu_interrupt_state state) > > +{ > > + u32 cp_int_cntl_reg, cp_int_cntl; > > + int i , j; > > + > > + switch (state) { > > + case AMDGPU_IRQ_STATE_DISABLE: > > + case AMDGPU_IRQ_STATE_ENABLE: > > That switch is pretty pointless since state can only be disabled or enabled. > > Most likely just copy & paste from an older version of the code and at some point > lost its relevance. > > Apart from that the series looks good to me. Yeah, all of the other irq functions follow that same model. If you feel strongly, I can change it. Alex > > Regards, > Christian. > > > + for (i = 0; i < adev->gfx.me.num_me; i++) { > > + for (j = 0; j < adev->gfx.me.num_pipe_per_me; j++) { > > + cp_int_cntl_reg = gfx_v11_0_get_cpg_int_cntl(adev, i, j); > > + > > + if (cp_int_cntl_reg) { > > + cp_int_cntl = RREG32_SOC15_IP(GC, cp_int_cntl_reg); > > + cp_int_cntl = REG_SET_FIELD(cp_int_cntl, CP_INT_CNTL_RING0, > > + OPCODE_ERROR_INT_ENABLE, > > + state == AMDGPU_IRQ_STATE_ENABLE ? 1 : 0); > > + WREG32_SOC15_IP(GC, cp_int_cntl_reg, cp_int_cntl); > > + } > > + } > > + } > > + for (i = 0; i < adev->gfx.mec.num_mec; i++) { > > + for (j = 0; j < adev->gfx.mec.num_pipe_per_mec; j++) { > > + /* MECs start at 1 */ > > + cp_int_cntl_reg = gfx_v11_0_get_cpc_int_cntl(adev, i + 1, j); > > + > > + if (cp_int_cntl_reg) { > > + cp_int_cntl = RREG32_SOC15_IP(GC, cp_int_cntl_reg); > > + cp_int_cntl = REG_SET_FIELD(cp_int_cntl, CP_ME1_PIPE0_INT_CNTL, > > + OPCODE_ERROR_INT_ENABLE, > > + state == AMDGPU_IRQ_STATE_ENABLE ? 
1 : 0); > > + WREG32_SOC15_IP(GC, cp_int_cntl_reg, cp_int_cntl); > > + } > > + } > > + } > > + break; > > + default: > > + break; > > + } > > + return 0; > > +} > > + > > static int gfx_v11_0_set_priv_inst_fault_state(struct amdgpu_device *adev, > > struct amdgpu_irq_src *source, > > unsigned int type, > > @@ -6369,6 +6425,15 @@ static int gfx_v11_0_priv_reg_irq(struct amdgpu_device *adev, > > return 0; > > } > > > > +static int gfx_v11_0_bad_op_irq(struct amdgpu_device *adev, > > + struct amdgpu_irq_src *source, > > + struct amdgpu_iv_entry *entry) > > +{ > > + DRM_ERROR("Illegal opcode in command stream \n"); > > + gfx_v11_0_handle_priv_fault(adev, entry); > > + return 0; > > +} > > + > > static int gfx_v11_0_priv_inst_irq(struct amdgpu_device *adev, > > struct amdgpu_irq_src *source, > > struct amdgpu_iv_entry *entry) > > @@ -6747,6 +6812,11 @@ static const struct amdgpu_irq_src_funcs gfx_v11_0_priv_reg_irq_funcs = { > > .process = gfx_v11_0_priv_reg_irq, > > }; > > > > +static const struct amdgpu_irq_src_funcs gfx_v11_0_bad_op_irq_funcs = { > > + .set = gfx_v11_0_set_bad_op_fault_state, > > + .process = gfx_v11_0_bad_op_irq, > > +}; > > + > > static const struct amdgpu_irq_src_funcs gfx_v11_0_priv_inst_irq_funcs = { > > .set = gfx_v11_0_set_priv_inst_fault_state, > > .process = gfx_v11_0_priv_inst_irq, > > @@ -6764,6 +6834,9 @@ static void gfx_v11_0_set_irq_funcs(struct amdgpu_device *adev) > > adev->gfx.priv_reg_irq.num_types = 1; > > adev->gfx.priv_reg_irq.funcs = &gfx_v11_0_priv_reg_irq_funcs; > > > > + adev->gfx.bad_op_irq.num_types = 1; > > + adev->gfx.bad_op_irq.funcs = &gfx_v11_0_bad_op_irq_funcs; > > + > > adev->gfx.priv_inst_irq.num_types = 1; > > adev->gfx.priv_inst_irq.funcs = &gfx_v11_0_priv_inst_irq_funcs; > > >