Re: [PATCH 2/6] drm/amdgpu/gfx11: Enable bad opcode interrupt

Alex Deucher <alexdeucher@xxxxxxxxx> · Mon, 22 Jul 2024 09:58:46 -0400

On Mon, Jul 22, 2024 at 9:55 AM Christian König
<ckoenig.leichtzumerken@xxxxxxxxx> wrote:
>
> Am 17.07.24 um 22:40 schrieb Alex Deucher:
> > From: Jesse Zhang <jesse.zhang@xxxxxxx>
> >
> > A bad opcode in the command stream will cause a CP/ME hang.
> > The firmware prevents the ME side from hanging by raising a bad opcode interrupt,
> > and the driver needs to perform a vmid reset when it receives that interrupt.
> >
> > v2: update irq naming (drop priv) (Alex)
> >
> > Signed-off-by: Jesse Zhang <Jesse.Zhang@xxxxxxx>
> > Reviewed-by: Prike Liang <Prike.Liang@xxxxxxx>
> > Reviewed-by: Alex Deucher <alexander.deucher@xxxxxxx>
> > Signed-off-by: Alex Deucher <alexander.deucher@xxxxxxx>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 73 ++++++++++++++++++++++++++
> >   1 file changed, 73 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> > index 02efa475eb7e..ce5cb60b8628 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> > @@ -1569,6 +1569,13 @@ static int gfx_v11_0_sw_init(void *handle)
> >       if (r)
> >               return r;
> >
> > +     /* Bad opcode Event */
> > +     r = amdgpu_irq_add_id(adev, SOC21_IH_CLIENTID_GRBM_CP,
> > +                           GFX_11_0_0__SRCID__CP_BAD_OPCODE_ERROR,
> > +                           &adev->gfx.bad_op_irq);
> > +     if (r)
> > +             return r;
> > +
> >       /* Privileged reg */
> >       r = amdgpu_irq_add_id(adev, SOC21_IH_CLIENTID_GRBM_CP,
> >                             GFX_11_0_0__SRCID__CP_PRIV_REG_FAULT,
> > @@ -4646,6 +4653,7 @@ static int gfx_v11_0_hw_fini(void *handle)
> >
> >       amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
> >       amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
> > +     amdgpu_irq_put(adev, &adev->gfx.bad_op_irq, 0);
> >
> >       if (!adev->no_hw_access) {
> >               if (amdgpu_async_gfx_ring) {
> > @@ -5002,6 +5010,9 @@ static int gfx_v11_0_late_init(void *handle)
> >       if (r)
> >               return r;
> >
> > +     r = amdgpu_irq_get(adev, &adev->gfx.bad_op_irq, 0);
> > +     if (r)
> > +             return r;
> >       return 0;
> >   }
> >
> > @@ -6293,6 +6304,51 @@ static int gfx_v11_0_set_priv_reg_fault_state(struct amdgpu_device *adev,
> >       return 0;
> >   }
> >
> > +static int gfx_v11_0_set_bad_op_fault_state(struct amdgpu_device *adev,
> > +                                         struct amdgpu_irq_src *source,
> > +                                         unsigned type,
> > +                                         enum amdgpu_interrupt_state state)
> > +{
> > +     u32 cp_int_cntl_reg, cp_int_cntl;
> > +     int i , j;
> > +
> > +     switch (state) {
> > +     case AMDGPU_IRQ_STATE_DISABLE:
> > +     case AMDGPU_IRQ_STATE_ENABLE:
>
> That switch is pretty pointless since state can only be disabled or enabled.
>
> Most likely just c&p from an older version of the code and at some point
> lost its relevance.
>
> Apart from that the series looks good to me.

Yeah, all of the other irq functions follow that same model.  If you
feel strongly, I can change it.

Alex

>
> Regards,
> Christian.
>
> > +             for (i = 0; i < adev->gfx.me.num_me; i++) {
> > +                     for (j = 0; j < adev->gfx.me.num_pipe_per_me; j++) {
> > +                             cp_int_cntl_reg = gfx_v11_0_get_cpg_int_cntl(adev, i, j);
> > +
> > +                             if (cp_int_cntl_reg) {
> > +                                     cp_int_cntl = RREG32_SOC15_IP(GC, cp_int_cntl_reg);
> > +                                     cp_int_cntl = REG_SET_FIELD(cp_int_cntl, CP_INT_CNTL_RING0,
> > +                                                                 OPCODE_ERROR_INT_ENABLE,
> > +                                                                 state == AMDGPU_IRQ_STATE_ENABLE ? 1 : 0);
> > +                                     WREG32_SOC15_IP(GC, cp_int_cntl_reg, cp_int_cntl);
> > +                             }
> > +                     }
> > +             }
> > +             for (i = 0; i < adev->gfx.mec.num_mec; i++) {
> > +                     for (j = 0; j < adev->gfx.mec.num_pipe_per_mec; j++) {
> > +                             /* MECs start at 1 */
> > +                             cp_int_cntl_reg = gfx_v11_0_get_cpc_int_cntl(adev, i + 1, j);
> > +
> > +                             if (cp_int_cntl_reg) {
> > +                                     cp_int_cntl = RREG32_SOC15_IP(GC, cp_int_cntl_reg);
> > +                                     cp_int_cntl = REG_SET_FIELD(cp_int_cntl, CP_ME1_PIPE0_INT_CNTL,
> > +                                                                 OPCODE_ERROR_INT_ENABLE,
> > +                                                                 state == AMDGPU_IRQ_STATE_ENABLE ? 1 : 0);
> > +                                     WREG32_SOC15_IP(GC, cp_int_cntl_reg, cp_int_cntl);
> > +                             }
> > +                     }
> > +             }
> > +             break;
> > +     default:
> > +             break;
> > +     }
> > +     return 0;
> > +}
> > +
> >   static int gfx_v11_0_set_priv_inst_fault_state(struct amdgpu_device *adev,
> >                                              struct amdgpu_irq_src *source,
> >                                              unsigned int type,
> > @@ -6369,6 +6425,15 @@ static int gfx_v11_0_priv_reg_irq(struct amdgpu_device *adev,
> >       return 0;
> >   }
> >
> > +static int gfx_v11_0_bad_op_irq(struct amdgpu_device *adev,
> > +                             struct amdgpu_irq_src *source,
> > +                             struct amdgpu_iv_entry *entry)
> > +{
> > +     DRM_ERROR("Illegal opcode in command stream \n");
> > +     gfx_v11_0_handle_priv_fault(adev, entry);
> > +     return 0;
> > +}
> > +
> >   static int gfx_v11_0_priv_inst_irq(struct amdgpu_device *adev,
> >                                  struct amdgpu_irq_src *source,
> >                                  struct amdgpu_iv_entry *entry)
> > @@ -6747,6 +6812,11 @@ static const struct amdgpu_irq_src_funcs gfx_v11_0_priv_reg_irq_funcs = {
> >       .process = gfx_v11_0_priv_reg_irq,
> >   };
> >
> > +static const struct amdgpu_irq_src_funcs gfx_v11_0_bad_op_irq_funcs = {
> > +     .set = gfx_v11_0_set_bad_op_fault_state,
> > +     .process = gfx_v11_0_bad_op_irq,
> > +};
> > +
> >   static const struct amdgpu_irq_src_funcs gfx_v11_0_priv_inst_irq_funcs = {
> >       .set = gfx_v11_0_set_priv_inst_fault_state,
> >       .process = gfx_v11_0_priv_inst_irq,
> > @@ -6764,6 +6834,9 @@ static void gfx_v11_0_set_irq_funcs(struct amdgpu_device *adev)
> >       adev->gfx.priv_reg_irq.num_types = 1;
> >       adev->gfx.priv_reg_irq.funcs = &gfx_v11_0_priv_reg_irq_funcs;
> >
> > +     adev->gfx.bad_op_irq.num_types = 1;
> > +     adev->gfx.bad_op_irq.funcs = &gfx_v11_0_bad_op_irq_funcs;
> > +
> >       adev->gfx.priv_inst_irq.num_types = 1;
> >       adev->gfx.priv_inst_irq.funcs = &gfx_v11_0_priv_inst_irq_funcs;
> >
>