RE: [PATCH] drm/amdgpu/gfx9.4.3: Implement compute pipe reset

"Liang, Prike" <Prike.Liang@xxxxxxx> · Tue, 20 Aug 2024 10:17:46 +0000



[AMD Official Use Only - AMD Internal Distribution Only]

> From: Ma, Le <Le.Ma@xxxxxxx>
> Sent: Tuesday, August 20, 2024 5:38 PM
> To: Liang, Prike <Prike.Liang@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
> Cc: Deucher, Alexander <Alexander.Deucher@xxxxxxx>
> Subject: RE: [PATCH] drm/amdgpu/gfx9.4.3: Implement compute pipe reset
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
> > -----Original Message-----
> > From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of
> > Prike Liang
> > Sent: Tuesday, August 20, 2024 4:58 PM
> > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx
> > Cc: Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Liang, Prike
> > <Prike.Liang@xxxxxxx>
> > Subject: [PATCH] drm/amdgpu/gfx9.4.3: Implement compute pipe reset
> >
> > Implement the compute pipe reset and driver will fallback to pipe
> > reset when queue reset failed.
> >
> > Signed-off-by: Prike Liang <Prike.Liang@xxxxxxx>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h |   5 +
> >  drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 153
> > ++++++++++++++++++++----
> >  2 files changed, 138 insertions(+), 20 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > index e28c1ebfa98f..d4d74ba2bc27 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > @@ -143,6 +143,11 @@ struct kiq_pm4_funcs {
> >                                  uint32_t queue_type, uint32_t me_id,
> >                                  uint32_t pipe_id, uint32_t queue_id,
> >                                  uint32_t xcc_id, uint32_t vmid);
> > +     int (*kiq_reset_hw_pipe)(struct amdgpu_ring *kiq_ring,
> > +                                uint32_t queue_type, uint32_t me,
> > +                                uint32_t pipe, uint32_t queue,
> > +                                uint32_t xcc_id);
> > +
> >       /* Packet sizes */
> >       int set_resources_size;
> >       int map_queues_size;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> > b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> > index 2067f26d3a9d..09caa5a1842b 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> > @@ -166,6 +166,10 @@ static int gfx_v9_4_3_get_cu_info(struct
> > amdgpu_device *adev,
> >                               struct amdgpu_cu_info *cu_info);  static
> > void gfx_v9_4_3_xcc_set_safe_mode(struct amdgpu_device *adev, int
> > xcc_id);  static void gfx_v9_4_3_xcc_unset_safe_mode(struct
> > amdgpu_device *adev, int xcc_id);
> > +static int gfx_v9_4_3_kiq_reset_hw_pipe(struct amdgpu_ring *kiq_ring,
> > +                                     uint32_t queue_type, uint32_t me,
> > +                                     uint32_t pipe, uint32_t queue,
> > +                                     uint32_t xcc_id);
> >
> >  static void gfx_v9_4_3_kiq_set_resources(struct amdgpu_ring *kiq_ring,
> >                               uint64_t queue_mask) @@ -323,6 +327,7 @@
> > static const struct kiq_pm4_funcs gfx_v9_4_3_kiq_pm4_funcs = {
> >       .kiq_query_status = gfx_v9_4_3_kiq_query_status,
> >       .kiq_invalidate_tlbs = gfx_v9_4_3_kiq_invalidate_tlbs,
> >       .kiq_reset_hw_queue = gfx_v9_4_3_kiq_reset_hw_queue,
> > +     .kiq_reset_hw_pipe = gfx_v9_4_3_kiq_reset_hw_pipe,
> >       .set_resources_size = 8,
> >       .map_queues_size = 7,
> >       .unmap_queues_size = 6,
> > @@ -3466,6 +3471,115 @@ static void gfx_v9_4_3_emit_wave_limit(struct
> > amdgpu_ring *ring, bool enable)
> >       }
> >  }
> >
> > +static int gfx_v9_4_3_unmap_done(struct amdgpu_device *adev, uint32_t
> me,
> > +                             uint32_t pipe, uint32_t queue,
> > +                             uint32_t xcc_id) {
> > +     int i, r;
> > +     /* make sure dequeue is complete*/
> > +     gfx_v9_4_3_xcc_set_safe_mode(adev, xcc_id);
> > +     mutex_lock(&adev->srbm_mutex);
> > +     soc15_grbm_select(adev, me, pipe, queue, 0, GET_INST(GC, xcc_id));
> > +     for (i = 0; i < adev->usec_timeout; i++) {
> > +             if (!(RREG32_SOC15(GC, 0, regCP_HQD_ACTIVE) & 1))
>
> Should GET_INST be used to locate the target xcc_id here?
>
Thanks for pointing this out, it requires to covert the GC logical instance to physical instance on this ASIC and will address it in later version.

> > +                     break;
> > +             udelay(1);
> > +     }
> > +     if (i >= adev->usec_timeout)
> > +             r = -ETIMEDOUT;
> > +     else
> > +             r = 0;
> > +     soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));
> > +     mutex_unlock(&adev->srbm_mutex);
> > +     gfx_v9_4_3_xcc_unset_safe_mode(adev, xcc_id);
> > +
> > +     return r;
> > +
> > +}
> > +
> > +static bool gfx_v9_4_3_pipe_reset_support(struct amdgpu_device *adev)
> > +{
> > +
> > +     if (unlikely(adev->gfx.mec_fw_version < 0x0000009b)) {
> > +             DRM_WARN_ONCE("MEC firmware version too old, please use
> > FW no older than 155!\n");
> > +             return false;
> > +     }
> > +
> > +     return true;
> > +}
> > +
> > +static int gfx_v9_4_3_kiq_reset_hw_pipe(struct amdgpu_ring *kiq_ring,
> > +                                     uint32_t queue_type, uint32_t me,
> > +                                     uint32_t pipe, uint32_t queue,
> > +                                     uint32_t xcc_id) {
> > +     struct amdgpu_device *adev = kiq_ring->adev;
> > +     uint32_t reset_pipe, clean_pipe;
> > +     int r;
> > +
> > +     if (!gfx_v9_4_3_pipe_reset_support(adev))
> > +             return -EINVAL;
> > +
> > +     gfx_v9_4_3_xcc_set_safe_mode(adev, xcc_id);
> > +     mutex_lock(&adev->srbm_mutex);
> > +     soc15_grbm_select(adev, me, pipe, queue, 0, GET_INST(GC,
> > + xcc_id));
> > +
> > +     reset_pipe = RREG32_SOC15(GC, 0, regCP_MEC_CNTL);
>
> Here the xcc_id is also missed to address.
>
> > +     clean_pipe = reset_pipe;
> > +
> > +     if (me == 1) {
> > +             switch (pipe) {
> > +             case 0:
> > +                     reset_pipe = REG_SET_FIELD(reset_pipe,
> > CP_MEC_CNTL,
> > +                                                MEC_ME1_PIPE0_RESET, 1);
> > +                     clean_pipe = REG_SET_FIELD(clean_pipe,
> > CP_MEC_CNTL,
> > +                                                MEC_ME1_PIPE0_RESET, 0);
> > +                     break;
> > +             case 1:
> > +                     reset_pipe = REG_SET_FIELD(reset_pipe,
> > CP_MEC_CNTL,
> > +                                                MEC_ME1_PIPE1_RESET, 1);
> > +                     clean_pipe = REG_SET_FIELD(clean_pipe,
> > CP_MEC_CNTL,
> > +                                                MEC_ME1_PIPE1_RESET, 0);
> > +                     break;
> > +             case 2:
> > +                     reset_pipe = REG_SET_FIELD(reset_pipe,
> > CP_MEC_CNTL,
> > +                                                MEC_ME1_PIPE2_RESET, 1);
> > +                     clean_pipe = REG_SET_FIELD(clean_pipe,
> > CP_MEC_CNTL,
> > +                                                MEC_ME1_PIPE2_RESET, 0);
> > +                     break;
> > +             case 3:
> > +                     reset_pipe = REG_SET_FIELD(reset_pipe,
> > CP_MEC_CNTL,
> > +                                                MEC_ME1_PIPE3_RESET, 1);
> > +                     clean_pipe = REG_SET_FIELD(clean_pipe,
> > CP_MEC_CNTL,
> > +                                                MEC_ME1_PIPE3_RESET, 0);
> > +                     break;
> > +             default:
> > +                     break;
> > +             }
> > +     } else {
> > +             if (pipe) {
> > +                     reset_pipe = REG_SET_FIELD(reset_pipe,
> > CP_MEC_CNTL,
> > +                                                MEC_ME2_PIPE1_RESET, 1);
> > +                     clean_pipe = REG_SET_FIELD(clean_pipe,
> > CP_MEC_CNTL,
> > +                                                MEC_ME2_PIPE1_RESET, 0);
> > +             } else {
> > +                     reset_pipe = REG_SET_FIELD(reset_pipe,
> > CP_MEC_CNTL,
> > +                                                MEC_ME2_PIPE0_RESET, 1);
> > +                     clean_pipe = REG_SET_FIELD(clean_pipe,
> > CP_MEC_CNTL,
> > +                                                MEC_ME2_PIPE0_RESET, 0);
> > +             }
> > +     }
> > +
> > +     WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL,
> > reset_pipe);
> > +     WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL,
> > clean_pipe);
> > +     soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));
> > +     mutex_unlock(&adev->srbm_mutex);
> > +     gfx_v9_4_3_xcc_unset_safe_mode(adev, xcc_id);
> > +
> > +     r = gfx_v9_4_3_unmap_done(adev, me, pipe, queue, xcc_id);
> > +     return r;
> > +}
> > +
> >  static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring,
> >                               unsigned int vmid)  { @@ -3473,7 +3587,7
> > @@ static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring,
> >       struct amdgpu_kiq *kiq = &adev->gfx.kiq[ring->xcc_id];
> >       struct amdgpu_ring *kiq_ring = &kiq->ring;
> >       unsigned long flags;
> > -     int r, i;
> > +     int r;
> >
> >       if (amdgpu_sriov_vf(adev))
> >               return -EINVAL;
> > @@ -3495,26 +3609,25 @@ static int gfx_v9_4_3_reset_kcq(struct
> > amdgpu_ring *ring,
> >       spin_unlock_irqrestore(&kiq->ring_lock, flags);
> >
> >       r = amdgpu_ring_test_ring(kiq_ring);
> > -     if (r)
> > -             return r;
> > -
> > -     /* make sure dequeue is complete*/
> > -     amdgpu_gfx_rlc_enter_safe_mode(adev, ring->xcc_id);
> > -     mutex_lock(&adev->srbm_mutex);
> > -     soc15_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0,
> > GET_INST(GC, ring->xcc_id));
> > -     for (i = 0; i < adev->usec_timeout; i++) {
> > -             if (!(RREG32_SOC15(GC, 0, regCP_HQD_ACTIVE) & 1))
> > -                     break;
> > -             udelay(1);
> > -     }
> > -     if (i >= adev->usec_timeout)
> > -             r = -ETIMEDOUT;
> > -     soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, ring->xcc_id));
> > -     mutex_unlock(&adev->srbm_mutex);
> > -     amdgpu_gfx_rlc_exit_safe_mode(adev, ring->xcc_id);
> >       if (r) {
> > -             dev_err(adev->dev, "fail to wait on hqd deactive\n");
> > -             return r;
> > +             DRM_ERROR("kiq ring test failed after ring: %s queue reset\n",
> > +                             ring->name);
>
> Normally the dev_xxx print is used to trace the socket number where error
> happens.
>
> Thanks.
>
> > +             goto pipe_reset;
> > +     }
> > +
> > +     r = gfx_v9_4_3_unmap_done(adev, ring->me, ring->pipe,
> > + ring->queue,
> > ring->xcc_id);
> > +     if (r)
> > +             DRM_ERROR("fail to wait on hqd deactive and will try
> > + pipe
> > reset\n");
> > +
> > +pipe_reset:
> > +     if(r) {
> > +             r = gfx_v9_4_3_kiq_reset_hw_pipe(kiq_ring, ring->funcs->type,
> > +                                             ring->me, ring->pipe,
> > +                                             ring->queue, ring->xcc_id);
> > +             DRM_INFO("ring: %s pipe reset :%s\n", ring->name,
> > +                             r ? "failed" : "successfully");
> > +             if (r)
> > +                     return r;
> >       }
> >
> >       r = amdgpu_bo_reserve(ring->mqd_obj, false);
> > --
> > 2.34.1
>