RE: [PATCH v5] drm/amdgpu/gfx9.4.3: Implement compute pipe reset

"Liang, Prike" <Prike.Liang@xxxxxxx> · Thu, 29 Aug 2024 08:50:07 +0000

[AMD Official Use Only - AMD Internal Distribution Only]

Thanks for the detail review, and I will update those before pushing the commit.

Thanks,
Prike

> -----Original Message-----
> From: Lazar, Lijo <Lijo.Lazar@xxxxxxx>
> Sent: Thursday, August 29, 2024 4:13 PM
> To: Liang, Prike <Prike.Liang@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
> Cc: Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Ma, Le
> <Le.Ma@xxxxxxx>
> Subject: Re: [PATCH v5] drm/amdgpu/gfx9.4.3: Implement compute pipe reset
>
>
>
> On 8/29/2024 9:17 AM, Prike Liang wrote:
> > Implement the compute pipe reset, and the driver will fallback to pipe
> > reset when queue reset fails.
> > The pipe reset only deactivates the queue which is scheduled in the
> > pipe, and meanwhile the MEC engine will be reset to the firmware
> > _start pointer. So,
>
> May refine this to indicate that reset to _start is for the specific pipe and not
> applicable for the whole MEC engine.
>
> > it seems pipe reset will cost more cycles than the queue reset;
> > therefore, the driver tries to recover by doing queue reset first.
> >
> > Signed-off-by: Prike Liang <Prike.Liang@xxxxxxx>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 127
> > ++++++++++++++++++++----
> >  1 file changed, 108 insertions(+), 19 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> > b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> > index 2067f26d3a9d..26ae62d2a752 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> > @@ -3466,6 +3466,98 @@ static void gfx_v9_4_3_emit_wave_limit(struct
> amdgpu_ring *ring, bool enable)
> >     }
> >  }
> >
> > +static int gfx_v9_4_3_unmap_done(struct amdgpu_device *adev, uint32_t
> me,
> > +                           uint32_t pipe, uint32_t queue,
> > +                           uint32_t xcc_id)
> > +{
> > +   int i, r;
> > +   /* make sure dequeue is complete*/
> > +   gfx_v9_4_3_xcc_set_safe_mode(adev, xcc_id);
> > +   mutex_lock(&adev->srbm_mutex);
> > +   soc15_grbm_select(adev, me, pipe, queue, 0, GET_INST(GC, xcc_id));
> > +   for (i = 0; i < adev->usec_timeout; i++) {
> > +           if (!(RREG32_SOC15(GC, GET_INST(GC, xcc_id),
> regCP_HQD_ACTIVE) & 1))
> > +                   break;
> > +           udelay(1);
> > +   }
> > +   if (i >= adev->usec_timeout)
> > +           r = -ETIMEDOUT;
> > +   else
> > +           r = 0;
> > +   soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));
> > +   mutex_unlock(&adev->srbm_mutex);
> > +   gfx_v9_4_3_xcc_unset_safe_mode(adev, xcc_id);
> > +
> > +   return r;
> > +
> > +}
> > +
> > +static bool gfx_v9_4_3_pipe_reset_support(struct amdgpu_device *adev)
> > +{
> > +   /*TODO: Need check gfx9.4.4 mec fw whether supports pipe reset as
> well.*/
> > +   if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) &&
> > +                   adev->gfx.mec_fw_version >= 0x0000009b)
> > +           return true;
> > +   else
> > +           dev_warn_once(adev->dev, "Please use the latest MEC
> version to see
> > +whether support pipe reset\n");
> > +
> > +   return false;
> > +}
> > +
> > +static int gfx_v9_4_3_kiq_reset_hw_pipe(struct amdgpu_ring *ring)
>
> Please drop the kiq name in this function to avoid confusion. It's not
> restricted to kiq.
>
> With those
>
>       Reviewed-by: Lijo Lazar <lijo.lazar@xxxxxxx>
>
> Thanks,
> Lijo
>
> > +{
> > +   struct amdgpu_device *adev = ring->adev;
> > +   uint32_t reset_pipe, clean_pipe;
> > +   int r;
> > +
> > +   if (!gfx_v9_4_3_pipe_reset_support(adev))
> > +           return -EINVAL;
> > +
> > +   gfx_v9_4_3_xcc_set_safe_mode(adev, ring->xcc_id);
> > +   mutex_lock(&adev->srbm_mutex);
> > +
> > +   reset_pipe = RREG32_SOC15(GC, GET_INST(GC, ring->xcc_id),
> regCP_MEC_CNTL);
> > +   clean_pipe = reset_pipe;
> > +
> > +   if (ring->me == 1) {
> > +           switch (ring->pipe) {
> > +           case 0:
> > +                   reset_pipe = REG_SET_FIELD(reset_pipe,
> CP_MEC_CNTL,
> > +                                              MEC_ME1_PIPE0_RESET, 1);
> > +                   break;
> > +           case 1:
> > +                   reset_pipe = REG_SET_FIELD(reset_pipe,
> CP_MEC_CNTL,
> > +                                              MEC_ME1_PIPE1_RESET, 1);
> > +                   break;
> > +           case 2:
> > +                   reset_pipe = REG_SET_FIELD(reset_pipe,
> CP_MEC_CNTL,
> > +                                              MEC_ME1_PIPE2_RESET, 1);
> > +                   break;
> > +           case 3:
> > +                   reset_pipe = REG_SET_FIELD(reset_pipe,
> CP_MEC_CNTL,
> > +                                              MEC_ME1_PIPE3_RESET, 1);
> > +                   break;
> > +           default:
> > +                   break;
> > +           }
> > +   } else {
> > +           if (ring->pipe)
> > +                   reset_pipe = REG_SET_FIELD(reset_pipe,
> CP_MEC_CNTL,
> > +                                              MEC_ME2_PIPE1_RESET, 1);
> > +           else
> > +                   reset_pipe = REG_SET_FIELD(reset_pipe,
> CP_MEC_CNTL,
> > +                                              MEC_ME2_PIPE0_RESET, 1);
> > +   }
> > +
> > +   WREG32_SOC15(GC, GET_INST(GC, ring->xcc_id), regCP_MEC_CNTL,
> reset_pipe);
> > +   WREG32_SOC15(GC, GET_INST(GC, ring->xcc_id), regCP_MEC_CNTL,
> clean_pipe);
> > +   mutex_unlock(&adev->srbm_mutex);
> > +   gfx_v9_4_3_xcc_unset_safe_mode(adev, ring->xcc_id);
> > +
> > +   r = gfx_v9_4_3_unmap_done(adev, ring->me, ring->pipe, ring-
> >queue, ring->xcc_id);
> > +   return r;
> > +}
> > +
> >  static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring,
> >                             unsigned int vmid)
> >  {
> > @@ -3473,7 +3565,7 @@ static int gfx_v9_4_3_reset_kcq(struct
> amdgpu_ring *ring,
> >     struct amdgpu_kiq *kiq = &adev->gfx.kiq[ring->xcc_id];
> >     struct amdgpu_ring *kiq_ring = &kiq->ring;
> >     unsigned long flags;
> > -   int r, i;
> > +   int r;
> >
> >     if (amdgpu_sriov_vf(adev))
> >             return -EINVAL;
> > @@ -3495,26 +3587,23 @@ static int gfx_v9_4_3_reset_kcq(struct
> amdgpu_ring *ring,
> >     spin_unlock_irqrestore(&kiq->ring_lock, flags);
> >
> >     r = amdgpu_ring_test_ring(kiq_ring);
> > +   if (r) {
> > +           dev_err(adev->dev, "kiq ring test failed after ring: %s queue
> reset\n",
> > +                           ring->name);
> > +           goto pipe_reset;
> > +   }
> > +
> > +   r = gfx_v9_4_3_unmap_done(adev, ring->me, ring->pipe, ring-
> >queue,
> > +ring->xcc_id);
> >     if (r)
> > -           return r;
> > +           dev_err(adev->dev, "fail to wait on hqd deactive and will try
> pipe
> > +reset\n");
> >
> > -   /* make sure dequeue is complete*/
> > -   amdgpu_gfx_rlc_enter_safe_mode(adev, ring->xcc_id);
> > -   mutex_lock(&adev->srbm_mutex);
> > -   soc15_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0,
> GET_INST(GC, ring->xcc_id));
> > -   for (i = 0; i < adev->usec_timeout; i++) {
> > -           if (!(RREG32_SOC15(GC, 0, regCP_HQD_ACTIVE) & 1))
> > -                   break;
> > -           udelay(1);
> > -   }
> > -   if (i >= adev->usec_timeout)
> > -           r = -ETIMEDOUT;
> > -   soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, ring->xcc_id));
> > -   mutex_unlock(&adev->srbm_mutex);
> > -   amdgpu_gfx_rlc_exit_safe_mode(adev, ring->xcc_id);
> > -   if (r) {
> > -           dev_err(adev->dev, "fail to wait on hqd deactive\n");
> > -           return r;
> > +pipe_reset:
> > +   if(r) {
> > +           r = gfx_v9_4_3_kiq_reset_hw_pipe(ring);
> > +           dev_info(adev->dev, "ring: %s pipe reset :%s\n", ring->name,
> > +                           r ? "failed" : "successfully");
> > +           if (r)
> > +                   return r;
> >     }
> >
> >     r = amdgpu_bo_reserve(ring->mqd_obj, false);