Re: [PATCH v4] drm/amdgpu/gfx9.4.3: Implement compute pipe reset

Alex Deucher <alexdeucher@xxxxxxxxx> · Wed, 28 Aug 2024 10:52:26 -0400

On Wed, Aug 28, 2024 at 3:17 AM Liang, Prike <Prike.Liang@xxxxxxx> wrote:
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
> > From: Lazar, Lijo <Lijo.Lazar@xxxxxxx>
> > Sent: Wednesday, August 28, 2024 2:45 PM
> > To: Liang, Prike <Prike.Liang@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
> > Cc: Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Ma, Le
> > <Le.Ma@xxxxxxx>
> > Subject: Re: [PATCH v4] drm/amdgpu/gfx9.4.3: Implement compute pipe reset
> >
> >
> >
> > On 8/28/2024 10:50 AM, Prike Liang wrote:
> > > Implement the compute pipe reset, and the driver will fallback to pipe
> > > reset when queue reset fails.
> > > The pipe reset only deactivates the queue which is scheduled in the
> > > pipe, and meanwhile the MEC engine will be reset to the firmware
> > > _start pointer. So, it seems pipe reset will cost more cycles than the
> > > queue reset; therefore, the driver tries to recover by doing queue
> > > reset first.
> > >
> > > Signed-off-by: Prike Liang <Prike.Liang@xxxxxxx>
> > > ---
> > >  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h |   5 +
> > >  drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 139
> > > ++++++++++++++++++++----
> > >  2 files changed, 124 insertions(+), 20 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > > index e28c1ebfa98f..d4d74ba2bc27 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > > @@ -143,6 +143,11 @@ struct kiq_pm4_funcs {
> > >                                uint32_t queue_type, uint32_t me_id,
> > >                                uint32_t pipe_id, uint32_t queue_id,
> > >                                uint32_t xcc_id, uint32_t vmid);
> > > +   int (*kiq_reset_hw_pipe)(struct amdgpu_ring *kiq_ring,
> > > +                              uint32_t queue_type, uint32_t me,
> > > +                              uint32_t pipe, uint32_t queue,
> > > +                              uint32_t xcc_id);
> >
> > Missed the addition of this callback in earlier review.
> >
> > The implementation below -
> >       Doesn't use kiq to do a pipe reset. It's looks like a direct hardware
> > reset. Passing a kiq_ring here or defining a callback in kiq  functions doesn't
> > look required unless a pipe reset through kiq is available for other hardware
> > generations.
> >
> >       Also, it uses pipe reset as a fallback when queue unmap fails. So the
> > callback eventually is not used.
> >
> > Is this really needed? For the below implementation, it seems a private
> > function like gfx_v9_4_3_reset_hw_pipe(struct amdgpu_ring *ring) is good
> > enough.
> >
> > Thanks,
> > Lijo
> >
> This KIQ callback is implemented following Alex's software design spec. Maybe original design purpose was design to support the compute user queue.
> But IIRC the compute user queue pipe reset has a similar implementation in the KFD and may not reuse this callback.
>
> Hi, @Deucher, Alexander Could you help comment here and do we need to implement pipe reset in the KIQ callback?

I had originally thought we should use KIQ, but after further
discussions with the HW and FW teams, we ended up using MMIO so I'm
fine to drop the KIQ callback.

Alex

>
> Thanks,
> Prike
> > > +
> > >     /* Packet sizes */
> > >     int set_resources_size;
> > >     int map_queues_size;
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> > > b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> > > index 2067f26d3a9d..f47b55d6f673 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> > > @@ -166,6 +166,10 @@ static int gfx_v9_4_3_get_cu_info(struct
> > amdgpu_device *adev,
> > >                             struct amdgpu_cu_info *cu_info);
> > >  static void gfx_v9_4_3_xcc_set_safe_mode(struct amdgpu_device *adev,
> > > int xcc_id);  static void gfx_v9_4_3_xcc_unset_safe_mode(struct
> > > amdgpu_device *adev, int xcc_id);
> > > +static int gfx_v9_4_3_kiq_reset_hw_pipe(struct amdgpu_ring *kiq_ring,
> > > +                                   uint32_t queue_type, uint32_t me,
> > > +                                   uint32_t pipe, uint32_t queue,
> > > +                                   uint32_t xcc_id);
> > >
> > >  static void gfx_v9_4_3_kiq_set_resources(struct amdgpu_ring *kiq_ring,
> > >                             uint64_t queue_mask)
> > > @@ -323,6 +327,7 @@ static const struct kiq_pm4_funcs
> > gfx_v9_4_3_kiq_pm4_funcs = {
> > >     .kiq_query_status = gfx_v9_4_3_kiq_query_status,
> > >     .kiq_invalidate_tlbs = gfx_v9_4_3_kiq_invalidate_tlbs,
> > >     .kiq_reset_hw_queue = gfx_v9_4_3_kiq_reset_hw_queue,
> > > +   .kiq_reset_hw_pipe = gfx_v9_4_3_kiq_reset_hw_pipe,
> > >     .set_resources_size = 8,
> > >     .map_queues_size = 7,
> > >     .unmap_queues_size = 6,
> > > @@ -3466,6 +3471,101 @@ static void gfx_v9_4_3_emit_wave_limit(struct
> > amdgpu_ring *ring, bool enable)
> > >     }
> > >  }
> > >
> > > +static int gfx_v9_4_3_unmap_done(struct amdgpu_device *adev, uint32_t
> > me,
> > > +                           uint32_t pipe, uint32_t queue,
> > > +                           uint32_t xcc_id)
> > > +{
> > > +   int i, r;
> > > +   /* make sure dequeue is complete*/
> > > +   gfx_v9_4_3_xcc_set_safe_mode(adev, xcc_id);
> > > +   mutex_lock(&adev->srbm_mutex);
> > > +   soc15_grbm_select(adev, me, pipe, queue, 0, GET_INST(GC, xcc_id));
> > > +   for (i = 0; i < adev->usec_timeout; i++) {
> > > +           if (!(RREG32_SOC15(GC, GET_INST(GC, xcc_id),
> > regCP_HQD_ACTIVE) & 1))
> > > +                   break;
> > > +           udelay(1);
> > > +   }
> > > +   if (i >= adev->usec_timeout)
> > > +           r = -ETIMEDOUT;
> > > +   else
> > > +           r = 0;
> > > +   soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));
> > > +   mutex_unlock(&adev->srbm_mutex);
> > > +   gfx_v9_4_3_xcc_unset_safe_mode(adev, xcc_id);
> > > +
> > > +   return r;
> > > +
> > > +}
> > > +
> > > +static bool gfx_v9_4_3_pipe_reset_support(struct amdgpu_device *adev)
> > > +{
> > > +   /*TODO: Need check gfx9.4.4 mec fw whether supports pipe reset as
> > well.*/
> > > +   if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) &&
> > > +                   adev->gfx.mec_fw_version >= 0x0000009b)
> > > +           return true;
> > > +   else
> > > +           dev_warn_once(adev->dev, "Please use the latest MEC
> > version to see
> > > +whether support pipe reset\n");
> > > +
> > > +   return false;
> > > +}
> > > +
> > > +static int gfx_v9_4_3_kiq_reset_hw_pipe(struct amdgpu_ring *kiq_ring,
> > > +                                   uint32_t queue_type, uint32_t me,
> > > +                                   uint32_t pipe, uint32_t queue,
> > > +                                   uint32_t xcc_id)
> > > +{
> > > +   struct amdgpu_device *adev = kiq_ring->adev;
> > > +   uint32_t reset_pipe, clean_pipe;
> > > +   int r;
> > > +
> > > +   if (!gfx_v9_4_3_pipe_reset_support(adev))
> > > +           return -EINVAL;
> > > +
> > > +   gfx_v9_4_3_xcc_set_safe_mode(adev, xcc_id);
> > > +   mutex_lock(&adev->srbm_mutex);
> > > +
> > > +   reset_pipe = RREG32_SOC15(GC, GET_INST(GC, xcc_id),
> > regCP_MEC_CNTL);
> > > +   clean_pipe = reset_pipe;
> > > +
> > > +   if (me == 1) {
> > > +           switch (pipe) {
> > > +           case 0:
> > > +                   reset_pipe = REG_SET_FIELD(reset_pipe,
> > CP_MEC_CNTL,
> > > +                                              MEC_ME1_PIPE0_RESET, 1);
> > > +                   break;
> > > +           case 1:
> > > +                   reset_pipe = REG_SET_FIELD(reset_pipe,
> > CP_MEC_CNTL,
> > > +                                              MEC_ME1_PIPE1_RESET, 1);
> > > +                   break;
> > > +           case 2:
> > > +                   reset_pipe = REG_SET_FIELD(reset_pipe,
> > CP_MEC_CNTL,
> > > +                                              MEC_ME1_PIPE2_RESET, 1);
> > > +                   break;
> > > +           case 3:
> > > +                   reset_pipe = REG_SET_FIELD(reset_pipe,
> > CP_MEC_CNTL,
> > > +                                              MEC_ME1_PIPE3_RESET, 1);
> > > +                   break;
> > > +           default:
> > > +                   break;
> > > +           }
> > > +   } else {
> > > +           if (pipe)
> > > +                   reset_pipe = REG_SET_FIELD(reset_pipe,
> > CP_MEC_CNTL,
> > > +                                              MEC_ME2_PIPE1_RESET, 1);
> > > +           else
> > > +                   reset_pipe = REG_SET_FIELD(reset_pipe,
> > CP_MEC_CNTL,
> > > +                                              MEC_ME2_PIPE0_RESET, 1);
> > > +   }
> > > +
> > > +   WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL,
> > reset_pipe);
> > > +   WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL,
> > clean_pipe);
> > > +   mutex_unlock(&adev->srbm_mutex);
> > > +   gfx_v9_4_3_xcc_unset_safe_mode(adev, xcc_id);
> > > +
> > > +   r = gfx_v9_4_3_unmap_done(adev, me, pipe, queue, xcc_id);
> > > +   return r;
> > > +}
> > > +
> > >  static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring,
> > >                             unsigned int vmid)
> > >  {
> > > @@ -3473,7 +3573,7 @@ static int gfx_v9_4_3_reset_kcq(struct
> > amdgpu_ring *ring,
> > >     struct amdgpu_kiq *kiq = &adev->gfx.kiq[ring->xcc_id];
> > >     struct amdgpu_ring *kiq_ring = &kiq->ring;
> > >     unsigned long flags;
> > > -   int r, i;
> > > +   int r;
> > >
> > >     if (amdgpu_sriov_vf(adev))
> > >             return -EINVAL;
> > > @@ -3495,26 +3595,25 @@ static int gfx_v9_4_3_reset_kcq(struct
> > amdgpu_ring *ring,
> > >     spin_unlock_irqrestore(&kiq->ring_lock, flags);
> > >
> > >     r = amdgpu_ring_test_ring(kiq_ring);
> > > -   if (r)
> > > -           return r;
> > > -
> > > -   /* make sure dequeue is complete*/
> > > -   amdgpu_gfx_rlc_enter_safe_mode(adev, ring->xcc_id);
> > > -   mutex_lock(&adev->srbm_mutex);
> > > -   soc15_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0,
> > GET_INST(GC, ring->xcc_id));
> > > -   for (i = 0; i < adev->usec_timeout; i++) {
> > > -           if (!(RREG32_SOC15(GC, 0, regCP_HQD_ACTIVE) & 1))
> > > -                   break;
> > > -           udelay(1);
> > > -   }
> > > -   if (i >= adev->usec_timeout)
> > > -           r = -ETIMEDOUT;
> > > -   soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, ring->xcc_id));
> > > -   mutex_unlock(&adev->srbm_mutex);
> > > -   amdgpu_gfx_rlc_exit_safe_mode(adev, ring->xcc_id);
> > >     if (r) {
> > > -           dev_err(adev->dev, "fail to wait on hqd deactive\n");
> > > -           return r;
> > > +           dev_err(adev->dev, "kiq ring test failed after ring: %s queue
> > reset\n",
> > > +                           ring->name);
> > > +           goto pipe_reset;
> > > +   }
> > > +
> > > +   r = gfx_v9_4_3_unmap_done(adev, ring->me, ring->pipe, ring-
> > >queue, ring->xcc_id);
> > > +   if (r)
> > > +           dev_err(adev->dev, "fail to wait on hqd deactive and will try
> > pipe
> > > +reset\n");
> > > +
> > > +pipe_reset:
> > > +   if(r) {
> > > +           r = gfx_v9_4_3_kiq_reset_hw_pipe(kiq_ring, ring->funcs-
> > >type,
> > > +                                           ring->me, ring->pipe,
> > > +                                           ring->queue, ring->xcc_id);
> > > +           dev_info(adev->dev, "ring: %s pipe reset :%s\n", ring->name,
> > > +                           r ? "failed" : "successfully");
> > > +           if (r)
> > > +                   return r;
> > >     }
> > >
> > >     r = amdgpu_bo_reserve(ring->mqd_obj, false);