RE: [PATCH v3] drm/amdgpu/gfx9.4.3: Implement compute pipe reset

"Liang, Prike" <Prike.Liang@xxxxxxx> · Wed, 28 Aug 2024 03:18:10 +0000



[AMD Official Use Only - AMD Internal Distribution Only]

> From: Lazar, Lijo <Lijo.Lazar@xxxxxxx>
> Sent: Tuesday, August 27, 2024 4:02 PM
> To: Liang, Prike <Prike.Liang@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
> Cc: Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Ma, Le
> <Le.Ma@xxxxxxx>
> Subject: Re: [PATCH v3] drm/amdgpu/gfx9.4.3: Implement compute pipe reset
>
>
>
> On 8/22/2024 3:08 PM, Prike Liang wrote:
> > Implement the compute pipe reset; the driver falls back to pipe
> > reset when queue reset fails.
> >
> > Signed-off-by: Prike Liang <Prike.Liang@xxxxxxx>
> > ---
> > v3: Use the dev log and filter out the gfx9.4.4 pipe reset support.
> > v2: Convert the GC logical instance to the physical instance when
> >     accessing registers and use the dev_* print to specify
> >     the failed device.
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h |   5 +
> >  drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 154
> > +++++++++++++++++++++---
> >  2 files changed, 139 insertions(+), 20 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > index e28c1ebfa98f..d4d74ba2bc27 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > @@ -143,6 +143,11 @@ struct kiq_pm4_funcs {
> >                                uint32_t queue_type, uint32_t me_id,
> >                                uint32_t pipe_id, uint32_t queue_id,
> >                                uint32_t xcc_id, uint32_t vmid);
> > +   int (*kiq_reset_hw_pipe)(struct amdgpu_ring *kiq_ring,
> > +                              uint32_t queue_type, uint32_t me,
> > +                              uint32_t pipe, uint32_t queue,
> > +                              uint32_t xcc_id);
> > +
> >     /* Packet sizes */
> >     int set_resources_size;
> >     int map_queues_size;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> > b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> > index 2067f26d3a9d..aa0c76eed452 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> > @@ -166,6 +166,10 @@ static int gfx_v9_4_3_get_cu_info(struct
> amdgpu_device *adev,
> >                             struct amdgpu_cu_info *cu_info);
> >  static void gfx_v9_4_3_xcc_set_safe_mode(struct amdgpu_device *adev,
> > int xcc_id);  static void gfx_v9_4_3_xcc_unset_safe_mode(struct
> > amdgpu_device *adev, int xcc_id);
> > +static int gfx_v9_4_3_kiq_reset_hw_pipe(struct amdgpu_ring *kiq_ring,
> > +                                   uint32_t queue_type, uint32_t me,
> > +                                   uint32_t pipe, uint32_t queue,
> > +                                   uint32_t xcc_id);
> >
> >  static void gfx_v9_4_3_kiq_set_resources(struct amdgpu_ring *kiq_ring,
> >                             uint64_t queue_mask)
> > @@ -323,6 +327,7 @@ static const struct kiq_pm4_funcs
> gfx_v9_4_3_kiq_pm4_funcs = {
> >     .kiq_query_status = gfx_v9_4_3_kiq_query_status,
> >     .kiq_invalidate_tlbs = gfx_v9_4_3_kiq_invalidate_tlbs,
> >     .kiq_reset_hw_queue = gfx_v9_4_3_kiq_reset_hw_queue,
> > +   .kiq_reset_hw_pipe = gfx_v9_4_3_kiq_reset_hw_pipe,
> >     .set_resources_size = 8,
> >     .map_queues_size = 7,
> >     .unmap_queues_size = 6,
> > @@ -3466,6 +3471,116 @@ static void gfx_v9_4_3_emit_wave_limit(struct
> amdgpu_ring *ring, bool enable)
> >     }
> >  }
> >
> > +static int gfx_v9_4_3_unmap_done(struct amdgpu_device *adev, uint32_t
> me,
> > +                           uint32_t pipe, uint32_t queue,
> > +                           uint32_t xcc_id)
> > +{
> > +   int i, r;
> > +   /* make sure dequeue is complete*/
> > +   gfx_v9_4_3_xcc_set_safe_mode(adev, xcc_id);
> > +   mutex_lock(&adev->srbm_mutex);
> > +   soc15_grbm_select(adev, me, pipe, queue, 0, GET_INST(GC, xcc_id));
> > +   for (i = 0; i < adev->usec_timeout; i++) {
> > +           if (!(RREG32_SOC15(GC, GET_INST(GC, xcc_id),
> regCP_HQD_ACTIVE) & 1))
> > +                   break;
> > +           udelay(1);
> > +   }
> > +   if (i >= adev->usec_timeout)
> > +           r = -ETIMEDOUT;
> > +   else
> > +           r = 0;
> > +   soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));
> > +   mutex_unlock(&adev->srbm_mutex);
> > +   gfx_v9_4_3_xcc_unset_safe_mode(adev, xcc_id);
> > +
> > +   return r;
> > +
> > +}
> > +
> > +static bool gfx_v9_4_3_pipe_reset_support(struct amdgpu_device *adev)
> > +{
> > +   /*TODO: Need check gfx9.4.4 mec fw whether supports pipe reset as
> well.*/
> > +   if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) &&
> > +                   adev->gfx.mec_fw_version >= 0x0000009b)
> > +           return true;
> > +   else
> > +           dev_warn_once(adev->dev, "Please use the latest MEC
> version to see
> > +whether support pipe reset\n");
> > +
> > +   return false;
> > +}
> > +
> > +static int gfx_v9_4_3_kiq_reset_hw_pipe(struct amdgpu_ring *kiq_ring,
> > +                                   uint32_t queue_type, uint32_t me,
> > +                                   uint32_t pipe, uint32_t queue,
> > +                                   uint32_t xcc_id)
> > +{
> > +   struct amdgpu_device *adev = kiq_ring->adev;
> > +   uint32_t reset_pipe, clean_pipe;
> > +   int r;
> > +
> > +   if (!gfx_v9_4_3_pipe_reset_support(adev))
> > +           return -EINVAL;
> > +
> > +   gfx_v9_4_3_xcc_set_safe_mode(adev, xcc_id);
> > +   mutex_lock(&adev->srbm_mutex);
> > +   soc15_grbm_select(adev, me, pipe, queue, 0, GET_INST(GC, xcc_id));
>
> MEC_CNTL is not a per-queue register, so I'm not sure selecting the queue
> makes sense here. The mutex may be taken so that resets for other queues in
> the same XCC don't run in parallel.
>