[AMD Official Use Only - AMD Internal Distribution Only] > From: Lazar, Lijo <Lijo.Lazar@xxxxxxx> > Sent: Tuesday, August 27, 2024 4:02 PM > To: Liang, Prike <Prike.Liang@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Ma, Le > <Le.Ma@xxxxxxx> > Subject: Re: [PATCH v3] drm/amdgpu/gfx9.4.3: Implement compute pipe reset > > > > On 8/22/2024 3:08 PM, Prike Liang wrote: > > Implement the compute pipe reset and driver will fallback to pipe > > reset when queue reset failed. > > > > Signed-off-by: Prike Liang <Prike.Liang@xxxxxxx> > > --- > > v3: Use the dev log and filer out the gfx9.4.4 pipe reset support. > > v2: Convert the GC logic instance to physical instance in the > > register accessing process and use the dev_* print to specify > > the failed device. > > --- > > drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 5 + > > drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 154 > > +++++++++++++++++++++--- > > 2 files changed, 139 insertions(+), 20 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > > index e28c1ebfa98f..d4d74ba2bc27 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > > @@ -143,6 +143,11 @@ struct kiq_pm4_funcs { > > uint32_t queue_type, uint32_t me_id, > > uint32_t pipe_id, uint32_t queue_id, > > uint32_t xcc_id, uint32_t vmid); > > + int (*kiq_reset_hw_pipe)(struct amdgpu_ring *kiq_ring, > > + uint32_t queue_type, uint32_t me, > > + uint32_t pipe, uint32_t queue, > > + uint32_t xcc_id); > > + > > /* Packet sizes */ > > int set_resources_size; > > int map_queues_size; > > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c > > b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c > > index 2067f26d3a9d..aa0c76eed452 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c > > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c > > @@ -166,6 +166,10 @@ static int gfx_v9_4_3_get_cu_info(struct > amdgpu_device 
*adev, > > struct amdgpu_cu_info *cu_info); > > static void gfx_v9_4_3_xcc_set_safe_mode(struct amdgpu_device *adev, > > int xcc_id); static void gfx_v9_4_3_xcc_unset_safe_mode(struct > > amdgpu_device *adev, int xcc_id); > > +static int gfx_v9_4_3_kiq_reset_hw_pipe(struct amdgpu_ring *kiq_ring, > > + uint32_t queue_type, uint32_t me, > > + uint32_t pipe, uint32_t queue, > > + uint32_t xcc_id); > > > > static void gfx_v9_4_3_kiq_set_resources(struct amdgpu_ring *kiq_ring, > > uint64_t queue_mask) > > @@ -323,6 +327,7 @@ static const struct kiq_pm4_funcs > gfx_v9_4_3_kiq_pm4_funcs = { > > .kiq_query_status = gfx_v9_4_3_kiq_query_status, > > .kiq_invalidate_tlbs = gfx_v9_4_3_kiq_invalidate_tlbs, > > .kiq_reset_hw_queue = gfx_v9_4_3_kiq_reset_hw_queue, > > + .kiq_reset_hw_pipe = gfx_v9_4_3_kiq_reset_hw_pipe, > > .set_resources_size = 8, > > .map_queues_size = 7, > > .unmap_queues_size = 6, > > @@ -3466,6 +3471,116 @@ static void gfx_v9_4_3_emit_wave_limit(struct > amdgpu_ring *ring, bool enable) > > } > > } > > > > +static int gfx_v9_4_3_unmap_done(struct amdgpu_device *adev, uint32_t > me, > > + uint32_t pipe, uint32_t queue, > > + uint32_t xcc_id) > > +{ > > + int i, r; > > + /* make sure dequeue is complete*/ > > + gfx_v9_4_3_xcc_set_safe_mode(adev, xcc_id); > > + mutex_lock(&adev->srbm_mutex); > > + soc15_grbm_select(adev, me, pipe, queue, 0, GET_INST(GC, xcc_id)); > > + for (i = 0; i < adev->usec_timeout; i++) { > > + if (!(RREG32_SOC15(GC, GET_INST(GC, xcc_id), > regCP_HQD_ACTIVE) & 1)) > > + break; > > + udelay(1); > > + } > > + if (i >= adev->usec_timeout) > > + r = -ETIMEDOUT; > > + else > > + r = 0; > > + soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id)); > > + mutex_unlock(&adev->srbm_mutex); > > + gfx_v9_4_3_xcc_unset_safe_mode(adev, xcc_id); > > + > > + return r; > > + > > +} > > + > > +static bool gfx_v9_4_3_pipe_reset_support(struct amdgpu_device *adev) > > +{ > > + /*TODO: Need check gfx9.4.4 mec fw whether supports pipe reset as > well.*/ > > 
+ if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) && > > + adev->gfx.mec_fw_version >= 0x0000009b) > > + return true; > > + else > > + dev_warn_once(adev->dev, "Please use the latest MEC > version to see > > +whether support pipe reset\n"); > > + > > + return false; > > +} > > + > > +static int gfx_v9_4_3_kiq_reset_hw_pipe(struct amdgpu_ring *kiq_ring, > > + uint32_t queue_type, uint32_t me, > > + uint32_t pipe, uint32_t queue, > > + uint32_t xcc_id) > > +{ > > + struct amdgpu_device *adev = kiq_ring->adev; > > + uint32_t reset_pipe, clean_pipe; > > + int r; > > + > > + if (!gfx_v9_4_3_pipe_reset_support(adev)) > > + return -EINVAL; > > + > > + gfx_v9_4_3_xcc_set_safe_mode(adev, xcc_id); > > + mutex_lock(&adev->srbm_mutex); > > + soc15_grbm_select(adev, me, pipe, queue, 0, GET_INST(GC, xcc_id)); > > MEC_CNTL is not a per-queue register, so I'm not sure that selecting a > queue with soc15_grbm_select() makes sense here. The mutex may be taken so that resets for other queues in the > same XCC don't run in parallel. >