[AMD Official Use Only - AMD Internal Distribution Only] Thanks for the detail review, and I will update those before pushing the commit. Thanks, Prike > -----Original Message----- > From: Lazar, Lijo <Lijo.Lazar@xxxxxxx> > Sent: Thursday, August 29, 2024 4:13 PM > To: Liang, Prike <Prike.Liang@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Ma, Le > <Le.Ma@xxxxxxx> > Subject: Re: [PATCH v5] drm/amdgpu/gfx9.4.3: Implement compute pipe reset > > > > On 8/29/2024 9:17 AM, Prike Liang wrote: > > Implement the compute pipe reset, and the driver will fallback to pipe > > reset when queue reset fails. > > The pipe reset only deactivates the queue which is scheduled in the > > pipe, and meanwhile the MEC engine will be reset to the firmware > > _start pointer. So, > > May refine this to indicate that reset to _start is for the specific pipe and not > applicable for the whole MEC engine. > > > it seems pipe reset will cost more cycles than the queue reset; > > therefore, the driver tries to recover by doing queue reset first. > > > > Signed-off-by: Prike Liang <Prike.Liang@xxxxxxx> > > --- > > drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 127 > > ++++++++++++++++++++---- > > 1 file changed, 108 insertions(+), 19 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c > > b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c > > index 2067f26d3a9d..26ae62d2a752 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c > > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c > > @@ -3466,6 +3466,98 @@ static void gfx_v9_4_3_emit_wave_limit(struct > amdgpu_ring *ring, bool enable) > > } > > } > > > > +static int gfx_v9_4_3_unmap_done(struct amdgpu_device *adev, uint32_t > me, > > + uint32_t pipe, uint32_t queue, > > + uint32_t xcc_id) > > +{ > > + int i, r; > > + /* make sure dequeue is complete*/ > > + gfx_v9_4_3_xcc_set_safe_mode(adev, xcc_id); > > + mutex_lock(&adev->srbm_mutex); > > + soc15_grbm_select(adev, me, pipe, queue, 0, GET_INST(GC, xcc_id)); > > + for (i = 0; i < adev->usec_timeout; i++) { > > + if (!(RREG32_SOC15(GC, GET_INST(GC, xcc_id), > regCP_HQD_ACTIVE) & 1)) > > + break; > > + udelay(1); > > + } > > + if (i >= adev->usec_timeout) > > + r = -ETIMEDOUT; > > + else > > + r = 0; > > + soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id)); > > + mutex_unlock(&adev->srbm_mutex); > > + gfx_v9_4_3_xcc_unset_safe_mode(adev, xcc_id); > > + > > + return r; > > + > > +} > > + > > +static bool gfx_v9_4_3_pipe_reset_support(struct amdgpu_device *adev) > > +{ > > + /*TODO: Need check gfx9.4.4 mec fw whether supports pipe reset as > well.*/ > > + if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) && > > + adev->gfx.mec_fw_version >= 0x0000009b) > > + return true; > > + else > > + dev_warn_once(adev->dev, "Please use the latest MEC > version to see > > +whether support pipe reset\n"); > > + > > + return false; > > +} > > + > > +static int gfx_v9_4_3_kiq_reset_hw_pipe(struct amdgpu_ring *ring) > > Please drop the kiq name in this function to avoid confusion. It's not > restricted to kiq. > > With those > > Reviewed-by: Lijo Lazar <lijo.lazar@xxxxxxx> > > Thanks, > Lijo > > > +{ > > + struct amdgpu_device *adev = ring->adev; > > + uint32_t reset_pipe, clean_pipe; > > + int r; > > + > > + if (!gfx_v9_4_3_pipe_reset_support(adev)) > > + return -EINVAL; > > + > > + gfx_v9_4_3_xcc_set_safe_mode(adev, ring->xcc_id); > > + mutex_lock(&adev->srbm_mutex); > > + > > + reset_pipe = RREG32_SOC15(GC, GET_INST(GC, ring->xcc_id), > regCP_MEC_CNTL); > > + clean_pipe = reset_pipe; > > + > > + if (ring->me == 1) { > > + switch (ring->pipe) { > > + case 0: > > + reset_pipe = REG_SET_FIELD(reset_pipe, > CP_MEC_CNTL, > > + MEC_ME1_PIPE0_RESET, 1); > > + break; > > + case 1: > > + reset_pipe = REG_SET_FIELD(reset_pipe, > CP_MEC_CNTL, > > + MEC_ME1_PIPE1_RESET, 1); > > + break; > > + case 2: > > + reset_pipe = REG_SET_FIELD(reset_pipe, > CP_MEC_CNTL, > > + MEC_ME1_PIPE2_RESET, 1); > > + break; > > + case 3: > > + reset_pipe = REG_SET_FIELD(reset_pipe, > CP_MEC_CNTL, > > + MEC_ME1_PIPE3_RESET, 1); > > + break; > > + default: > > + break; > > + } > > + } else { > > + if (ring->pipe) > > + reset_pipe = REG_SET_FIELD(reset_pipe, > CP_MEC_CNTL, > > + MEC_ME2_PIPE1_RESET, 1); > > + else > > + reset_pipe = REG_SET_FIELD(reset_pipe, > CP_MEC_CNTL, > > + MEC_ME2_PIPE0_RESET, 1); > > + } > > + > > + WREG32_SOC15(GC, GET_INST(GC, ring->xcc_id), regCP_MEC_CNTL, > reset_pipe); > > + WREG32_SOC15(GC, GET_INST(GC, ring->xcc_id), regCP_MEC_CNTL, > clean_pipe); > > + mutex_unlock(&adev->srbm_mutex); > > + gfx_v9_4_3_xcc_unset_safe_mode(adev, ring->xcc_id); > > + > > + r = gfx_v9_4_3_unmap_done(adev, ring->me, ring->pipe, ring- > >queue, ring->xcc_id); > > + return r; > > +} > > + > > static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring, > > unsigned int vmid) > > { > > @@ -3473,7 +3565,7 @@ static int gfx_v9_4_3_reset_kcq(struct > amdgpu_ring *ring, > > struct amdgpu_kiq *kiq = &adev->gfx.kiq[ring->xcc_id]; > > struct amdgpu_ring *kiq_ring = &kiq->ring; > > unsigned long flags; > > - int r, i; > > + int r; > > > > if (amdgpu_sriov_vf(adev)) > > return -EINVAL; > > @@ -3495,26 +3587,23 @@ static int gfx_v9_4_3_reset_kcq(struct > amdgpu_ring *ring, > > spin_unlock_irqrestore(&kiq->ring_lock, flags); > > > > r = amdgpu_ring_test_ring(kiq_ring); > > + if (r) { > > + dev_err(adev->dev, "kiq ring test failed after ring: %s queue > reset\n", > > + ring->name); > > + goto pipe_reset; > > + } > > + > > + r = gfx_v9_4_3_unmap_done(adev, ring->me, ring->pipe, ring- > >queue, > > +ring->xcc_id); > > if (r) > > - return r; > > + dev_err(adev->dev, "fail to wait on hqd deactive and will try > pipe > > +reset\n"); > > > > - /* make sure dequeue is complete*/ > > - amdgpu_gfx_rlc_enter_safe_mode(adev, ring->xcc_id); > > - mutex_lock(&adev->srbm_mutex); > > - soc15_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0, > GET_INST(GC, ring->xcc_id)); > > - for (i = 0; i < adev->usec_timeout; i++) { > > - if (!(RREG32_SOC15(GC, 0, regCP_HQD_ACTIVE) & 1)) > > - break; > > - udelay(1); > > - } > > - if (i >= adev->usec_timeout) > > - r = -ETIMEDOUT; > > - soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, ring->xcc_id)); > > - mutex_unlock(&adev->srbm_mutex); > > - amdgpu_gfx_rlc_exit_safe_mode(adev, ring->xcc_id); > > - if (r) { > > - dev_err(adev->dev, "fail to wait on hqd deactive\n"); > > - return r; > > +pipe_reset: > > + if(r) { > > + r = gfx_v9_4_3_kiq_reset_hw_pipe(ring); > > + dev_info(adev->dev, "ring: %s pipe reset :%s\n", ring->name, > > + r ? "failed" : "successfully"); > > + if (r) > > + return r; > > } > > > > r = amdgpu_bo_reserve(ring->mqd_obj, false);