[AMD Official Use Only - AMD Internal Distribution Only] > From: Alex Deucher <alexdeucher@xxxxxxxxx> > Sent: Tuesday, August 20, 2024 9:50 PM > To: Lazar, Lijo <Lijo.Lazar@xxxxxxx> > Cc: Liang, Prike <Prike.Liang@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx; > Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Ma, Le > <Le.Ma@xxxxxxx> > Subject: Re: [PATCH v2] drm/amdgpu/gfx9.4.3: Implement compute pipe reset > > On Tue, Aug 20, 2024 at 8:43 AM Lazar, Lijo <lijo.lazar@xxxxxxx> wrote: > > > > > > > > On 8/20/2024 4:01 PM, Prike Liang wrote: > > > Implement the compute pipe reset and driver will fallback to pipe > > > reset when queue reset failed. > > > > > > Signed-off-by: Prike Liang <Prike.Liang@xxxxxxx> > > > --- > > > v2: Convert the GC logic instance to physical instance in the > > > register accessing process and > > > > > use the dev_* print to specify the failed device. > > > > This is not fully done, marked below. > > > > > --- > > > drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 5 + > > > drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 153 > > > ++++++++++++++++++++---- > > > 2 files changed, 138 insertions(+), 20 deletions(-) > > > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > > > index e28c1ebfa98f..d4d74ba2bc27 100644 > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > > > @@ -143,6 +143,11 @@ struct kiq_pm4_funcs { > > > uint32_t queue_type, uint32_t me_id, > > > uint32_t pipe_id, uint32_t queue_id, > > > uint32_t xcc_id, uint32_t vmid); > > > + int (*kiq_reset_hw_pipe)(struct amdgpu_ring *kiq_ring, > > > + uint32_t queue_type, uint32_t me, > > > + uint32_t pipe, uint32_t queue, > > > + uint32_t xcc_id); > > > + > > > /* Packet sizes */ > > > int set_resources_size; > > > int map_queues_size; > > > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c > > > b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c > > > index 2067f26d3a9d..ab9d5adbbfe8 100644 > > > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c > > > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c > > > @@ -166,6 +166,10 @@ static int gfx_v9_4_3_get_cu_info(struct > amdgpu_device *adev, > > > struct amdgpu_cu_info *cu_info); > > > static void gfx_v9_4_3_xcc_set_safe_mode(struct amdgpu_device *adev, > > > int xcc_id); static void gfx_v9_4_3_xcc_unset_safe_mode(struct > > > amdgpu_device *adev, int xcc_id); > > > +static int gfx_v9_4_3_kiq_reset_hw_pipe(struct amdgpu_ring *kiq_ring, > > > + uint32_t queue_type, uint32_t me, > > > + uint32_t pipe, uint32_t queue, > > > + uint32_t xcc_id); > > > > > > static void gfx_v9_4_3_kiq_set_resources(struct amdgpu_ring *kiq_ring, > > > uint64_t queue_mask) @@ -323,6 +327,7 > > > @@ static const struct kiq_pm4_funcs gfx_v9_4_3_kiq_pm4_funcs = { > > > .kiq_query_status = gfx_v9_4_3_kiq_query_status, > > > .kiq_invalidate_tlbs = gfx_v9_4_3_kiq_invalidate_tlbs, > > > .kiq_reset_hw_queue = gfx_v9_4_3_kiq_reset_hw_queue, > > > + .kiq_reset_hw_pipe = gfx_v9_4_3_kiq_reset_hw_pipe, > > > .set_resources_size = 8, > > > .map_queues_size = 7, > > > .unmap_queues_size = 6, > > > @@ -3466,6 +3471,115 @@ static void > gfx_v9_4_3_emit_wave_limit(struct amdgpu_ring *ring, bool enable) > > > } > > > } > > > > > > +static int gfx_v9_4_3_unmap_done(struct amdgpu_device *adev, > uint32_t me, > > > + uint32_t pipe, uint32_t queue, > > > + uint32_t xcc_id) { > > > + int i, r; > > > + /* make sure dequeue is complete*/ > > > + gfx_v9_4_3_xcc_set_safe_mode(adev, xcc_id); > > > + mutex_lock(&adev->srbm_mutex); > > > + soc15_grbm_select(adev, me, pipe, queue, 0, GET_INST(GC, xcc_id)); > > > + for (i = 0; i < adev->usec_timeout; i++) { > > > + if (!(RREG32_SOC15(GC, GET_INST(GC, xcc_id), > regCP_HQD_ACTIVE) & 1)) > > > + break; > > > + udelay(1); > > > + } > > > + if (i >= adev->usec_timeout) > > > + r = -ETIMEDOUT; > > > + else > > > + r = 0; > > > + soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id)); > > > + mutex_unlock(&adev->srbm_mutex); > > > + gfx_v9_4_3_xcc_unset_safe_mode(adev, xcc_id); > > > + > > > + return r; > > > + > > > +} > > > + > > > +static bool gfx_v9_4_3_pipe_reset_support(struct amdgpu_device > > > +*adev) { > > > + > > > + if (unlikely(adev->gfx.mec_fw_version < 0x0000009b)) { > > > + DRM_WARN_ONCE("MEC firmware version too old, please use > FW no older than 155!\n"); > > > + return false; > > > + } > > > > This path will be taken GCv9.4.3 and GCv9.4.4. GCv9.4.4 has a > > different FW version. If FW is not yet ready for 9.4.4, better check > > that and return false for that. > > > > > + > > > + return true; > > > +} > > > + > > > +static int gfx_v9_4_3_kiq_reset_hw_pipe(struct amdgpu_ring *kiq_ring, > > > + uint32_t queue_type, uint32_t me, > > > + uint32_t pipe, uint32_t queue, > > > + uint32_t xcc_id) { > > > + struct amdgpu_device *adev = kiq_ring->adev; > > > + uint32_t reset_pipe, clean_pipe; > > > + int r; > > > + > > > + if (!gfx_v9_4_3_pipe_reset_support(adev)) > > > + return -EINVAL; > > > + > > > + gfx_v9_4_3_xcc_set_safe_mode(adev, xcc_id); > > > + mutex_lock(&adev->srbm_mutex); > > > + soc15_grbm_select(adev, me, pipe, queue, 0, GET_INST(GC, > > > + xcc_id)); > > > + > > > + reset_pipe = RREG32_SOC15(GC, GET_INST(GC, xcc_id), > regCP_MEC_CNTL); > > > + clean_pipe = reset_pipe; > > > + > > > + if (me == 1) { > > > + switch (pipe) { > > > + case 0: > > > + reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL, > > > + MEC_ME1_PIPE0_RESET, 1); > > > + clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL, > > > + MEC_ME1_PIPE0_RESET, 0); > > > + break; > > > + case 1: > > > + reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL, > > > + MEC_ME1_PIPE1_RESET, 1); > > > + clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL, > > > + MEC_ME1_PIPE1_RESET, 0); > > > + break; > > > + case 2: > > > + reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL, > > > + MEC_ME1_PIPE2_RESET, 1); > > > + clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL, > > > + MEC_ME1_PIPE2_RESET, 0); > > > + break; > > > + case 3: > > > + reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL, > > > + MEC_ME1_PIPE3_RESET, 1); > > > + clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL, > > > + MEC_ME1_PIPE3_RESET, 0); > > > + break; > > > + default: > > > + break; > > > + } > > > + } else { > > > + if (pipe) { > > > + reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL, > > > + MEC_ME2_PIPE1_RESET, 1); > > > + clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL, > > > + MEC_ME2_PIPE1_RESET, 0); > > > + } else { > > > + reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL, > > > + MEC_ME2_PIPE0_RESET, 1); > > > + clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL, > > > + MEC_ME2_PIPE0_RESET, 0); > > > + } > > > + } > > > + > > > + WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL, > reset_pipe); > > > + WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL, > clean_pipe); > > > + soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id)); > > > + mutex_unlock(&adev->srbm_mutex); > > > + gfx_v9_4_3_xcc_unset_safe_mode(adev, xcc_id); > > > + > > > + r = gfx_v9_4_3_unmap_done(adev, me, pipe, queue, xcc_id); > > > + return r; > > > +} > > > + > > > static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring, > > > unsigned int vmid) { @@ -3473,7 > > > +3587,7 @@ static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring, > > > struct amdgpu_kiq *kiq = &adev->gfx.kiq[ring->xcc_id]; > > > struct amdgpu_ring *kiq_ring = &kiq->ring; > > > unsigned long flags; > > > - int r, i; > > > + int r; > > > > > > if (amdgpu_sriov_vf(adev)) > > > return -EINVAL; > > > @@ -3495,26 +3609,25 @@ static int gfx_v9_4_3_reset_kcq(struct > amdgpu_ring *ring, > > > spin_unlock_irqrestore(&kiq->ring_lock, flags); > > > > > > r = amdgpu_ring_test_ring(kiq_ring); > > > - if (r) > > > - return r; > > > - > > > - /* make sure dequeue is complete*/ > > > - amdgpu_gfx_rlc_enter_safe_mode(adev, ring->xcc_id); > > > - mutex_lock(&adev->srbm_mutex); > > > - soc15_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0, > GET_INST(GC, ring->xcc_id)); > > > - for (i = 0; i < adev->usec_timeout; i++) { > > > - if (!(RREG32_SOC15(GC, 0, regCP_HQD_ACTIVE) & 1)) > > > - break; > > > - udelay(1); > > > - } > > > - if (i >= adev->usec_timeout) > > > - r = -ETIMEDOUT; > > > - soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, ring->xcc_id)); > > > - mutex_unlock(&adev->srbm_mutex); > > > - amdgpu_gfx_rlc_exit_safe_mode(adev, ring->xcc_id); > > > if (r) { > > > - dev_err(adev->dev, "fail to wait on hqd deactive\n"); > > > - return r; > > > + DRM_ERROR("kiq ring test failed after ring: %s queue reset\n", > > > + ring->name); > > > > dev_err here > > > > > + goto pipe_reset; > > > + } > > > + > > > + r = gfx_v9_4_3_unmap_done(adev, ring->me, ring->pipe, ring->queue, > ring->xcc_id); > > > + if (r) > > > + dev_err(adev->dev,"fail to wait on hqd deactive and > > > + will try pipe reset\n"); > > > + > > > +pipe_reset: > > > + if(r) { > > > + r = gfx_v9_4_3_kiq_reset_hw_pipe(kiq_ring, ring->funcs->type, > > > + ring->me, ring->pipe, > > > + ring->queue, > > > +ring->xcc_id); > > > > A side question about this - what's the expectation about the other > > queue(s) on this pipe at this point? Are they expected to be also hung > > at this point? > > According to the firmware team, the pipe will only have one queue executing > on it at a given time, so resetting the pipe will only reset that queue. > > Alex > Yeah, by checking the register CP_HQD_ACTIVE before and after the pipe reset, it shows that the pipe reset only resets the active queue selected by the GRBM_GFX_CNTL setting, and the status of other queues does not change during the pipe reset. Thanks, Prike > > > > In short, checking if this should be done from a higher level code or > > from here itself. Ex: > > > > Queue reset -> Failed -> Try dequeue of all other active queues -> Do > > a pipe reset > > > > > + DRM_INFO("ring: %s pipe reset :%s\n", ring->name, > > > + r ? "failed" : "successfully"); > > dev_info here > > > > Thanks, > > Lijo > > > > > + if (r) > > > + return r; > > > } > > > > > > r = amdgpu_bo_reserve(ring->mqd_obj, false);