On 8/22/2024 3:08 PM, Prike Liang wrote: > Implement the compute pipe reset and driver will > fallback to pipe reset when queue reset failed. > > Signed-off-by: Prike Liang <Prike.Liang@xxxxxxx> > --- > v3: Use the dev log and filer out the gfx9.4.4 pipe reset support. > v2: Convert the GC logic instance to physical instance in the > register accessing process and use the dev_* print to specify > the failed device. > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 5 + > drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 154 +++++++++++++++++++++--- > 2 files changed, 139 insertions(+), 20 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > index e28c1ebfa98f..d4d74ba2bc27 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > @@ -143,6 +143,11 @@ struct kiq_pm4_funcs { > uint32_t queue_type, uint32_t me_id, > uint32_t pipe_id, uint32_t queue_id, > uint32_t xcc_id, uint32_t vmid); > + int (*kiq_reset_hw_pipe)(struct amdgpu_ring *kiq_ring, > + uint32_t queue_type, uint32_t me, > + uint32_t pipe, uint32_t queue, > + uint32_t xcc_id); > + > /* Packet sizes */ > int set_resources_size; > int map_queues_size; > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c > index 2067f26d3a9d..aa0c76eed452 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c > @@ -166,6 +166,10 @@ static int gfx_v9_4_3_get_cu_info(struct amdgpu_device *adev, > struct amdgpu_cu_info *cu_info); > static void gfx_v9_4_3_xcc_set_safe_mode(struct amdgpu_device *adev, int xcc_id); > static void gfx_v9_4_3_xcc_unset_safe_mode(struct amdgpu_device *adev, int xcc_id); > +static int gfx_v9_4_3_kiq_reset_hw_pipe(struct amdgpu_ring *kiq_ring, > + uint32_t queue_type, uint32_t me, > + uint32_t pipe, uint32_t queue, > + uint32_t xcc_id); > > static void gfx_v9_4_3_kiq_set_resources(struct amdgpu_ring *kiq_ring, > uint64_t queue_mask) > @@ -323,6 +327,7 @@ static const struct kiq_pm4_funcs gfx_v9_4_3_kiq_pm4_funcs = { > .kiq_query_status = gfx_v9_4_3_kiq_query_status, > .kiq_invalidate_tlbs = gfx_v9_4_3_kiq_invalidate_tlbs, > .kiq_reset_hw_queue = gfx_v9_4_3_kiq_reset_hw_queue, > + .kiq_reset_hw_pipe = gfx_v9_4_3_kiq_reset_hw_pipe, > .set_resources_size = 8, > .map_queues_size = 7, > .unmap_queues_size = 6, > @@ -3466,6 +3471,116 @@ static void gfx_v9_4_3_emit_wave_limit(struct amdgpu_ring *ring, bool enable) > } > } > > +static int gfx_v9_4_3_unmap_done(struct amdgpu_device *adev, uint32_t me, > + uint32_t pipe, uint32_t queue, > + uint32_t xcc_id) > +{ > + int i, r; > + /* make sure dequeue is complete*/ > + gfx_v9_4_3_xcc_set_safe_mode(adev, xcc_id); > + mutex_lock(&adev->srbm_mutex); > + soc15_grbm_select(adev, me, pipe, queue, 0, GET_INST(GC, xcc_id)); > + for (i = 0; i < adev->usec_timeout; i++) { > + if (!(RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_ACTIVE) & 1)) > + break; > + udelay(1); > + } > + if (i >= adev->usec_timeout) > + r = -ETIMEDOUT; > + else > + r = 0; > + soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id)); > + mutex_unlock(&adev->srbm_mutex); > + gfx_v9_4_3_xcc_unset_safe_mode(adev, xcc_id); > + > + return r; > + > +} > + > +static bool gfx_v9_4_3_pipe_reset_support(struct amdgpu_device *adev) > +{ > + /*TODO: Need check gfx9.4.4 mec fw whether supports pipe reset as well.*/ > + if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) && > + adev->gfx.mec_fw_version >= 0x0000009b) > + return true; > + else > + dev_warn_once(adev->dev, "Please use the latest MEC version to see whether support pipe reset\n"); > + > + return false; > +} > + > +static int gfx_v9_4_3_kiq_reset_hw_pipe(struct amdgpu_ring *kiq_ring, > + uint32_t queue_type, uint32_t me, > + uint32_t pipe, uint32_t queue, > + uint32_t xcc_id) > +{ > + struct amdgpu_device *adev = kiq_ring->adev; > + uint32_t reset_pipe, clean_pipe; > + int r; > + > + if (!gfx_v9_4_3_pipe_reset_support(adev)) > + return -EINVAL; > + > + gfx_v9_4_3_xcc_set_safe_mode(adev, xcc_id); > + mutex_lock(&adev->srbm_mutex); > + soc15_grbm_select(adev, me, pipe, queue, 0, GET_INST(GC, xcc_id)); MEC_CNTL is not a per-queue register. So not sure if selecting this makes sense here. The mutex may be taken so that resets for other queues in the same XCC don't run in parallel. > + > + reset_pipe = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL); > + clean_pipe = reset_pipe; I think the saved value could be written back (skipping the SET_FIELD steps below) as the only change that's done here is to set the PIPEx_RESET field. Apart from those, looks fine. Thanks, Lijo > + > + if (me == 1) { > + switch (pipe) { > + case 0: > + reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL, > + MEC_ME1_PIPE0_RESET, 1); > + clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL, > + MEC_ME1_PIPE0_RESET, 0); > + break; > + case 1: > + reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL, > + MEC_ME1_PIPE1_RESET, 1); > + clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL, > + MEC_ME1_PIPE1_RESET, 0); > + break; > + case 2: > + reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL, > + MEC_ME1_PIPE2_RESET, 1); > + clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL, > + MEC_ME1_PIPE2_RESET, 0); > + break; > + case 3: > + reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL, > + MEC_ME1_PIPE3_RESET, 1); > + clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL, > + MEC_ME1_PIPE3_RESET, 0); > + break; > + default: > + break; > + } > + } else { > + if (pipe) { > + reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL, > + MEC_ME2_PIPE1_RESET, 1); > + clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL, > + MEC_ME2_PIPE1_RESET, 0); > + } else { > + reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL, > + MEC_ME2_PIPE0_RESET, 1); > + clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL, > + MEC_ME2_PIPE0_RESET, 0); > + } > + } > + > + WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL, reset_pipe); > + WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL, clean_pipe); > + soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id)); > + mutex_unlock(&adev->srbm_mutex); > + gfx_v9_4_3_xcc_unset_safe_mode(adev, xcc_id); > + > + r = gfx_v9_4_3_unmap_done(adev, me, pipe, queue, xcc_id); > + return r; > +} > + > static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring, > unsigned int vmid) > { > @@ -3473,7 +3588,7 @@ static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring, > struct amdgpu_kiq *kiq = &adev->gfx.kiq[ring->xcc_id]; > struct amdgpu_ring *kiq_ring = &kiq->ring; > unsigned long flags; > - int r, i; > + int r; > > if (amdgpu_sriov_vf(adev)) > return -EINVAL; > @@ -3495,26 +3610,25 @@ static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring, > spin_unlock_irqrestore(&kiq->ring_lock, flags); > > r = amdgpu_ring_test_ring(kiq_ring); > - if (r) > - return r; > - > - /* make sure dequeue is complete*/ > - amdgpu_gfx_rlc_enter_safe_mode(adev, ring->xcc_id); > - mutex_lock(&adev->srbm_mutex); > - soc15_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0, GET_INST(GC, ring->xcc_id)); > - for (i = 0; i < adev->usec_timeout; i++) { > - if (!(RREG32_SOC15(GC, 0, regCP_HQD_ACTIVE) & 1)) > - break; > - udelay(1); > - } > - if (i >= adev->usec_timeout) > - r = -ETIMEDOUT; > - soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, ring->xcc_id)); > - mutex_unlock(&adev->srbm_mutex); > - amdgpu_gfx_rlc_exit_safe_mode(adev, ring->xcc_id); > if (r) { > - dev_err(adev->dev, "fail to wait on hqd deactive\n"); > - return r; > + dev_err(adev->dev, "kiq ring test failed after ring: %s queue reset\n", > + ring->name); > + goto pipe_reset; > + } > + > + r = gfx_v9_4_3_unmap_done(adev, ring->me, ring->pipe, ring->queue, ring->xcc_id); > + if (r) > + dev_err(adev->dev, "fail to wait on hqd deactive and will try pipe reset\n"); > + > +pipe_reset: > + if(r) { > + r = gfx_v9_4_3_kiq_reset_hw_pipe(kiq_ring, ring->funcs->type, > + ring->me, ring->pipe, > + ring->queue, ring->xcc_id); > + dev_info(adev->dev, "ring: %s pipe reset :%s\n", ring->name, > + r ? "failed" : "successfully"); > + if (r) > + return r; > } > > r = amdgpu_bo_reserve(ring->mqd_obj, false);