Implement the compute pipe reset, and have the driver fall back to a
pipe reset when the queue reset fails.

Signed-off-by: Prike Liang <Prike.Liang@xxxxxxx>
---
v2: Convert the GC logical instance to the physical instance for
    register access, and use the dev_* prints to identify the failing
    device.
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h |   5 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 153 ++++++++++++++++++++----
 2 files changed, 138 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index e28c1ebfa98f..d4d74ba2bc27 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -143,6 +143,11 @@ struct kiq_pm4_funcs {
 			       uint32_t queue_type, uint32_t me_id,
 			       uint32_t pipe_id, uint32_t queue_id,
 			       uint32_t xcc_id, uint32_t vmid);
+	int (*kiq_reset_hw_pipe)(struct amdgpu_ring *kiq_ring,
+				 uint32_t queue_type, uint32_t me,
+				 uint32_t pipe, uint32_t queue,
+				 uint32_t xcc_id);
+
 	/* Packet sizes */
 	int set_resources_size;
 	int map_queues_size;
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index 2067f26d3a9d..ab9d5adbbfe8 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -166,6 +166,10 @@ static int gfx_v9_4_3_get_cu_info(struct amdgpu_device *adev,
 				  struct amdgpu_cu_info *cu_info);
 static void gfx_v9_4_3_xcc_set_safe_mode(struct amdgpu_device *adev, int xcc_id);
 static void gfx_v9_4_3_xcc_unset_safe_mode(struct amdgpu_device *adev, int xcc_id);
+static int gfx_v9_4_3_kiq_reset_hw_pipe(struct amdgpu_ring *kiq_ring,
+					uint32_t queue_type, uint32_t me,
+					uint32_t pipe, uint32_t queue,
+					uint32_t xcc_id);
 
 static void gfx_v9_4_3_kiq_set_resources(struct amdgpu_ring *kiq_ring,
 					 uint64_t queue_mask)
@@ -323,6 +327,7 @@ static const struct kiq_pm4_funcs gfx_v9_4_3_kiq_pm4_funcs = {
 	.kiq_query_status = gfx_v9_4_3_kiq_query_status,
 	.kiq_invalidate_tlbs = gfx_v9_4_3_kiq_invalidate_tlbs,
 	.kiq_reset_hw_queue = gfx_v9_4_3_kiq_reset_hw_queue,
+	.kiq_reset_hw_pipe = gfx_v9_4_3_kiq_reset_hw_pipe,
 	.set_resources_size = 8,
 	.map_queues_size = 7,
 	.unmap_queues_size = 6,
@@ -3466,6 +3471,115 @@ static void gfx_v9_4_3_emit_wave_limit(struct amdgpu_ring *ring, bool enable)
 	}
 }
 
+static int gfx_v9_4_3_unmap_done(struct amdgpu_device *adev, uint32_t me,
+				 uint32_t pipe, uint32_t queue,
+				 uint32_t xcc_id)
+{
+	int i, r;
+
+	/* make sure dequeue is complete */
+	gfx_v9_4_3_xcc_set_safe_mode(adev, xcc_id);
+	mutex_lock(&adev->srbm_mutex);
+	soc15_grbm_select(adev, me, pipe, queue, 0, GET_INST(GC, xcc_id));
+	for (i = 0; i < adev->usec_timeout; i++) {
+		if (!(RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_ACTIVE) & 1))
+			break;
+		udelay(1);
+	}
+	if (i >= adev->usec_timeout)
+		r = -ETIMEDOUT;
+	else
+		r = 0;
+	soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));
+	mutex_unlock(&adev->srbm_mutex);
+	gfx_v9_4_3_xcc_unset_safe_mode(adev, xcc_id);
+
+	return r;
+}
+
+static bool gfx_v9_4_3_pipe_reset_support(struct amdgpu_device *adev)
+{
+	/* Pipe reset requires MEC firmware version 155 (0x9b) or newer. */
+	if (unlikely(adev->gfx.mec_fw_version < 0x0000009b)) {
+		dev_warn_once(adev->dev, "MEC firmware version too old, pipe reset requires MEC FW version 155 or newer\n");
+		return false;
+	}
+
+	return true;
+}
+
+static int gfx_v9_4_3_kiq_reset_hw_pipe(struct amdgpu_ring *kiq_ring,
+					uint32_t queue_type, uint32_t me,
+					uint32_t pipe, uint32_t queue,
+					uint32_t xcc_id)
+{
+	struct amdgpu_device *adev = kiq_ring->adev;
+	uint32_t reset_pipe, clean_pipe;
+	int r;
+
+	if (!gfx_v9_4_3_pipe_reset_support(adev))
+		return -EINVAL;
+
+	gfx_v9_4_3_xcc_set_safe_mode(adev, xcc_id);
+	mutex_lock(&adev->srbm_mutex);
+	soc15_grbm_select(adev, me, pipe, queue, 0, GET_INST(GC, xcc_id));
+
+	reset_pipe = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL);
+	clean_pipe = reset_pipe;
+
+	if (me == 1) {
+		switch (pipe) {
+		case 0:
+			reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
+						   MEC_ME1_PIPE0_RESET, 1);
+			clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
+						   MEC_ME1_PIPE0_RESET, 0);
+			break;
+		case 1:
+			reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
+						   MEC_ME1_PIPE1_RESET, 1);
+			clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
+						   MEC_ME1_PIPE1_RESET, 0);
+			break;
+		case 2:
+			reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
+						   MEC_ME1_PIPE2_RESET, 1);
+			clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
+						   MEC_ME1_PIPE2_RESET, 0);
+			break;
+		case 3:
+			reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
+						   MEC_ME1_PIPE3_RESET, 1);
+			clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
+						   MEC_ME1_PIPE3_RESET, 0);
+			break;
+		default:
+			break;
+		}
+	} else {
+		if (pipe) {
+			reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
+						   MEC_ME2_PIPE1_RESET, 1);
+			clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
+						   MEC_ME2_PIPE1_RESET, 0);
+		} else {
+			reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
+						   MEC_ME2_PIPE0_RESET, 1);
+			clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
+						   MEC_ME2_PIPE0_RESET, 0);
+		}
+	}
+
+	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL, reset_pipe);
+	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL, clean_pipe);
+	soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));
+	mutex_unlock(&adev->srbm_mutex);
+	gfx_v9_4_3_xcc_unset_safe_mode(adev, xcc_id);
+
+	r = gfx_v9_4_3_unmap_done(adev, me, pipe, queue, xcc_id);
+	return r;
+}
+
 static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring,
 				unsigned int vmid)
 {
@@ -3473,7 +3587,7 @@ static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring,
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq[ring->xcc_id];
 	struct amdgpu_ring *kiq_ring = &kiq->ring;
 	unsigned long flags;
-	int r, i;
+	int r;
 
 	if (amdgpu_sriov_vf(adev))
 		return -EINVAL;
@@ -3495,26 +3609,25 @@ static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring,
 	spin_unlock_irqrestore(&kiq->ring_lock, flags);
 
 	r = amdgpu_ring_test_ring(kiq_ring);
-	if (r)
-		return r;
-
-	/* make sure dequeue is complete*/
-	amdgpu_gfx_rlc_enter_safe_mode(adev, ring->xcc_id);
-	mutex_lock(&adev->srbm_mutex);
-	soc15_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0, GET_INST(GC, ring->xcc_id));
-	for (i = 0; i < adev->usec_timeout; i++) {
-		if (!(RREG32_SOC15(GC, 0, regCP_HQD_ACTIVE) & 1))
-			break;
-		udelay(1);
-	}
-	if (i >= adev->usec_timeout)
-		r = -ETIMEDOUT;
-	soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, ring->xcc_id));
-	mutex_unlock(&adev->srbm_mutex);
-	amdgpu_gfx_rlc_exit_safe_mode(adev, ring->xcc_id);
 	if (r) {
-		dev_err(adev->dev, "fail to wait on hqd deactive\n");
-		return r;
+		dev_err(adev->dev, "kiq ring test failed after ring %s queue reset\n",
+			ring->name);
+		goto pipe_reset;
+	}
+
+	r = gfx_v9_4_3_unmap_done(adev, ring->me, ring->pipe, ring->queue, ring->xcc_id);
+	if (r)
+		dev_err(adev->dev, "failed to wait on hqd deactivate, will try pipe reset\n");
+
+pipe_reset:
+	if (r) {
+		r = gfx_v9_4_3_kiq_reset_hw_pipe(kiq_ring, ring->funcs->type,
+						 ring->me, ring->pipe,
+						 ring->queue, ring->xcc_id);
+		dev_info(adev->dev, "ring %s pipe reset %s\n", ring->name,
+			 r ? "failed" : "succeeded");
+		if (r)
+			return r;
 	}
 
 	r = amdgpu_bo_reserve(ring->mqd_obj, false);
-- 
2.34.1
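
For readers following along, below is a minimal, self-contained C sketch of the fallback flow this patch adds to gfx_v9_4_3_reset_kcq: attempt the lighter per-queue reset first, and escalate to the heavier per-pipe reset only when that fails. This is not amdgpu code; struct ring and the queue_reset/pipe_reset/reset_kcq helpers are stand-ins invented for the example.

#include <errno.h>
#include <stdio.h>

/* Stand-in for a compute ring; the real code operates on
 * struct amdgpu_ring. */
struct ring {
	const char *name;
	int queue_reset_works;	/* simulate whether the queue reset succeeds */
};

/* Hypothetical helper modeling the per-queue reset path. */
static int queue_reset(struct ring *r)
{
	return r->queue_reset_works ? 0 : -ETIMEDOUT;
}

/* Hypothetical helper modeling the per-pipe reset path; the real one
 * asserts and clears the MEC_ME*_PIPE*_RESET field in CP_MEC_CNTL and
 * then polls CP_HQD_ACTIVE until the HQD goes idle. */
static int pipe_reset(struct ring *r)
{
	(void)r;
	return 0;
}

/* Fallback flow: try the cheap per-queue reset first and escalate to
 * the per-pipe reset only when the queue reset fails. */
static int reset_kcq(struct ring *r)
{
	int err = queue_reset(r);

	if (err) {
		fprintf(stderr, "%s: queue reset failed (%d), trying pipe reset\n",
			r->name, err);
		err = pipe_reset(r);
	}
	return err;
}

int main(void)
{
	/* Simulate a ring whose queue reset times out, forcing the
	 * pipe-reset fallback. */
	struct ring r = { .name = "comp_1.0.0", .queue_reset_works = 0 };

	return reset_kcq(&r) ? 1 : 0;
}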