Fix the hypervisor SAVE_VF failure that is hit after the guest driver is
unloaded.

SAVE_VF fails because the KIQ and KCQs are still active after the driver
is unloaded: RLCV commands CPC to run the MQD (to save the current
status) on every queue that is still active.

The fix is to unmap the KCQs and disable the KIQ/HIQ in gfx hw_fini.
To do so, implement new routines that unmap the KCQs for gfx8/9, and
disable the KIQ/HIQ in the RLC registers so that RLCV won't make CPC
issue the MQD commands.

Change-Id: I95eb650f4bd16b639ca6e773efce80abb5e04641
Signed-off-by: Horace Chen <horace.chen@amd.com>
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
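Note (illustration only, not meant to be applied): gfx_v8_0_kcq_disable() and
gfx_v9_0_kcq_disable() below are identical apart from their names, presumably
because the PM4 macros (PACKET3_UNMAP_QUEUES_*, PACKET3_SET_UCONFIG_REG_START)
live in per-ASIC headers. If those were ever shared, a common helper could look
roughly like the sketch here; the name amdgpu_gfx_kiq_kcq_disable() is
hypothetical and the body simply mirrors the per-ASIC versions in the diff.

static int amdgpu_gfx_kiq_kcq_disable(struct amdgpu_ring *kiq_ring,
                                      struct amdgpu_ring *ring)
{
        struct amdgpu_device *adev = kiq_ring->adev;
        uint32_t scratch, tmp = 0;
        int r, i;

        /* a scratch register is used to detect when the KIQ has consumed
         * the packets submitted below
         */
        r = amdgpu_gfx_scratch_get(adev, &scratch);
        if (r) {
                DRM_ERROR("Failed to get scratch reg (%d).\n", r);
                return r;
        }
        WREG32(scratch, 0xCAFEDEAD);

        /* room for the UNMAP_QUEUES and SET_UCONFIG_REG packets */
        r = amdgpu_ring_alloc(kiq_ring, 10);
        if (r) {
                DRM_ERROR("Failed to lock KIQ (%d).\n", r);
                amdgpu_gfx_scratch_free(adev, scratch);
                return r;
        }

        /* ask the KIQ to unmap (reset) the single KCQ identified by its
         * doorbell offset
         */
        amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_UNMAP_QUEUES, 4));
        amdgpu_ring_write(kiq_ring, /* Q_sel: 0, vmid: 0, engine: 0, num_Q: 1 */
                          PACKET3_UNMAP_QUEUES_ACTION(1) | /* RESET_QUEUES */
                          PACKET3_UNMAP_QUEUES_QUEUE_SEL(0) |
                          PACKET3_UNMAP_QUEUES_ENGINE_SEL(0) |
                          PACKET3_UNMAP_QUEUES_NUM_QUEUES(1));
        amdgpu_ring_write(kiq_ring,
                          PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(ring->doorbell_index));
        /* only one queue per request, the remaining dwords stay zero */
        amdgpu_ring_write(kiq_ring, 0);
        amdgpu_ring_write(kiq_ring, 0);
        amdgpu_ring_write(kiq_ring, 0);

        /* once the unmap has been processed, have the CP write a marker
         * to the scratch register so we can poll for completion
         */
        amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_SET_UCONFIG_REG, 1));
        amdgpu_ring_write(kiq_ring, (scratch - PACKET3_SET_UCONFIG_REG_START));
        amdgpu_ring_write(kiq_ring, 0xDEADBEEF);
        amdgpu_ring_commit(kiq_ring);

        /* busy-wait until the marker shows up or we time out */
        for (i = 0; i < adev->usec_timeout; i++) {
                tmp = RREG32(scratch);
                if (tmp == 0xDEADBEEF)
                        break;
                DRM_UDELAY(1);
        }
        if (i >= adev->usec_timeout) {
                DRM_ERROR("KCQ disable failed (scratch(0x%04X)=0x%08X)\n",
                          scratch, tmp);
                r = -EINVAL;
        }
        amdgpu_gfx_scratch_free(adev, scratch);
        return r;
}

hw_fini would then call this once per compute ring against the KIQ ring,
exactly as the per-ASIC loops in the diff below do.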
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 63 ++++++++++++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 64 +++++++++++++++++++++++++++++++++++
 2 files changed, 127 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 0c4a3b8..14be0bd 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -5034,6 +5034,57 @@ static int gfx_v8_0_hw_init(void *handle)
 	return r;
 }
 
+static int gfx_v8_0_kcq_disable(struct amdgpu_ring *kiq_ring, struct amdgpu_ring *ring)
+{
+	struct amdgpu_device *adev = kiq_ring->adev;
+	uint32_t scratch, tmp = 0;
+	int r, i;
+
+	r = amdgpu_gfx_scratch_get(adev, &scratch);
+	if (r) {
+		DRM_ERROR("Failed to get scratch reg (%d).\n", r);
+		return r;
+	}
+	WREG32(scratch, 0xCAFEDEAD);
+
+	r = amdgpu_ring_alloc(kiq_ring, 10);
+	if (r) {
+		DRM_ERROR("Failed to lock KIQ (%d).\n", r);
+		amdgpu_gfx_scratch_free(adev, scratch);
+		return r;
+	}
+
+	/* unmap queues */
+	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_UNMAP_QUEUES, 4));
+	amdgpu_ring_write(kiq_ring, /* Q_sel: 0, vmid: 0, engine: 0, num_Q: 1 */
+			  PACKET3_UNMAP_QUEUES_ACTION(1) | /* RESET_QUEUES */
+			  PACKET3_UNMAP_QUEUES_QUEUE_SEL(0) |
+			  PACKET3_UNMAP_QUEUES_ENGINE_SEL(0) |
+			  PACKET3_UNMAP_QUEUES_NUM_QUEUES(1));
+	amdgpu_ring_write(kiq_ring, PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(ring->doorbell_index));
+	amdgpu_ring_write(kiq_ring, 0);
+	amdgpu_ring_write(kiq_ring, 0);
+	amdgpu_ring_write(kiq_ring, 0);
+	/* write to scratch for completion */
+	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_SET_UCONFIG_REG, 1));
+	amdgpu_ring_write(kiq_ring, (scratch - PACKET3_SET_UCONFIG_REG_START));
+	amdgpu_ring_write(kiq_ring, 0xDEADBEEF);
+	amdgpu_ring_commit(kiq_ring);
+
+	for (i = 0; i < adev->usec_timeout; i++) {
+		tmp = RREG32(scratch);
+		if (tmp == 0xDEADBEEF)
+			break;
+		DRM_UDELAY(1);
+	}
+	if (i >= adev->usec_timeout) {
+		DRM_ERROR("KCQ disable failed (scratch(0x%04X)=0x%08X)\n", scratch, tmp);
+		r = -EINVAL;
+	}
+	amdgpu_gfx_scratch_free(adev, scratch);
+	return r;
+}
+
 static int gfx_v8_0_hw_fini(void *handle)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
@@ -5041,6 +5092,18 @@ static int gfx_v8_0_hw_fini(void *handle)
 	amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
 	amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
 	if (amdgpu_sriov_vf(adev)) {
+		uint32_t value;
+		int i;
+
+		/* disable KCQs to avoid CPC touching memory that is no longer valid */
+		for (i = 0; i < adev->gfx.num_compute_rings; i++)
+			gfx_v8_0_kcq_disable(&adev->gfx.kiq.ring, &adev->gfx.compute_ring[i]);
+
+		/* disable KIQ & HIQ */
+		value = RREG32(mmRLC_CP_SCHEDULERS);
+		value = REG_SET_FIELD(value, RLC_CP_SCHEDULERS, scheduler0, 0);
+		value = REG_SET_FIELD(value, RLC_CP_SCHEDULERS, scheduler1, 0);
+		WREG32(mmRLC_CP_SCHEDULERS, value);
 		pr_debug("For SRIOV client, shouldn't do anything.\n");
 		return 0;
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index e2ae00d..f1f34a8 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -2895,6 +2895,58 @@ static int gfx_v9_0_hw_init(void *handle)
 	return r;
 }
 
+static int gfx_v9_0_kcq_disable(struct amdgpu_ring *kiq_ring, struct amdgpu_ring *ring)
+{
+	struct amdgpu_device *adev = kiq_ring->adev;
+	uint32_t scratch, tmp = 0;
+	int r, i;
+
+	r = amdgpu_gfx_scratch_get(adev, &scratch);
+	if (r) {
+		DRM_ERROR("Failed to get scratch reg (%d).\n", r);
+		return r;
+	}
+	WREG32(scratch, 0xCAFEDEAD);
+
+	r = amdgpu_ring_alloc(kiq_ring, 10);
+	if (r) {
+		DRM_ERROR("Failed to lock KIQ (%d).\n", r);
+		amdgpu_gfx_scratch_free(adev, scratch);
+		return r;
+	}
+
+	/* unmap queues */
+	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_UNMAP_QUEUES, 4));
+	amdgpu_ring_write(kiq_ring, /* Q_sel: 0, vmid: 0, engine: 0, num_Q: 1 */
+			  PACKET3_UNMAP_QUEUES_ACTION(1) | /* RESET_QUEUES */
+			  PACKET3_UNMAP_QUEUES_QUEUE_SEL(0) |
+			  PACKET3_UNMAP_QUEUES_ENGINE_SEL(0) |
+			  PACKET3_UNMAP_QUEUES_NUM_QUEUES(1));
+	amdgpu_ring_write(kiq_ring, PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(ring->doorbell_index));
+	amdgpu_ring_write(kiq_ring, 0);
+	amdgpu_ring_write(kiq_ring, 0);
+	amdgpu_ring_write(kiq_ring, 0);
+	/* write to scratch for completion */
+	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_SET_UCONFIG_REG, 1));
+	amdgpu_ring_write(kiq_ring, (scratch - PACKET3_SET_UCONFIG_REG_START));
+	amdgpu_ring_write(kiq_ring, 0xDEADBEEF);
+	amdgpu_ring_commit(kiq_ring);
+
+	for (i = 0; i < adev->usec_timeout; i++) {
+		tmp = RREG32(scratch);
+		if (tmp == 0xDEADBEEF)
+			break;
+		DRM_UDELAY(1);
+	}
+	if (i >= adev->usec_timeout) {
+		DRM_ERROR("KCQ disable failed (scratch(0x%04X)=0x%08X)\n", scratch, tmp);
+		r = -EINVAL;
+	}
+	amdgpu_gfx_scratch_free(adev, scratch);
+	return r;
+}
+
+
 static int gfx_v9_0_hw_fini(void *handle)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
@@ -2902,6 +2954,18 @@ static int gfx_v9_0_hw_fini(void *handle)
 	amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
 	amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
 	if (amdgpu_sriov_vf(adev)) {
+		uint32_t value;
+		int i;
+
+		/* disable KCQs to avoid CPC touching memory that is no longer valid */
+		for (i = 0; i < adev->gfx.num_compute_rings; i++)
+			gfx_v9_0_kcq_disable(&adev->gfx.kiq.ring, &adev->gfx.compute_ring[i]);
+
+		/* disable KIQ & HIQ */
+		value = RREG32(mmRLC_CP_SCHEDULERS);
+		value = REG_SET_FIELD(value, RLC_CP_SCHEDULERS, scheduler0, 0);
+		value = REG_SET_FIELD(value, RLC_CP_SCHEDULERS, scheduler1, 0);
+		WREG32(mmRLC_CP_SCHEDULERS, value);
 		pr_debug("For SRIOV client, shouldn't do anything.\n");
 		return 0;
 	}
-- 
2.7.4