[SWDEV-126631] - fix hypervisor save_vf fail that occured after driver removed: 1. Because the KIQ and KCQ were not ummapped, save_vf will fail if driver freed mqd of KIQ and KCQ. 2. KIQ can't be unmapped since RLCV always need it, the bo_free on KIQ should be skipped 3. KCQ can be unmapped, and should be unmapped during hw_fini, 4. RLCV still need to access other mc address from some hw even after driver unloaded, So we should not unbind gart for VF. Change-Id: I320487a9a848f41484c5f8cc11be34aca807b424 Signed-off-by: Horace Chen <horace.chen at amd.com> Signed-off-by: Monk Liu <Monk.Liu at amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c | 3 +- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 5 +++ drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 60 +++++++++++++++++++++++++++++++- 3 files changed, 66 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c index f437008..2fee071 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c @@ -394,7 +394,8 @@ int amdgpu_gart_init(struct amdgpu_device *adev) */ void amdgpu_gart_fini(struct amdgpu_device *adev) { - if (adev->gart.ready) { + /* gart is still used by other hw under SRIOV, don't unbind it */ + if (adev->gart.ready && !amdgpu_sriov_vf(adev)) { /* unbind pages */ amdgpu_gart_unbind(adev, 0, adev->gart.num_cpu_pages); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 4f6c68f..bf6656f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -309,6 +309,11 @@ void amdgpu_gfx_compute_mqd_sw_fini(struct amdgpu_device *adev) &ring->mqd_ptr); } + /* don't deallocate KIQ mqd because the bo is still used by RLCV even + the guest VM is shutdown */ + if (amdgpu_sriov_vf(adev)) + return; + ring = &adev->gfx.kiq.ring; kfree(adev->gfx.mec.mqd_backup[AMDGPU_MAX_COMPUTE_RINGS]); amdgpu_bo_free_kernel(&ring->mqd_obj, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 44960b3..a577bbc 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -2892,14 +2892,72 @@ static int gfx_v9_0_hw_init(void *handle) return r; } +static int gfx_v9_0_kcq_disable(struct amdgpu_ring *kiq_ring,struct amdgpu_ring *ring) +{ + struct amdgpu_device *adev = kiq_ring->adev; + uint32_t scratch, tmp = 0; + int r, i; + + r = amdgpu_gfx_scratch_get(adev, &scratch); + if (r) { + DRM_ERROR("Failed to get scratch reg (%d).\n", r); + return r; + } + WREG32(scratch, 0xCAFEDEAD); + + r = amdgpu_ring_alloc(kiq_ring, 10); + if (r) { + DRM_ERROR("Failed to lock KIQ (%d).\n", r); + amdgpu_gfx_scratch_free(adev, scratch); + return r; + } + + /* unmap queues */ + amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_UNMAP_QUEUES, 4)); + amdgpu_ring_write(kiq_ring, /* Q_sel: 0, vmid: 0, engine: 0, num_Q: 1 */ + PACKET3_UNMAP_QUEUES_ACTION(1) | /* RESET_QUEUES */ + PACKET3_UNMAP_QUEUES_QUEUE_SEL(0) | + PACKET3_UNMAP_QUEUES_ENGINE_SEL(0) | + PACKET3_UNMAP_QUEUES_NUM_QUEUES(1)); + amdgpu_ring_write(kiq_ring, PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(ring->doorbell_index)); + amdgpu_ring_write(kiq_ring, 0); + amdgpu_ring_write(kiq_ring, 0); + amdgpu_ring_write(kiq_ring, 0); + /* write to scratch for completion */ + amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_SET_UCONFIG_REG, 1)); + amdgpu_ring_write(kiq_ring, (scratch - PACKET3_SET_UCONFIG_REG_START)); + amdgpu_ring_write(kiq_ring, 0xDEADBEEF); + amdgpu_ring_commit(kiq_ring); + + for (i = 0; i < adev->usec_timeout; i++) { + tmp = RREG32(scratch); + if (tmp == 0xDEADBEEF) + break; + DRM_UDELAY(1); + } + if (i >= adev->usec_timeout) { + DRM_ERROR("KCQ disabled failed (scratch(0x%04X)=0x%08X)\n", scratch, tmp); + r = -EINVAL; + } + amdgpu_gfx_scratch_free(adev, scratch); + return r; +} + + static int gfx_v9_0_hw_fini(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; + int i, r; amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0); amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0); if (amdgpu_sriov_vf(adev)) { - pr_debug("For SRIOV client, shouldn't do anything.\n"); + /* disable KCQ to avoid CPC touch memory not valid anymore */ + for (i = 0; i < adev->gfx.num_compute_rings; i++) { + r = gfx_v9_0_kcq_disable(&adev->gfx.kiq.ring, &adev->gfx.compute_ring[i]); + if (r) + return r; + } return 0; } gfx_v9_0_cp_enable(adev, false); -- 2.7.4