Under SR-IOV, the CPU-based TLB flush sometimes times out within the given 100ms period. Instead of letting it fail and continuing, we can give it more chances by repeating the TLB flush on the failed VMHUB. This could fix the massive "Timeout waiting for VM flush ACK" errors seen during the vk_encoder test. Signed-off-by: Monk Liu <Monk.Liu at amd.com> --- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index a70cbc4..517712b 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -329,13 +329,18 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, { /* Use register 17 for GART */ const unsigned eng = 17; - unsigned i, j; + unsigned i, j, loop = 0; + unsigned flush_done = 0; + +retry: spin_lock(&adev->gmc.invalidate_lock); for (i = 0; i < AMDGPU_MAX_VMHUBS; ++i) { struct amdgpu_vmhub *hub = &adev->vmhub[i]; u32 tmp = gmc_v9_0_get_invalidate_req(vmid); + if (flush_done & (1 << i)) /* this vmhub flushed */ + continue; WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp); @@ -347,8 +352,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, break; cpu_relax(); } - if (j < 100) + if (j < 100) { + flush_done |= (1 << i); continue; + } /* Wait for ACK with a delay.*/ for (j = 0; j < adev->usec_timeout; j++) { @@ -358,15 +365,22 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, break; udelay(1); } - if (j < adev->usec_timeout) + if (j < adev->usec_timeout) { + flush_done |= (1 << i); continue; - - DRM_ERROR("Timeout waiting for VM flush ACK!\n"); + } } spin_unlock(&adev->gmc.invalidate_lock); + if (flush_done != 3) { + if (loop++ < 3) + goto retry; + else + DRM_ERROR("Timeout waiting for VM flush ACK!\n"); + } } + static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring, unsigned vmid, uint64_t pd_addr) { -- 2.7.4