Am 20.03.2018 um 07:29 schrieb Emily Deng: > under SR-IOV sometimes CPU based tlb flush would timeout > within the given 100ms period, instead let it fail and > continue we can give it more chance to repeat the > tlb flush on the failed VMHUB > > this could fix the massive "Timeout waiting for VM flush ACK" > error during vk_encoder test. Well that one is a big NAK since it once more just hides the real problem that we sometimes drop register writes. What we did during debugging to avoid the problem is the following: > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > index a70cbc45c4c1..3536d50375fa 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > @@ -338,6 +338,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct > amdgpu_device *adev, >                u32 tmp = gmc_v9_0_get_invalidate_req(vmid); > >                WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp); > +              while (RREG32_NO_KIQ(hub->vm_inv_eng0_req + eng) != tmp) { > +                      DRM_ERROR("Need one more try to write the > VMHUB flush request!"); > +                      WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp); > +              } > >                /* Busy wait for ACK.*/ >                for (j = 0; j < 100; j++) { But that can only be a temporary workaround as well. The question is rather can you reliably reproduce this issue with the vk_encoder test? Thanks, Christian. 
> > Signed-off-by: Monk Liu <Monk.Liu at amd.com> > --- > drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 24 +++++++++++++++++++----- > 1 file changed, 19 insertions(+), 5 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > index a70cbc4..517712b 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > @@ -329,13 +329,18 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, > { > /* Use register 17 for GART */ > const unsigned eng = 17; > - unsigned i, j; > + unsigned i, j, loop = 0; > + unsigned flush_done = 0; > + > +retry: > > spin_lock(&adev->gmc.invalidate_lock); > > for (i = 0; i < AMDGPU_MAX_VMHUBS; ++i) { > struct amdgpu_vmhub *hub = &adev->vmhub[i]; > u32 tmp = gmc_v9_0_get_invalidate_req(vmid); > + if (flush_done & (1 << i)) /* this vmhub flushed */ > + continue; > > WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp); > > @@ -347,8 +352,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, > break; > cpu_relax(); > } > - if (j < 100) > + if (j < 100) { > + flush_done |= (1 << i); > continue; > + } > > /* Wait for ACK with a delay.*/ > for (j = 0; j < adev->usec_timeout; j++) { > @@ -358,15 +365,22 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, > break; > udelay(1); > } > - if (j < adev->usec_timeout) > + if (j < adev->usec_timeout) { > + flush_done |= (1 << i); > continue; > - > - DRM_ERROR("Timeout waiting for VM flush ACK!\n"); > + } > } > > spin_unlock(&adev->gmc.invalidate_lock); > + if (flush_done != 3) { > + if (loop++ < 3) > + goto retry; > + else > + DRM_ERROR("Timeout waiting for VM flush ACK!\n"); > + } > } > > + > static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring, > unsigned vmid, uint64_t pd_addr) > {