[PATCH] drm/amdgpu: give more chance for tlb flush if failed

ckoenig.leichtzumerken@xxxxxxxxx (Christian König) · Tue, 20 Mar 2018 11:23:53 +0100

Am 20.03.2018 um 07:29 schrieb Emily Deng:
> under SR-IOV sometimes CPU based tlb flush would timeout
> within the given 100ms period, instead let it fail and
> continue we can give it more chance to repeat the
> tlb flush on the failed VMHUB
>
> this could fix the massive "Timeout waiting for VM flush ACK"
> error during vk_encoder test.

Well that one is a big NAK since it once more just hides the real 
problem that we sometimes drop register writes.

What we did during debugging to avoid the problem is the following:
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index a70cbc45c4c1..3536d50375fa 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -338,6 +338,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct 
> amdgpu_device *adev,
> Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â  u32 tmp = gmc_v9_0_get_invalidate_req(vmid);
>
> Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â  WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
> +Â Â Â Â Â Â Â Â Â Â Â Â Â Â  while (RREG32_NO_KIQ(hub->vm_inv_eng0_req + eng) != tmp) {
> +Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â  DRM_ERROR("Need one more try to write the 
> VMHUB flush request!");
> +Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â  WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
> +Â Â Â Â Â Â Â Â Â Â Â Â Â Â  }
>
> Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â  /* Busy wait for ACK.*/
> Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â  for (j = 0; j < 100; j++) {

But that can only be a temporary workaround as well.

The question is rather can you reliable reproduce this issue with the 
vk_encoder test?

Thanks,
Christian.

>
> Signed-off-by: Monk Liu <Monk.Liu at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 24 +++++++++++++++++++-----
>   1 file changed, 19 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index a70cbc4..517712b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -329,13 +329,18 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev,
>   {
>   	/* Use register 17 for GART */
>   	const unsigned eng = 17;
> -	unsigned i, j;
> +	unsigned i, j, loop = 0;
> +	unsigned flush_done = 0;
> +
> +retry:
>   
>   	spin_lock(&adev->gmc.invalidate_lock);
>   
>   	for (i = 0; i < AMDGPU_MAX_VMHUBS; ++i) {
>   		struct amdgpu_vmhub *hub = &adev->vmhub[i];
>   		u32 tmp = gmc_v9_0_get_invalidate_req(vmid);
> +		if (flush_done & (1 << i)) /* this vmhub flushed */
> +			continue;
>   
>   		WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
>   
> @@ -347,8 +352,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev,
>   				break;
>   			cpu_relax();
>   		}
> -		if (j < 100)
> +		if (j < 100) {
> +			flush_done |= (1 << i);
>   			continue;
> +		}
>   
>   		/* Wait for ACK with a delay.*/
>   		for (j = 0; j < adev->usec_timeout; j++) {
> @@ -358,15 +365,22 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev,
>   				break;
>   			udelay(1);
>   		}
> -		if (j < adev->usec_timeout)
> +		if (j < adev->usec_timeout) {
> +			flush_done |= (1 << i);
>   			continue;
> -
> -		DRM_ERROR("Timeout waiting for VM flush ACK!\n");
> +		}
>   	}
>   
>   	spin_unlock(&adev->gmc.invalidate_lock);
> +	if (flush_done != 3) {
> +		if (loop++ < 3)
> +			goto retry;
> +		else
> +			DRM_ERROR("Timeout waiting for VM flush ACK!\n");
> +	}
>   }
>   
> +
>   static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>   					    unsigned vmid, uint64_t pd_addr)
>   {