RE: [PATCH 2/2] drm/amdgpu: invalidate mmhub semphore workaround in gmc9/gmc10

"Zhu, Changfeng" <Changfeng.Zhu@xxxxxxx> · Thu, 21 Nov 2019 15:53:13 +0000

Hi Chris,

I have removed DRM_WARN_ONCE.

I think we can write mmhub patch firstly. And It's for ticket:
http://ontrack-internal.amd.com/browse/SWDEV-201459

According to Yang,Zilong's comments on this issue,
GFXHUB is not applicable to the bug thus doesn't need the w/a.

I'll see gfxhub hang root cause with Lisa in the following time.

Could you please help review my new patch(remove DRM_WARN_ONCE)?

BR,
Changfeng.

-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@xxxxxxxxx> 
Sent: Wednesday, November 20, 2019 7:27 PM
To: Zhu, Changfeng <Changfeng.Zhu@xxxxxxx>; Koenig, Christian <Christian.Koenig@xxxxxxx>; Xiao, Jack <Jack.Xiao@xxxxxxx>; Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Huang, Ray <Ray.Huang@xxxxxxx>; Huang, Shimmer <Xinmei.Huang@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Subject: Re: [PATCH 2/2] drm/amdgpu: invalidate mmhub semphore workaround in gmc9/gmc10

Am 20.11.19 um 10:44 schrieb Changfeng.Zhu:
> From: changzhu <Changfeng.Zhu@xxxxxxx>
>
> It may lose gpuvm invalidate acknowldege state across power-gating off 
> cycle. To avoid this issue in gmc9/gmc10 invalidation, add semaphore 
> acquire before invalidation and semaphore release after invalidation.
>
> After adding semaphore acquire before invalidation, the semaphore 
> register become read-only if another process try to acquire semaphore.
> Then it will not be able to release this semaphore. Then it may cause 
> deadlock problem. If this deadlock problem happens, it needs a 
> semaphore firmware fix.

Please remove the DRM_WARN_ONCE, that looks like overkill to me.

And I'm not sure how urgent that issue here is. We could also wait a few more days and see if the hw guys figure out why this lockups on the GFX ring.

Regards,
Christian.

>
> Change-Id: I9942a2f451265c1f1038ccfe2f70042c7c8118af
> Signed-off-by: changzhu <Changfeng.Zhu@xxxxxxx>
> ---
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 49 ++++++++++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 49 ++++++++++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/soc15.h     |  4 +--
>   3 files changed, 100 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index af2615ba52aa..685d0d5ef31e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -234,6 +234,24 @@ static void gmc_v10_0_flush_vm_hub(struct amdgpu_device *adev, uint32_t vmid,
>   	const unsigned eng = 17;
>   	unsigned int i;
>   
> +	spin_lock(&adev->gmc.invalidate_lock);
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +	for (i = 0; i < adev->usec_timeout; i++) {
> +		/* a read return value of 1 means semaphore acuqire */
> +		tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng);
> +		if (tmp & 0x1)
> +			break;
> +		udelay(1);
> +	}
> +
> +	if (i >= adev->usec_timeout)
> +		DRM_ERROR("Timeout waiting for sem acquire in VM flush!\n");
> +
>   	WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
>   
>   	/*
> @@ -253,6 +271,14 @@ static void gmc_v10_0_flush_vm_hub(struct amdgpu_device *adev, uint32_t vmid,
>   		udelay(1);
>   	}
>   
> +	/*
> +	 * add semaphore release after invalidation,
> +	 * write with 0 means semaphore release
> +	 */
> +	WREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng, 0);
> +
> +	spin_unlock(&adev->gmc.invalidate_lock);
> +
>   	if (i < adev->usec_timeout)
>   		return;
>   
> @@ -338,6 +364,21 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>   	uint32_t req = gmc_v10_0_get_invalidate_req(vmid, 0);
>   	unsigned eng = ring->vm_inv_eng;
>   
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +
> +	/* a read return value of 1 means semaphore acuqire */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1) {
> +		amdgpu_ring_emit_reg_wait(ring,
> +					  hub->vm_inv_eng0_sem + eng, 0x1, 0x1);
> +		DRM_WARN_ONCE("Adding semaphore may cause deadlock and it needs firmware fix\n");
> +	}
> +
>   	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 + (2 * vmid),
>   			      lower_32_bits(pd_addr));
>   
> @@ -348,6 +389,14 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>   					    hub->vm_inv_eng0_ack + eng,
>   					    req, 1 << vmid);
>   
> +	/*
> +	 * add semaphore release after invalidation,
> +	 * write with 0 means semaphore release
> +	 */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +		amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_sem + eng, 0);
> +
>   	return pd_addr;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 1ae59af7836a..c4118cbb0fbe 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -456,6 +456,24 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   	}
>   
>   	spin_lock(&adev->gmc.invalidate_lock);
> +
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +	for (j = 0; j < adev->usec_timeout; j++) {
> +		/* a read return value of 1 means semaphore acuqire */
> +		tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng);
> +		if (tmp & 0x1)
> +			break;
> +		udelay(1);
> +	}
> +
> +	if (j >= adev->usec_timeout)
> +		DRM_ERROR("Timeout waiting for sem acquire in VM flush!\n");
> +
>   	WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
>   
>   	/*
> @@ -471,7 +489,15 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   			break;
>   		udelay(1);
>   	}
> +
> +	/*
> +	 * add semaphore release after invalidation,
> +	 * write with 0 means semaphore release
> +	 */
> +	WREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng, 0);
> +
>   	spin_unlock(&adev->gmc.invalidate_lock);
> +
>   	if (j < adev->usec_timeout)
>   		return;
>   
> @@ -486,6 +512,21 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>   	uint32_t req = gmc_v9_0_get_invalidate_req(vmid, 0);
>   	unsigned eng = ring->vm_inv_eng;
>   
> +	/*
> +	 * It may lose gpuvm invalidate acknowldege state across power-gating
> +	 * off cycle, add semaphore acquire before invalidation and semaphore
> +	 * release after invalidation to avoid entering power gated state
> +	 * to WA the Issue
> +	 */
> +
> +	/* a read return value of 1 means semaphore acuqire */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1) {
> +		amdgpu_ring_emit_reg_wait(ring,
> +					  hub->vm_inv_eng0_sem + eng, 0x1, 0x1);
> +		DRM_WARN_ONCE("Adding semaphore may cause deadlock and it needs firmware fix\n");
> +	}
> +
>   	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 + (2 * vmid),
>   			      lower_32_bits(pd_addr));
>   
> @@ -496,6 +537,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>   					    hub->vm_inv_eng0_ack + eng,
>   					    req, 1 << vmid);
>   
> +	/*
> +	 * add semaphore release after invalidation,
> +	 * write with 0 means semaphore release
> +	 */
> +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> +		amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_sem + eng, 0);
> +
>   	return pd_addr;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.h 
> b/drivers/gpu/drm/amd/amdgpu/soc15.h
> index 9af6c6ffbfa2..57af489a5de3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/soc15.h
> +++ b/drivers/gpu/drm/amd/amdgpu/soc15.h
> @@ -28,8 +28,8 @@
>   #include "nbio_v7_0.h"
>   #include "nbio_v7_4.h"
>   
> -#define SOC15_FLUSH_GPU_TLB_NUM_WREG		4
> -#define SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT	1
> +#define SOC15_FLUSH_GPU_TLB_NUM_WREG		6
> +#define SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT	3
>   
>   extern const struct amd_ip_funcs soc15_common_ip_funcs;
>   

_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx