Re: [PATCH 2/2] drm/amdgpu: invalidate mmhub semphore workaround in gmc9/gmc10

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Am 20.11.19 um 10:44 schrieb Changfeng.Zhu:
From: changzhu <Changfeng.Zhu@xxxxxxx>

It may lose gpuvm invalidate acknowldege state across power-gating off
cycle. To avoid this issue in gmc9/gmc10 invalidation, add semaphore acquire
before invalidation and semaphore release after invalidation.

After adding semaphore acquire before invalidation, the semaphore
register become read-only if another process try to acquire semaphore.
Then it will not be able to release this semaphore. Then it may cause
deadlock problem. If this deadlock problem happens, it needs a semaphore
firmware fix.

Please remove the DRM_WARN_ONCE, that looks like overkill to me.

And I'm not sure how urgent that issue here is. We could also wait a few more days and see if the hw guys figure out why this lockups on the GFX ring.

Regards,
Christian.


Change-Id: I9942a2f451265c1f1038ccfe2f70042c7c8118af
Signed-off-by: changzhu <Changfeng.Zhu@xxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 49 ++++++++++++++++++++++++++
  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 49 ++++++++++++++++++++++++++
  drivers/gpu/drm/amd/amdgpu/soc15.h     |  4 +--
  3 files changed, 100 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index af2615ba52aa..685d0d5ef31e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -234,6 +234,24 @@ static void gmc_v10_0_flush_vm_hub(struct amdgpu_device *adev, uint32_t vmid,
  	const unsigned eng = 17;
  	unsigned int i;
+ spin_lock(&adev->gmc.invalidate_lock);
+	/*
+	 * It may lose gpuvm invalidate acknowldege state across power-gating
+	 * off cycle, add semaphore acquire before invalidation and semaphore
+	 * release after invalidation to avoid entering power gated state
+	 * to WA the Issue
+	 */
+	for (i = 0; i < adev->usec_timeout; i++) {
+		/* a read return value of 1 means semaphore acuqire */
+		tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng);
+		if (tmp & 0x1)
+			break;
+		udelay(1);
+	}
+
+	if (i >= adev->usec_timeout)
+		DRM_ERROR("Timeout waiting for sem acquire in VM flush!\n");
+
  	WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
/*
@@ -253,6 +271,14 @@ static void gmc_v10_0_flush_vm_hub(struct amdgpu_device *adev, uint32_t vmid,
  		udelay(1);
  	}
+ /*
+	 * add semaphore release after invalidation,
+	 * write with 0 means semaphore release
+	 */
+	WREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng, 0);
+
+	spin_unlock(&adev->gmc.invalidate_lock);
+
  	if (i < adev->usec_timeout)
  		return;
@@ -338,6 +364,21 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
  	uint32_t req = gmc_v10_0_get_invalidate_req(vmid, 0);
  	unsigned eng = ring->vm_inv_eng;
+ /*
+	 * It may lose gpuvm invalidate acknowldege state across power-gating
+	 * off cycle, add semaphore acquire before invalidation and semaphore
+	 * release after invalidation to avoid entering power gated state
+	 * to WA the Issue
+	 */
+
+	/* a read return value of 1 means semaphore acuqire */
+	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
+	    ring->funcs->vmhub == AMDGPU_MMHUB_1) {
+		amdgpu_ring_emit_reg_wait(ring,
+					  hub->vm_inv_eng0_sem + eng, 0x1, 0x1);
+		DRM_WARN_ONCE("Adding semaphore may cause deadlock and it needs firmware fix\n");
+	}
+
  	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 + (2 * vmid),
  			      lower_32_bits(pd_addr));
@@ -348,6 +389,14 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
  					    hub->vm_inv_eng0_ack + eng,
  					    req, 1 << vmid);
+ /*
+	 * add semaphore release after invalidation,
+	 * write with 0 means semaphore release
+	 */
+	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
+	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
+		amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_sem + eng, 0);
+
  	return pd_addr;
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 1ae59af7836a..c4118cbb0fbe 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -456,6 +456,24 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
  	}
spin_lock(&adev->gmc.invalidate_lock);
+
+	/*
+	 * It may lose gpuvm invalidate acknowldege state across power-gating
+	 * off cycle, add semaphore acquire before invalidation and semaphore
+	 * release after invalidation to avoid entering power gated state
+	 * to WA the Issue
+	 */
+	for (j = 0; j < adev->usec_timeout; j++) {
+		/* a read return value of 1 means semaphore acuqire */
+		tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng);
+		if (tmp & 0x1)
+			break;
+		udelay(1);
+	}
+
+	if (j >= adev->usec_timeout)
+		DRM_ERROR("Timeout waiting for sem acquire in VM flush!\n");
+
  	WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
/*
@@ -471,7 +489,15 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
  			break;
  		udelay(1);
  	}
+
+	/*
+	 * add semaphore release after invalidation,
+	 * write with 0 means semaphore release
+	 */
+	WREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng, 0);
+
  	spin_unlock(&adev->gmc.invalidate_lock);
+
  	if (j < adev->usec_timeout)
  		return;
@@ -486,6 +512,21 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
  	uint32_t req = gmc_v9_0_get_invalidate_req(vmid, 0);
  	unsigned eng = ring->vm_inv_eng;
+ /*
+	 * It may lose gpuvm invalidate acknowldege state across power-gating
+	 * off cycle, add semaphore acquire before invalidation and semaphore
+	 * release after invalidation to avoid entering power gated state
+	 * to WA the Issue
+	 */
+
+	/* a read return value of 1 means semaphore acuqire */
+	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
+	    ring->funcs->vmhub == AMDGPU_MMHUB_1) {
+		amdgpu_ring_emit_reg_wait(ring,
+					  hub->vm_inv_eng0_sem + eng, 0x1, 0x1);
+		DRM_WARN_ONCE("Adding semaphore may cause deadlock and it needs firmware fix\n");
+	}
+
  	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 + (2 * vmid),
  			      lower_32_bits(pd_addr));
@@ -496,6 +537,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
  					    hub->vm_inv_eng0_ack + eng,
  					    req, 1 << vmid);
+ /*
+	 * add semaphore release after invalidation,
+	 * write with 0 means semaphore release
+	 */
+	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
+	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
+		amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_sem + eng, 0);
+
  	return pd_addr;
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.h b/drivers/gpu/drm/amd/amdgpu/soc15.h
index 9af6c6ffbfa2..57af489a5de3 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.h
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.h
@@ -28,8 +28,8 @@
  #include "nbio_v7_0.h"
  #include "nbio_v7_4.h"
-#define SOC15_FLUSH_GPU_TLB_NUM_WREG 4
-#define SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT	1
+#define SOC15_FLUSH_GPU_TLB_NUM_WREG		6
+#define SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT	3
extern const struct amd_ip_funcs soc15_common_ip_funcs;

_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux