Re: [PATCH] drm/amdgpu: change the fence ring wait timeout

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Am 13.01.21 um 07:36 schrieb Roy Sun:
This fixes a bug where, when the engine hangs, the fence ring waits indefinitely and causes a kernel crash.

NAK, this blocking is intentionally unlimited because otherwise we would cause memory corruption.

What is the actual bug you are trying to fix here?

Regards,
Christian.


Signed-off-by: Roy Sun <Roy.Sun@xxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 48 ++++++++++++++++++++---
  1 file changed, 43 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 6b0aeee61b8b..738ea65077ea 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -41,6 +41,8 @@
  #include "amdgpu.h"
  #include "amdgpu_trace.h"
+#define AMDGPU_FENCE_TIMEOUT msecs_to_jiffies(1000)
+#define AMDGPU_FENCE_GFX_XGMI_TIMEOUT msecs_to_jiffies(2000)
  /*
   * Fences
   * Fences mark an event in the GPUs pipeline and are used
@@ -104,6 +106,38 @@ static void amdgpu_fence_write(struct amdgpu_ring *ring, u32 seq)
  		*drv->cpu_addr = cpu_to_le32(seq);
  }
+/**
+ * amdgpu_fence_wait_timeout - get the fence wait timeout
+ *
+ * @ring: ring the fence is associated with
+ *
+ * Returns the value of the fence wait timeout.
+ */
+long amdgpu_fence_wait_timeout(struct amdgpu_ring *ring)
+{
+	long tmo_gfx, tmo_mm, tmo;
+	struct amdgpu_device *adev = ring->adev;
+	tmo_mm = tmo_gfx = AMDGPU_FENCE_TIMEOUT;
+	if (amdgpu_sriov_vf(adev)) {
+		tmo_mm = 8 * AMDGPU_FENCE_TIMEOUT;
+	}
+	if (amdgpu_sriov_runtime(adev)) {
+		tmo_gfx = 8 * AMDGPU_FENCE_TIMEOUT;
+	} else if (adev->gmc.xgmi.hive_id) {
+		tmo_gfx = AMDGPU_FENCE_GFX_XGMI_TIMEOUT;
+	}
+	if (ring->funcs->type == AMDGPU_RING_TYPE_UVD ||
+		ring->funcs->type == AMDGPU_RING_TYPE_VCE ||
+		ring->funcs->type == AMDGPU_RING_TYPE_UVD_ENC ||
+		ring->funcs->type == AMDGPU_RING_TYPE_VCN_DEC ||
+		ring->funcs->type == AMDGPU_RING_TYPE_VCN_ENC ||
+		ring->funcs->type == AMDGPU_RING_TYPE_VCN_JPEG)
+		tmo = tmo_mm;
+	else
+		tmo = tmo_gfx;
+	return tmo;
+}
+
  /**
   * amdgpu_fence_read - read a fence value
   *
@@ -166,10 +200,12 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
  		rcu_read_unlock();
if (old) {
-			r = dma_fence_wait(old, false);
+			long timeout;
+			timeout = amdgpu_fence_wait_timeout(ring);
+			r = dma_fence_wait_timeout(old, false, timeout);
  			dma_fence_put(old);
  			if (r)
-				return r;
+				return r < 0 ? r : 0;
  		}
  	}
@@ -343,10 +379,12 @@ int amdgpu_fence_wait_empty(struct amdgpu_ring *ring)
  		return 0;
  	}
  	rcu_read_unlock();
-
-	r = dma_fence_wait(fence, false);
+	
+	long timeout;
+	timeout = amdgpu_fence_wait_timeout(ring);
+	r = dma_fence_wait_timeout(fence, false, timeout);
  	dma_fence_put(fence);
-	return r;
+	return r < 0 ? r : 0;
  }
/**

_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx



[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux