Hi Christian,

amdgpu_device_skip_hw_access will always assert in the reset
thread, which does not seem like a good idea.
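To illustrate what I mean, here is a stripped-down sketch of the flow
(all names are placeholders, not the real driver structures; I am
assuming the reset path holds the write side of reset_sem, which as far
as I can see amdgpu_device_lock_adev does):

	/* Stand-alone sketch, not driver code. */
	#include <linux/rwsem.h>
	#include <linux/lockdep.h>

	static DECLARE_RWSEM(reset_sem);	/* stands in for adev->reset_sem */

	/* mirrors the helper added by the patch below */
	static bool skip_hw_access(void)
	{
	#ifdef CONFIG_LOCKDEP
		if (down_read_trylock(&reset_sem))
			up_read(&reset_sem);
		else
			/* every access during reset falls through to here */
			lockdep_assert_held(&reset_sem);
	#endif
		return false;
	}

	static void gpu_reset_thread(void)
	{
		down_write(&reset_sem);		/* reset in progress */
		/*
		 * Any register access from here on fails the trylock
		 * above, so the lockdep check is evaluated on every
		 * single access from the reset path.
		 */
		skip_hw_access();
		up_write(&reset_sem);
	}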
Best Regards
Dennis Li
-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@xxxxxxxxx>
Sent: Tuesday, March 9, 2021 2:07 AM
To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Cc: Grodzovsky, Andrey <Andrey.Grodzovsky@xxxxxxx>; Li, Dennis
<Dennis.Li@xxxxxxx>
Subject: [PATCH] drm/amdgpu: capture invalid hardware access v2
From: Dennis Li <Dennis.Li@xxxxxxx>
When the recovery thread has begun a GPU reset, no other threads
should access the hardware; otherwise the system randomly hangs.

v2 (chk): rewritten from scratch, use trylock and lockdep instead of
hand wiring the logic.
Signed-off-by: Dennis Li <Dennis.Li@xxxxxxx>
Signed-off-by: Christian König <christian.koenig@xxxxxxx>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 74 +++++++++++++++++-----
 1 file changed, 57 insertions(+), 17 deletions(-)
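For reviewers, not part of the patch: a minimal sketch of the lockdep
semantics the comment in the new helper relies on. lockdep_assert_held()
is satisfied by holding either side of the rwsem (assuming
CONFIG_LOCKDEP=y; names are placeholders):

	#include <linux/rwsem.h>
	#include <linux/lockdep.h>

	static DECLARE_RWSEM(reset_sem);	/* placeholder for adev->reset_sem */

	static void lockdep_semantics_demo(void)
	{
		down_write(&reset_sem);
		lockdep_assert_held(&reset_sem);	/* quiet: write side held */
		up_write(&reset_sem);

		down_read(&reset_sem);
		lockdep_assert_held(&reset_sem);	/* quiet: read side held */
		up_read(&reset_sem);

		lockdep_assert_held(&reset_sem);	/* WARNs: nothing held */
	}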
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e247c3a2ec08..c990af6a43ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -326,6 +326,34 @@ void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
 /*
  * register access helper functions.
  */
+
+/* Check if hw access should be skipped because of hotplug or device error */
+static bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
+{
+	if (adev->in_pci_err_recovery)
+		return true;
+
+#ifdef CONFIG_LOCKDEP
+	/*
+	 * This is a bit complicated to understand, so worth a comment. What we assert
+	 * here is that the GPU reset is not running on another thread in parallel.
+	 *
+	 * For this we trylock the read side of the reset semaphore; if that succeeds
+	 * we know that the reset is not running in parallel.
+	 *
+	 * If the trylock fails we assert that we are either already holding the read
+	 * side of the lock or are the reset thread itself and hold the write side of
+	 * the lock.
+	 */
+	if (down_read_trylock(&adev->reset_sem))
+		up_read(&adev->reset_sem);
+	else
+		lockdep_assert_held(&adev->reset_sem);
+#endif
+
+	return false;
+}
+
 /**
  * amdgpu_device_rreg - read a memory mapped IO or indirect register
  *
@@ -340,7 +368,7 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
 {
 	uint32_t ret;
 
-	if (adev->in_pci_err_recovery)
+	if (amdgpu_device_skip_hw_access(adev))
 		return 0;
 
 	if ((reg * 4) < adev->rmmio_size) {
@@ -377,7 +405,7 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
  */
 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
 {
-	if (adev->in_pci_err_recovery)
+	if (amdgpu_device_skip_hw_access(adev))
 		return 0;
 
 	if (offset < adev->rmmio_size)
@@ -402,7 +430,7 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
  */
 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
 {
-	if (adev->in_pci_err_recovery)
+	if (amdgpu_device_skip_hw_access(adev))
 		return;
 
 	if (offset < adev->rmmio_size)
@@ -425,7 +453,7 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
 			uint32_t reg, uint32_t v,
 			uint32_t acc_flags)
 {
-	if (adev->in_pci_err_recovery)
+	if (amdgpu_device_skip_hw_access(adev))
 		return;
 
 	if ((reg * 4) < adev->rmmio_size) {
@@ -452,7 +480,7 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
 			     uint32_t reg, uint32_t v)
 {
-	if (adev->in_pci_err_recovery)
+	if (amdgpu_device_skip_hw_access(adev))
 		return;
 
 	if (amdgpu_sriov_fullaccess(adev) &&
@@ -475,7 +503,7 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
  */
 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
 {
-	if (adev->in_pci_err_recovery)
+	if (amdgpu_device_skip_hw_access(adev))
 		return 0;
 
 	if ((reg * 4) < adev->rio_mem_size)
@@ -497,7 +525,7 @@ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
  */
 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
 {
-	if (adev->in_pci_err_recovery)
+	if (amdgpu_device_skip_hw_access(adev))
 		return;
 
 	if ((reg * 4) < adev->rio_mem_size)
@@ -519,7 +547,7 @@ void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
  */
 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
 {
-	if (adev->in_pci_err_recovery)
+	if (amdgpu_device_skip_hw_access(adev))
 		return 0;
 
 	if (index < adev->doorbell.num_doorbells) {
@@ -542,7 +570,7 @@ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
  */
 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
 {
-	if (adev->in_pci_err_recovery)
+	if (amdgpu_device_skip_hw_access(adev))
 		return;
 
 	if (index < adev->doorbell.num_doorbells) {
@@ -563,7 +591,7 @@ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
  */
 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
 {
-	if (adev->in_pci_err_recovery)
+	if (amdgpu_device_skip_hw_access(adev))
 		return 0;
 
 	if (index < adev->doorbell.num_doorbells) {
@@ -586,7 +614,7 @@ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
  */
 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
 {
-	if (adev->in_pci_err_recovery)
+	if (amdgpu_device_skip_hw_access(adev))
 		return;
 
 	if (index < adev->doorbell.num_doorbells) {
@@ -610,10 +638,13 @@ u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
 				u32 pcie_index, u32 pcie_data,
 				u32 reg_addr)
 {
-	unsigned long flags;
-	u32 r;
 	void __iomem *pcie_index_offset;
 	void __iomem *pcie_data_offset;
+	unsigned long flags;
+	u32 r;
+
+	if (amdgpu_device_skip_hw_access(adev))
+		return 0;
 
 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
@@ -641,10 +672,13 @@ u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
 				  u32 pcie_index, u32 pcie_data,
 				  u32 reg_addr)
 {
-	unsigned long flags;
-	u64 r;
 	void __iomem *pcie_index_offset;
 	void __iomem *pcie_data_offset;
+	unsigned long flags;
+	u64 r;
+
+	if (amdgpu_device_skip_hw_access(adev))
+		return 0;
 
 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
@@ -677,9 +711,12 @@ void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
 				 u32 pcie_index, u32 pcie_data,
 				 u32 reg_addr, u32 reg_data)
 {
-	unsigned long flags;
 	void __iomem *pcie_index_offset;
 	void __iomem *pcie_data_offset;
+	unsigned long flags;
+
+	if (amdgpu_device_skip_hw_access(adev))
+		return;
 
 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
@@ -706,9 +743,12 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
 				   u32 pcie_index, u32 pcie_data,
 				   u32 reg_addr, u64 reg_data)
 {
-	unsigned long flags;
 	void __iomem *pcie_index_offset;
 	void __iomem *pcie_data_offset;
+	unsigned long flags;
+
+	if (amdgpu_device_skip_hw_access(adev))
+		return;
 
 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
--
2.25.1