[AMD Official Use Only - Internal Distribution Only]

Hi Christian,

amdgpu_device_skip_hw_access() will always assert in the reset thread, which does not seem like a good idea.

Best Regards
Dennis Li
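For reference, the trylock/lockdep pattern the patch introduces can be sketched in isolation. This is a minimal illustration only — the demo_* names are hypothetical and a plain rw_semaphore stands in for adev->reset_sem; it is not part of the patch below:

/*
 * Minimal sketch of the check in amdgpu_device_skip_hw_access().
 * demo_reset_sem and demo_assert_no_concurrent_reset() are
 * hypothetical illustration names, not part of the amdgpu patch.
 */
#include <linux/rwsem.h>
#include <linux/lockdep.h>

static DECLARE_RWSEM(demo_reset_sem);	/* stand-in for adev->reset_sem */

static void demo_assert_no_concurrent_reset(void)
{
	/*
	 * If the read side can be taken, no writer (i.e. no GPU reset)
	 * is running in parallel; drop the lock again right away.
	 */
	if (down_read_trylock(&demo_reset_sem))
		up_read(&demo_reset_sem);
	else
		/*
		 * The trylock failed, so a writer holds or is waiting
		 * for the lock. lockdep_assert_held() is satisfied by
		 * either a read or a write hold, so it only fires for a
		 * thread that holds neither side of the semaphore.
		 */
		lockdep_assert_held(&demo_reset_sem);
}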
-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@xxxxxxxxx>
Sent: Tuesday, March 9, 2021 2:07 AM
To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Cc: Grodzovsky, Andrey <Andrey.Grodzovsky@xxxxxxx>; Li, Dennis <Dennis.Li@xxxxxxx>
Subject: [PATCH] drm/amdgpu: capture invalid hardware access v2

From: Dennis Li <Dennis.Li@xxxxxxx>

When the recovery thread has begun a GPU reset, no other threads should access the hardware; otherwise the system randomly hangs.

v2 (chk): rewritten from scratch, use trylock and lockdep instead of hand-wiring the logic.

Signed-off-by: Dennis Li <Dennis.Li@xxxxxxx>
Signed-off-by: Christian König <christian.koenig@xxxxxxx>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 74 +++++++++++++++++-----
 1 file changed, 57 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e247c3a2ec08..c990af6a43ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -326,6 +326,34 @@ void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
 /*
  * register access helper functions.
  */
+
+/* Check if hw access should be skipped because of hotplug or device error */
+static bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
+{
+	if (adev->in_pci_err_recovery)
+		return true;
+
+#ifdef CONFIG_LOCKDEP
+	/*
+	 * This is a bit complicated to understand, so worth a comment. What we assert
+	 * here is that the GPU reset is not running on another thread in parallel.
+	 *
+	 * For this we trylock the read side of the reset semaphore, if that succeeds
+	 * we know that the reset is not running in parallel.
+	 *
+	 * If the trylock fails we assert that we are either already holding the read
+	 * side of the lock or are the reset thread itself and hold the write side of
+	 * the lock.
+	 */
+	if (down_read_trylock(&adev->reset_sem))
+		up_read(&adev->reset_sem);
+	else
+		lockdep_assert_held(&adev->reset_sem);
+#endif
+
+	return false;
+}
+
 /**
  * amdgpu_device_rreg - read a memory mapped IO or indirect register
  *
@@ -340,7 +368,7 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
 {
 	uint32_t ret;
 
-	if (adev->in_pci_err_recovery)
+	if (amdgpu_device_skip_hw_access(adev))
 		return 0;
 
 	if ((reg * 4) < adev->rmmio_size) {
@@ -377,7 +405,7 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
  */
 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
 {
-	if (adev->in_pci_err_recovery)
+	if (amdgpu_device_skip_hw_access(adev))
 		return 0;
 
 	if (offset < adev->rmmio_size)
@@ -402,7 +430,7 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
  */
 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
 {
-	if (adev->in_pci_err_recovery)
+	if (amdgpu_device_skip_hw_access(adev))
 		return;
 
 	if (offset < adev->rmmio_size)
@@ -425,7 +453,7 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
 			uint32_t reg, uint32_t v,
 			uint32_t acc_flags)
 {
-	if (adev->in_pci_err_recovery)
+	if (amdgpu_device_skip_hw_access(adev))
 		return;
 
 	if ((reg * 4) < adev->rmmio_size) {
@@ -452,7 +480,7 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
 			     uint32_t reg, uint32_t v)
 {
-	if (adev->in_pci_err_recovery)
+	if (amdgpu_device_skip_hw_access(adev))
 		return;
 
 	if (amdgpu_sriov_fullaccess(adev) &&
@@ -475,7 +503,7 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
  */
 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
 {
-	if (adev->in_pci_err_recovery)
+	if (amdgpu_device_skip_hw_access(adev))
 		return 0;
 
 	if ((reg * 4) < adev->rio_mem_size)
@@ -497,7 +525,7 @@ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
  */
 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
 {
-	if (adev->in_pci_err_recovery)
+	if (amdgpu_device_skip_hw_access(adev))
 		return;
 
 	if ((reg * 4) < adev->rio_mem_size)
@@ -519,7 +547,7 @@ void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
  */
 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
 {
-	if (adev->in_pci_err_recovery)
+	if (amdgpu_device_skip_hw_access(adev))
 		return 0;
 
 	if (index < adev->doorbell.num_doorbells) {
@@ -542,7 +570,7 @@ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
  */
 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
 {
-	if (adev->in_pci_err_recovery)
+	if (amdgpu_device_skip_hw_access(adev))
 		return;
 
 	if (index < adev->doorbell.num_doorbells) {
@@ -563,7 +591,7 @@ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
  */
 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
 {
-	if (adev->in_pci_err_recovery)
+	if (amdgpu_device_skip_hw_access(adev))
 		return 0;
 
 	if (index < adev->doorbell.num_doorbells) {
@@ -586,7 +614,7 @@ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
  */
 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
 {
-	if (adev->in_pci_err_recovery)
+	if (amdgpu_device_skip_hw_access(adev))
 		return;
 
 	if (index < adev->doorbell.num_doorbells) {
@@ -610,10 +638,13 @@ u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
 				u32 pcie_index, u32 pcie_data,
 				u32 reg_addr)
 {
-	unsigned long flags;
-	u32 r;
 	void __iomem *pcie_index_offset;
 	void __iomem *pcie_data_offset;
+	unsigned long flags;
+	u32 r;
+
+	if (amdgpu_device_skip_hw_access(adev))
+		return 0;
 
 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
@@ -641,10 +672,13 @@ u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
 				  u32 pcie_index, u32 pcie_data,
 				  u32 reg_addr)
 {
-	unsigned long flags;
-	u64 r;
 	void __iomem *pcie_index_offset;
 	void __iomem *pcie_data_offset;
+	unsigned long flags;
+	u64 r;
+
+	if (amdgpu_device_skip_hw_access(adev))
+		return 0;
 
 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
@@ -677,9 +711,12 @@ void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
 				 u32 pcie_index, u32 pcie_data,
 				 u32 reg_addr, u32 reg_data)
 {
-	unsigned long flags;
 	void __iomem *pcie_index_offset;
 	void __iomem *pcie_data_offset;
+	unsigned long flags;
+
+	if (amdgpu_device_skip_hw_access(adev))
+		return;
 
 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
@@ -706,9 +743,12 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
 				   u32 pcie_index, u32 pcie_data,
 				   u32 reg_addr, u64 reg_data)
 {
-	unsigned long flags;
 	void __iomem *pcie_index_offset;
 	void __iomem *pcie_data_offset;
+	unsigned long flags;
+
+	if (amdgpu_device_skip_hw_access(adev))
+		return;
 
 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
-- 
2.25.1
_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx