When the GPU recovery thread is performing a GPU reset, it is unsafe for
other threads to access the hardware concurrently; such racing accesses
can cause the GPU reset to hang randomly.
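In outline, the locking scheme added here works as sketched below. This is a
condensed illustration drawn from the hunks in the diff that follows, not
literal code from either file; it assumes the recovery path already takes
reset_sem for write before this point, as the existing
amdgpu_device_lock_adev()/amdgpu_device_unlock_adev() pair does.

  /* Recovery side (amdgpu_device_lock_adev()/amdgpu_device_unlock_adev()):
   * while holding reset_sem for write, also record which thread owns the
   * recovery so its own register accesses can be told apart later.
   */
  adev->recovery_thread = current_thread_info();
  /* ... perform the GPU reset ... */
  adev->recovery_thread = NULL;
  up_write(&adev->reset_sem);

  /* Register-write side (e.g. amdgpu_mm_wreg8()): take reset_sem for read
   * around the hardware access, unless we are the recovery thread itself
   * or running in IRQ context.
   */
  locked = likely(!amdgpu_in_recovery_thread(adev)) & !in_irq();
  if (locked)
          down_read(&adev->reset_sem);
  /* ... write the register ... */
  if (locked)
          up_read(&adev->reset_sem);

Skipping the read lock in the recovery thread avoids deadlocking against the
write lock that thread already holds, and skipping it in IRQ context avoids
sleeping on a semaphore in atomic context.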
Signed-off-by: Dennis Li <Dennis.Li@xxxxxxx>

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 1624c2bc8285..c71d3bba5f69 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1033,6 +1033,7 @@ struct amdgpu_device {
 	atomic_t in_gpu_reset;
 	enum pp_mp1_state mp1_state;
 	struct rw_semaphore reset_sem;
+	struct thread_info *recovery_thread;
 	struct amdgpu_doorbell_index doorbell_index;
 
 	struct mutex notifier_lock;
@@ -1385,4 +1386,13 @@ static inline int amdgpu_in_reset(struct amdgpu_device *adev)
 {
 	return atomic_read(&adev->in_gpu_reset);
 }
+
+static inline bool amdgpu_in_recovery_thread(struct amdgpu_device *adev)
+{
+	if (unlikely(adev->recovery_thread != NULL) &&
+	    adev->recovery_thread == current_thread_info())
+		return true;
+
+	return false;
+}
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 71805dfd9e25..7c17a5468d43 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -401,13 +401,22 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
  */
 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
 {
+	bool locked;
+
 	if (adev->in_pci_err_recovery)
 		return;
 
+	locked = likely(!amdgpu_in_recovery_thread(adev)) & !in_irq();
+	if (locked)
+		down_read(&adev->reset_sem);
+
 	if (offset < adev->rmmio_size)
 		writeb(value, adev->rmmio + offset);
 	else
 		BUG();
+
+	if (locked)
+		up_read(&adev->reset_sem);
 }
 
 /**
@@ -424,15 +433,19 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
 			uint32_t reg, uint32_t v,
 			uint32_t acc_flags)
 {
+	bool locked;
+
 	if (adev->in_pci_err_recovery)
 		return;
 
+	locked = likely(!amdgpu_in_recovery_thread(adev)) & !in_irq();
+	if (locked)
+		down_read(&adev->reset_sem);
+
 	if ((reg * 4) < adev->rmmio_size) {
 		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
-		    amdgpu_sriov_runtime(adev) &&
-		    down_read_trylock(&adev->reset_sem)) {
+		    amdgpu_sriov_runtime(adev)) {
 			amdgpu_kiq_wreg(adev, reg, v);
-			up_read(&adev->reset_sem);
 		} else {
 			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 		}
@@ -440,6 +453,9 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
 		adev->pcie_wreg(adev, reg * 4, v);
 	}
 
+	if (locked)
+		up_read(&adev->reset_sem);
+
 	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
 }
 
@@ -451,9 +467,15 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
 			     uint32_t reg, uint32_t v)
 {
+	bool locked;
+
 	if (adev->in_pci_err_recovery)
 		return;
 
+	locked = likely(!amdgpu_in_recovery_thread(adev)) & !in_irq();
+	if (locked)
+		down_read(&adev->reset_sem);
+
 	if (amdgpu_sriov_fullaccess(adev) &&
 	    adev->gfx.rlc.funcs &&
 	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
@@ -462,6 +484,9 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
 	} else {
 		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 	}
+
+	if (locked)
+		up_read(&adev->reset_sem);
 }
 
 /**
@@ -496,15 +521,24 @@ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
  */
 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
 {
+	bool locked;
+
 	if (adev->in_pci_err_recovery)
 		return;
 
+	locked = likely(!amdgpu_in_recovery_thread(adev)) & !in_irq();
+	if (locked)
+		down_read(&adev->reset_sem);
+
 	if ((reg * 4) < adev->rio_mem_size)
 		iowrite32(v, adev->rio_mem + (reg * 4));
 	else {
 		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
 		iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
 	}
+
+	if (locked)
+		up_read(&adev->reset_sem);
 }
 
 /**
@@ -679,6 +713,11 @@ void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
 	unsigned long flags;
 	void __iomem *pcie_index_offset;
 	void __iomem *pcie_data_offset;
+	bool locked;
+
+	locked = likely(!amdgpu_in_recovery_thread(adev)) & !in_irq();
+	if (locked)
+		down_read(&adev->reset_sem);
 
 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
@@ -689,6 +728,9 @@ void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
 	writel(reg_data, pcie_data_offset);
 	readl(pcie_data_offset);
 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
+
+	if (locked)
+		up_read(&adev->reset_sem);
 }
 
 /**
@@ -708,6 +750,11 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
 	unsigned long flags;
 	void __iomem *pcie_index_offset;
 	void __iomem *pcie_data_offset;
+	bool locked;
+
+	locked = likely(!amdgpu_in_recovery_thread(adev)) & !in_irq();
+	if (locked)
+		down_read(&adev->reset_sem);
 
 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
@@ -724,6 +771,9 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
 	writel((u32)(reg_data >> 32), pcie_data_offset);
 	readl(pcie_data_offset);
 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
+
+	if (locked)
+		up_read(&adev->reset_sem);
 }
 
 /**
@@ -4459,6 +4509,8 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
 		break;
 	}
 
+	adev->recovery_thread = current_thread_info();
+
 	return true;
 }
 
@@ -4467,6 +4519,7 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
 	amdgpu_vf_error_trans_all(adev);
 	adev->mp1_state = PP_MP1_STATE_NONE;
 	atomic_set(&adev->in_gpu_reset, 0);
+	adev->recovery_thread = NULL;
 	up_write(&adev->reset_sem);
 }
 
@@ -4609,7 +4662,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	dev_info(adev->dev, "GPU %s begin!\n",
 		need_emergency_restart ? "jobs stop":"reset");
 
-
 	/*
 	 * Here we trylock to avoid chain of resets executing from
 	 * either trigger by jobs on different adevs in XGMI hive or jobs on
-- 
2.17.1
_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx