[AMD Official Use Only - Internal Distribution Only]

Hi Andrey,

RE: Isn't adev->reset_sem non-recursive? How does this work when you try to access registers from within the GPU reset thread while adev->reset_sem is already write-locked by amdgpu_device_lock_adev earlier in the same thread?

Deli: down_read_trylock will fail in this case; it returns false immediately and does not take adev->reset_sem. In the GPU reset thread we should use MMIO instead of KIQ to access registers.

Best Regards
Dennis Li

-----Original Message-----
From: Grodzovsky, Andrey <Andrey.Grodzovsky@xxxxxxx>
Sent: Tuesday, September 1, 2020 9:40 AM
To: Li, Dennis <Dennis.Li@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Kuehling, Felix <Felix.Kuehling@xxxxxxx>; Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Koenig, Christian <Christian.Koenig@xxxxxxx>
Subject: Re: [PATCH] drm/amdgpu: block ring buffer access during GPU recovery

On 8/31/20 9:17 PM, Dennis Li wrote:
> When the GPU is in reset, its status isn't stable and the ring buffer also
> needs to be reset when resuming. Therefore the driver should protect the
> GPU recovery thread from ring buffer accesses by other threads; otherwise
> the GPU will randomly hang during recovery.
>
> Signed-off-by: Dennis Li <Dennis.Li@xxxxxxx>
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 172dc47b7f39..8db56a22cd1b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -319,8 +319,13 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
>  {
>  	uint32_t ret;
>
> -	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
> -		return amdgpu_kiq_rreg(adev, reg);
> +	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
> +	    amdgpu_sriov_runtime(adev) &&
> +	    down_read_trylock(&adev->reset_sem)) {
> +		ret = amdgpu_kiq_rreg(adev, reg);
> +		up_read(&adev->reset_sem);
> +		return ret;
> +	}

Isn't adev->reset_sem non-recursive? How does this work when you try to access registers from within the GPU reset thread while adev->reset_sem is already write-locked by amdgpu_device_lock_adev earlier in the same thread?
Andrey

>
>  	if ((reg * 4) < adev->rmmio_size)
>  		ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
> @@ -332,6 +337,7 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
>  		ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
>  		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
>  	}
> +
>  	trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
>  	return ret;
>  }
> @@ -407,8 +413,13 @@ void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg,
>  void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
>  		    uint32_t acc_flags)
>  {
> -	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
> -		return amdgpu_kiq_wreg(adev, reg, v);
> +	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
> +	    amdgpu_sriov_runtime(adev) &&
> +	    down_read_trylock(&adev->reset_sem)) {
> +		amdgpu_kiq_wreg(adev, reg, v);
> +		up_read(&adev->reset_sem);
> +		return;
> +	}
>
>  	amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index ad9ad622ccce..4ea2a065daa9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -287,7 +287,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>  	 */
>  	if (adev->gfx.kiq.ring.sched.ready &&
>  	    (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
> -	    !amdgpu_in_reset(adev)) {
> +	    down_read_trylock(&adev->reset_sem)) {
>
>  		struct amdgpu_vmhub *hub = &adev->vmhub[vmhub];
>  		const unsigned eng = 17;
> @@ -297,6 +297,8 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>
>  		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
>  				1 << vmid);
> +
> +		up_read(&adev->reset_sem);
>  		return;
>  	}
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index e1a0ae327cf5..33b7cf1c79ec 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -501,12 +501,13 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>  	 */
>  	if (adev->gfx.kiq.ring.sched.ready &&
>  	    (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
> -	    !amdgpu_in_reset(adev)) {
> +	    down_read_trylock(&adev->reset_sem)) {
>  		uint32_t req = hub->vm_inv_eng0_req + hub->eng_distance * eng;
>  		uint32_t ack = hub->vm_inv_eng0_ack + hub->eng_distance * eng;
>
>  		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
>  						   1 << vmid);
> +		up_read(&adev->reset_sem);
>  		return;
>  	}
>
> @@ -599,7 +600,8 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>  	if (amdgpu_in_reset(adev))
>  		return -EIO;
>
> -	if (ring->sched.ready) {
> +	if (ring->sched.ready &&
> +	    down_read_trylock(&adev->reset_sem)) {
>  		/* Vega20+XGMI caches PTEs in TC and TLB. Add a
>  		 * heavy-weight TLB flush (type 2), which flushes
>  		 * both. Due to a race condition with concurrent
> @@ -626,6 +628,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>  		if (r) {
>  			amdgpu_ring_undo(ring);
>  			spin_unlock(&adev->gfx.kiq.ring_lock);
> +			up_read(&adev->reset_sem);
>  			return -ETIME;
>  		}
>
> @@ -634,9 +637,10 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>  		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>  		if (r < 1) {
>  			dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
> +			up_read(&adev->reset_sem);
>  			return -ETIME;
>  		}
> -
> +		up_read(&adev->reset_sem);
>  		return 0;
>  	}

_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
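
[Editor's note] For readers following the locking discussion above, here is a minimal userspace sketch of the trylock-or-fallback pattern Dennis describes. It uses POSIX rwlocks only as a stand-in for the kernel's rw_semaphore (down_write()/down_read_trylock() on adev->reset_sem); the fake_rreg() helper and the register number are made up purely for illustration. Because a try-read-lock never blocks, the thread that already holds the lock for writing during reset simply takes the MMIO branch, which is why the recursion Andrey asks about does not arise.

/*
 * Userspace analogy for the reset_sem pattern in the patch above.
 * POSIX rwlocks stand in for the kernel rw_semaphore; all names here
 * are illustrative, not the driver's real API.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t reset_sem = PTHREAD_RWLOCK_INITIALIZER;

/* Pretend register read: KIQ path when no reset is running, MMIO otherwise. */
static unsigned int fake_rreg(unsigned int reg)
{
	if (pthread_rwlock_tryrdlock(&reset_sem) == 0) {
		printf("reg 0x%x: KIQ path (no reset in progress)\n", reg);
		pthread_rwlock_unlock(&reset_sem);
	} else {
		printf("reg 0x%x: MMIO fallback (reset_sem is write-locked)\n", reg);
	}
	return 0;
}

int main(void)
{
	fake_rreg(0x10);                        /* trylock succeeds: KIQ path */

	pthread_rwlock_wrlock(&reset_sem);      /* "GPU reset" begins */
	fake_rreg(0x10);                        /* trylock fails at once: MMIO fallback */
	pthread_rwlock_unlock(&reset_sem);      /* reset done */

	fake_rreg(0x10);                        /* back to the KIQ path */
	return 0;
}

Build with "cc -pthread". On the second call pthread_rwlock_tryrdlock() returns EBUSY because the write lock is held, mirroring down_read_trylock() returning 0 (failure) while the reset thread owns adev->reset_sem, so the register access falls through to the MMIO path instead of deadlocking.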