Hi Chris, I have removed DRM_WARN_ONCE. I think we can do the mmhub patch first. It is for ticket: http://ontrack-internal.amd.com/browse/SWDEV-201459 According to Yang, Zilong's comments on this issue, the bug does not apply to GFXHUB, so GFXHUB doesn't need the w/a. I'll look into the root cause of the gfxhub hang with Lisa afterwards. Could you please help review my new patch (with DRM_WARN_ONCE removed)? BR, Changfeng. -----Original Message----- From: Christian König <ckoenig.leichtzumerken@xxxxxxxxx> Sent: Wednesday, November 20, 2019 7:27 PM To: Zhu, Changfeng <Changfeng.Zhu@xxxxxxx>; Koenig, Christian <Christian.Koenig@xxxxxxx>; Xiao, Jack <Jack.Xiao@xxxxxxx>; Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Huang, Ray <Ray.Huang@xxxxxxx>; Huang, Shimmer <Xinmei.Huang@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx Subject: Re: [PATCH 2/2] drm/amdgpu: invalidate mmhub semphore workaround in gmc9/gmc10 Am 20.11.19 um 10:44 schrieb Changfeng.Zhu: > From: changzhu <Changfeng.Zhu@xxxxxxx> > > It may lose gpuvm invalidate acknowldege state across power-gating off > cycle. To avoid this issue in gmc9/gmc10 invalidation, add semaphore > acquire before invalidation and semaphore release after invalidation. > > After adding semaphore acquire before invalidation, the semaphore > register become read-only if another process try to acquire semaphore. > Then it will not be able to release this semaphore. Then it may cause > deadlock problem. If this deadlock problem happens, it needs a > semaphore firmware fix. Please remove the DRM_WARN_ONCE, that looks like overkill to me. And I'm not sure how urgent that issue here is. We could also wait a few more days and see if the hw guys figure out why this lockups on the GFX ring. Regards, Christian.
> > Change-Id: I9942a2f451265c1f1038ccfe2f70042c7c8118af > Signed-off-by: changzhu <Changfeng.Zhu@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 49 ++++++++++++++++++++++++++ > drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 49 ++++++++++++++++++++++++++ > drivers/gpu/drm/amd/amdgpu/soc15.h | 4 +-- > 3 files changed, 100 insertions(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c > b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c > index af2615ba52aa..685d0d5ef31e 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c > @@ -234,6 +234,24 @@ static void gmc_v10_0_flush_vm_hub(struct amdgpu_device *adev, uint32_t vmid, > const unsigned eng = 17; > unsigned int i; > > + spin_lock(&adev->gmc.invalidate_lock); > + /* > + * It may lose gpuvm invalidate acknowldege state across power-gating > + * off cycle, add semaphore acquire before invalidation and semaphore > + * release after invalidation to avoid entering power gated state > + * to WA the Issue > + */ > + for (i = 0; i < adev->usec_timeout; i++) { > + /* a read return value of 1 means semaphore acuqire */ > + tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng); > + if (tmp & 0x1) > + break; > + udelay(1); > + } > + > + if (i >= adev->usec_timeout) > + DRM_ERROR("Timeout waiting for sem acquire in VM flush!\n"); > + > WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp); > > /* > @@ -253,6 +271,14 @@ static void gmc_v10_0_flush_vm_hub(struct amdgpu_device *adev, uint32_t vmid, > udelay(1); > } > > + /* > + * add semaphore release after invalidation, > + * write with 0 means semaphore release > + */ > + WREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng, 0); > + > + spin_unlock(&adev->gmc.invalidate_lock); > + > if (i < adev->usec_timeout) > return; > > @@ -338,6 +364,21 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring, > uint32_t req = gmc_v10_0_get_invalidate_req(vmid, 0); > unsigned eng = ring->vm_inv_eng; > > + /* > + * It may lose gpuvm 
invalidate acknowldege state across power-gating > + * off cycle, add semaphore acquire before invalidation and semaphore > + * release after invalidation to avoid entering power gated state > + * to WA the Issue > + */ > + > + /* a read return value of 1 means semaphore acuqire */ > + if (ring->funcs->vmhub == AMDGPU_MMHUB_0 || > + ring->funcs->vmhub == AMDGPU_MMHUB_1) { > + amdgpu_ring_emit_reg_wait(ring, > + hub->vm_inv_eng0_sem + eng, 0x1, 0x1); > + DRM_WARN_ONCE("Adding semaphore may cause deadlock and it needs firmware fix\n"); > + } > + > amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 + (2 * vmid), > lower_32_bits(pd_addr)); > > @@ -348,6 +389,14 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring, > hub->vm_inv_eng0_ack + eng, > req, 1 << vmid); > > + /* > + * add semaphore release after invalidation, > + * write with 0 means semaphore release > + */ > + if (ring->funcs->vmhub == AMDGPU_MMHUB_0 || > + ring->funcs->vmhub == AMDGPU_MMHUB_1) > + amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_sem + eng, 0); > + > return pd_addr; > } > > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > index 1ae59af7836a..c4118cbb0fbe 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > @@ -456,6 +456,24 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid, > } > > spin_lock(&adev->gmc.invalidate_lock); > + > + /* > + * It may lose gpuvm invalidate acknowldege state across power-gating > + * off cycle, add semaphore acquire before invalidation and semaphore > + * release after invalidation to avoid entering power gated state > + * to WA the Issue > + */ > + for (j = 0; j < adev->usec_timeout; j++) { > + /* a read return value of 1 means semaphore acuqire */ > + tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng); > + if (tmp & 0x1) > + break; > + udelay(1); > + } > + > + if (j >= adev->usec_timeout) > + DRM_ERROR("Timeout waiting for sem 
acquire in VM flush!\n"); > + > WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp); > > /* > @@ -471,7 +489,15 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid, > break; > udelay(1); > } > + > + /* > + * add semaphore release after invalidation, > + * write with 0 means semaphore release > + */ > + WREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng, 0); > + > spin_unlock(&adev->gmc.invalidate_lock); > + > if (j < adev->usec_timeout) > return; > > @@ -486,6 +512,21 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring, > uint32_t req = gmc_v9_0_get_invalidate_req(vmid, 0); > unsigned eng = ring->vm_inv_eng; > > + /* > + * It may lose gpuvm invalidate acknowldege state across power-gating > + * off cycle, add semaphore acquire before invalidation and semaphore > + * release after invalidation to avoid entering power gated state > + * to WA the Issue > + */ > + > + /* a read return value of 1 means semaphore acuqire */ > + if (ring->funcs->vmhub == AMDGPU_MMHUB_0 || > + ring->funcs->vmhub == AMDGPU_MMHUB_1) { > + amdgpu_ring_emit_reg_wait(ring, > + hub->vm_inv_eng0_sem + eng, 0x1, 0x1); > + DRM_WARN_ONCE("Adding semaphore may cause deadlock and it needs firmware fix\n"); > + } > + > amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 + (2 * vmid), > lower_32_bits(pd_addr)); > > @@ -496,6 +537,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring, > hub->vm_inv_eng0_ack + eng, > req, 1 << vmid); > > + /* > + * add semaphore release after invalidation, > + * write with 0 means semaphore release > + */ > + if (ring->funcs->vmhub == AMDGPU_MMHUB_0 || > + ring->funcs->vmhub == AMDGPU_MMHUB_1) > + amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_sem + eng, 0); > + > return pd_addr; > } > > diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.h > b/drivers/gpu/drm/amd/amdgpu/soc15.h > index 9af6c6ffbfa2..57af489a5de3 100644 > --- a/drivers/gpu/drm/amd/amdgpu/soc15.h > +++ b/drivers/gpu/drm/amd/amdgpu/soc15.h > @@ -28,8 +28,8 @@ > 
#include "nbio_v7_0.h" > #include "nbio_v7_4.h" > > -#define SOC15_FLUSH_GPU_TLB_NUM_WREG 4 > -#define SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT 1 > +#define SOC15_FLUSH_GPU_TLB_NUM_WREG 6 > +#define SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT 3 > > extern const struct amd_ip_funcs soc15_common_ip_funcs; > _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx