Hi Christian, I only found the issue on SRIOV environment, does it also occurs on other cases, for example, thanks. Best Wishes, Emily Deng > -----Original Message----- > From: Christian König [mailto:ckoenig.leichtzumerken at gmail.com] > Sent: Tuesday, March 27, 2018 4:27 PM > To: Deng, Emily <Emily.Deng at amd.com>; Koenig, Christian > <Christian.Koenig at amd.com>; amd-gfx at lists.freedesktop.org > Cc: Liu, Monk <Monk.Liu at amd.com> > Subject: Re: [PATCH] drm/amdgpu: fix a kcq hang issue for SRIOV > > Hi Emily, > > Am 27.03.2018 um 10:23 schrieb Deng, Emily: > > Hi Christian, > > I think the root cause has been found out. It is the hardware > > restriction that couldn't do world switch between invalidate request, > > and ack. We couldn't control world switch in guest driver, but could replace > the origin two command with one command to avoid world switch. > > Unfortunately the problem was reproduced completely without world switch. > > So the world switch is not the root cause, but only another trigger to the > issue. > > We need to find the root cause or otherwise will not fix the real problem but > only mitigate the effects. > > Regards, > Christian. > > > > > For the multiple times writing, it is also the same root cause, > > when the world switch happened between the invalidate request, and > > ack, it will have problem. So we could give more chance for SRIOV to > > do the invalidate request to workaround the case that world switch > > happens between invalidate request, and ack > > > > Best Wishes, > > Emily Deng > > > >> -----Original Message----- > >> From: Christian König [mailto:ckoenig.leichtzumerken at gmail.com] > >> Sent: Tuesday, March 27, 2018 3:48 PM > >> To: Deng, Emily <Emily.Deng at amd.com>; amd-gfx at lists.freedesktop.org > >> Cc: Liu, Monk <Monk.Liu at amd.com> > >> Subject: Re: [PATCH] drm/amdgpu: fix a kcq hang issue for SRIOV > >> > >> Am 27.03.2018 um 07:58 schrieb Emily Deng: > >>> issue: > >>> the vmflush in KCQ could be preempted (not like GFX ring which > >>> doesn't allow preemption in ring buffer) and this lead to vm flush > >>> fail when there is a world switch during the vm flush procedure > >>> (between write invalidate request and query invalidate ack) > >>> > >>> fix: > >>> separate vm flush for gfx and compute ring, and use the new format > >>> command in compute's vm flush which use only one package so no > >>> preemption could allowed > >> NAK, as already discussed multiple times now that only circumvents > >> the problem, but not really fixes it. > >> > >> Just executing the "amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req > >> + eng, req);" multiple times has the same effect and we need to figure out > why. > >> > >> Regards, > >> Christian. > >> > >>> Signed-off-by: Monk Liu <Monk.Liu at amd.com> > >>> Signed-off-by: Emily Deng <Emily.Deng at amd.com> > >>> --- > >>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + > >>> drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 2 ++ > >>> drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 10 +++++++++- > >>> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 18 +++++++++++++----- > >>> 4 files changed, 25 insertions(+), 6 deletions(-) > >>> > >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > >>> index a7e2229..986659f 100644 > >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > >>> @@ -1790,6 +1790,7 @@ amdgpu_get_sdma_instance(struct > >> amdgpu_ring *ring) > >>> #define amdgpu_ring_emit_rreg(r, d) (r)->funcs->emit_rreg((r), (d)) > >>> #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), > (v)) > >>> #define amdgpu_ring_emit_reg_wait(r, d, v, m) > >>> (r)->funcs->emit_reg_wait((r), (d), (v), (m)) > >>> +#define amdgpu_ring_emit_reg_wait1(r, d0, d1, v, m) > >>> +(r)->funcs->emit_reg_wait1((r), (d0), (d1), (v), (m)) > >>> #define amdgpu_ring_emit_tmz(r, b) (r)->funcs->emit_tmz((r), (b)) > >>> #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib))) > >>> #define amdgpu_ring_init_cond_exec(r) > >>> (r)->funcs->init_cond_exec((r)) diff --git > >>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h > >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h > >>> index 1d0d250..d85df5d 100644 > >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h > >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h > >>> @@ -152,6 +152,8 @@ struct amdgpu_ring_funcs { > >>> void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, > >>> uint32_t > >> val); > >>> void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg, > >>> uint32_t val, uint32_t mask); > >>> + void (*emit_reg_wait1)(struct amdgpu_ring *ring, uint32_t reg0, > >>> + uint32_t reg1, uint32_t val, uint32_t mask); > >>> void (*emit_tmz)(struct amdgpu_ring *ring, bool start); > >>> /* priority functions */ > >>> void (*set_priority) (struct amdgpu_ring *ring, diff --git > >>> a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > >>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > >>> index 1ae3de1..509c9d2 100644 > >>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > >>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > >>> @@ -4078,6 +4078,13 @@ static void > >>> gfx_v9_0_ring_emit_reg_wait(struct > >> amdgpu_ring *ring, uint32_t reg, > >>> gfx_v9_0_wait_reg_mem(ring, 0, 0, 0, reg, 0, val, mask, 0x20); > >>> } > >>> > >>> +static void gfx_v9_0_ring_emit_reg_wait_compute(struct amdgpu_ring > >> *ring, > >>> + uint32_t reg0, uint32_t reg1, > >>> + uint32_t val, uint32_t mask) > >>> +{ > >>> + gfx_v9_0_wait_reg_mem(ring, 0, 0, 1, reg0, reg1, val, mask, 0x20); > >>> +} > >>> + > >>> static void gfx_v9_0_set_gfx_eop_interrupt_state(struct > >>> amdgpu_device > >> *adev, > >>> enum > >> amdgpu_interrupt_state state) > >>> { > >>> @@ -4415,7 +4422,7 @@ static const struct amdgpu_ring_funcs > >> gfx_v9_0_ring_funcs_compute = { > >>> 7 + /* gfx_v9_0_ring_emit_hdp_flush */ > >>> 5 + /* hdp invalidate */ > >>> 7 + /* gfx_v9_0_ring_emit_pipeline_sync */ > >>> - SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 + > >>> + (SOC15_FLUSH_GPU_TLB_NUM_WREG - 1) * 5 + > >>> SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 + > >>> 2 + /* gfx_v9_0_ring_emit_vm_flush */ > >>> 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm > >> fence > >>> */ @@ -4433,6 +4440,7 @@ static const struct amdgpu_ring_funcs > >> gfx_v9_0_ring_funcs_compute = { > >>> .set_priority = gfx_v9_0_ring_set_priority_compute, > >>> .emit_wreg = gfx_v9_0_ring_emit_wreg, > >>> .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait, > >>> + .emit_reg_wait1 = gfx_v9_0_ring_emit_reg_wait_compute, > >>> }; > >>> > >>> static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = { > >>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > >>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > >>> index e687363..968447d 100644 > >>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > >>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > >>> @@ -385,11 +385,19 @@ static uint64_t > >> gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring, > >>> amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid), > >>> upper_32_bits(pd_addr)); > >>> > >>> - amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req); > >>> - > >>> - /* wait for the invalidate to complete */ > >>> - amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng, > >>> - 1 << vmid, 1 << vmid); > >>> + /* The world switch cannot be allowed to occur while > >>> + some invalidation controller code is waiting for an ack. > >>> + To workaround the hardware restriction, replace the original > >>> + two command with one command for compute ring */ > >>> + if (ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE && > >> amdgpu_sriov_vf(adev)) { > >>> + amdgpu_ring_emit_reg_wait1(ring, hub->vm_inv_eng0_req > >> + eng, > >>> + hub->vm_inv_eng0_ack + eng, req, 1 << > >> vmid); > >>> + } else { > >>> + amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, > >> req); > >>> + /* wait for the invalidate to complete */ > >>> + amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + > >> eng, > >>> + 1 << vmid, 1 << vmid); > >>> + } > >>> > >>> return pd_addr; > >>> } > > _______________________________________________ > > amd-gfx mailing list > > amd-gfx at lists.freedesktop.org > > https://lists.freedesktop.org/mailman/listinfo/amd-gfx