I have decided to cancel this fix: although it avoids the SAVE_VF failure, the next round of driver loading will fail on KIQ unless a VF FLR is inserted before driver loading. So I have another workaround for it, which allocates the MQD of the KIQ in the VRAM domain. It has already been sent out for review, thanks! -----Original Message----- From: Deucher, Alexander Sent: 2017年9月21日 14:58 To: Liu, Monk <Monk.Liu at amd.com>; amd-gfx at lists.freedesktop.org Cc: Chen, Horace <Horace.Chen at amd.com>; Liu, Monk <Monk.Liu at amd.com> Subject: RE: [PATCH] drm/amdgpu/sriov:fix driver unloading bug > -----Original Message----- > From: amd-gfx [mailto:amd-gfx-bounces at lists.freedesktop.org] On Behalf > Of Monk Liu > Sent: Wednesday, September 20, 2017 5:28 AM > To: amd-gfx at lists.freedesktop.org > Cc: Chen, Horace; Liu, Monk > Subject: [PATCH] drm/amdgpu/sriov:fix driver unloading bug > > Fix hypervisor save_vf fail issue which hit after guest drv unloaded. > > the reason of SAVE_VF will fail is: > KIQ and KCQ still active after drv unloaded, RLCV will command CPC to > run MQD (to save current status) on all queues if they are still > active > > the fix is to unmap KCQ and disable KIQ/HIQ in gfx fini, thus we > implement new routines to unmap KCQ for gfx8/9, and we disable KIQ/HIQ > in RLC registers thus RLCV won't initiate CPC do the MQD commands. 
> > Change-Id: I95eb650f4bd16b639ca6e773efce80abb5e04641 > Signed-off-by: Horace Chen <horace.chen at amd.com> > Signed-off-by: Monk Liu <Monk.Liu at amd.com> > --- > drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 63 > ++++++++++++++++++++++++++++++++++ > drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 64 > +++++++++++++++++++++++++++++++++++ > 2 files changed, 127 insertions(+) > > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > index 0c4a3b8..14be0bd 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > @@ -5034,6 +5034,57 @@ static int gfx_v8_0_hw_init(void *handle) > return r; > } > > +static int gfx_v8_0_kcq_disable(struct amdgpu_ring *kiq_ring,struct > amdgpu_ring *ring) > +{ > + struct amdgpu_device *adev = kiq_ring->adev; > + uint32_t scratch, tmp = 0; > + int r, i; > + > + r = amdgpu_gfx_scratch_get(adev, &scratch); > + if (r) { > + DRM_ERROR("Failed to get scratch reg (%d).\n", r); > + return r; > + } > + WREG32(scratch, 0xCAFEDEAD); > + > + r = amdgpu_ring_alloc(kiq_ring, 10); > + if (r) { > + DRM_ERROR("Failed to lock KIQ (%d).\n", r); > + amdgpu_gfx_scratch_free(adev, scratch); > + return r; > + } > + > + /* unmap queues */ > + amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_UNMAP_QUEUES, > 4)); > + amdgpu_ring_write(kiq_ring, /* Q_sel: 0, vmid: 0, engine: 0, num_Q: > 1 */ > + > PACKET3_UNMAP_QUEUES_ACTION(1) | /* RESET_QUEUES */ > + > PACKET3_UNMAP_QUEUES_QUEUE_SEL(0) | > + > PACKET3_UNMAP_QUEUES_ENGINE_SEL(0) | > + > PACKET3_UNMAP_QUEUES_NUM_QUEUES(1)); > + amdgpu_ring_write(kiq_ring, > PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(ring->doorbell_index)); > + amdgpu_ring_write(kiq_ring, 0); > + amdgpu_ring_write(kiq_ring, 0); > + amdgpu_ring_write(kiq_ring, 0); > + /* write to scratch for completion */ > + amdgpu_ring_write(kiq_ring, > PACKET3(PACKET3_SET_UCONFIG_REG, 1)); > + amdgpu_ring_write(kiq_ring, (scratch - > PACKET3_SET_UCONFIG_REG_START)); > + amdgpu_ring_write(kiq_ring, 
0xDEADBEEF); > + amdgpu_ring_commit(kiq_ring); > + > + for (i = 0; i < adev->usec_timeout; i++) { > + tmp = RREG32(scratch); > + if (tmp == 0xDEADBEEF) > + break; > + DRM_UDELAY(1); > + } > + if (i >= adev->usec_timeout) { > + DRM_ERROR("KCQ disabled failed > (scratch(0x%04X)=0x%08X)\n", scratch, tmp); > + r = -EINVAL; > + } > + amdgpu_gfx_scratch_free(adev, scratch); > + return r; > +} > + > static int gfx_v8_0_hw_fini(void *handle) { > struct amdgpu_device *adev = (struct amdgpu_device *)handle; @@ > -5041,6 +5092,18 @@ static int gfx_v8_0_hw_fini(void *handle) > amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0); > amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0); > if (amdgpu_sriov_vf(adev)) { > + uint32_t value; > + int i; > + > + /* disable KCQ to avoid CPC touch memory not valid anymore > */ > + for (i = 0; i < adev->gfx.num_compute_rings; i++) > + gfx_v8_0_kcq_disable(&adev->gfx.kiq.ring, &adev- > >gfx.compute_ring[i]); > + > + /* disable KIQ & HIQ */ > + value = RREG32(mmRLC_CP_SCHEDULERS); > + value = REG_SET_FIELD(value, RLC_CP_SCHEDULERS, > scheduler0, 0); > + value = REG_SET_FIELD(value, RLC_CP_SCHEDULERS, > scheduler1, 0); > + WREG32(mmRLC_CP_SCHEDULERS,value); Please make sure this won't break the KFD. IIRC, they shut down the HIQ already in KFD tear down. Same for gfx9. 
Alex > pr_debug("For SRIOV client, shouldn't do anything.\n"); > return 0; > } > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > index e2ae00d..f1f34a8 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > @@ -2895,6 +2895,58 @@ static int gfx_v9_0_hw_init(void *handle) > return r; > } > > +static int gfx_v9_0_kcq_disable(struct amdgpu_ring *kiq_ring,struct > amdgpu_ring *ring) > +{ > + struct amdgpu_device *adev = kiq_ring->adev; > + uint32_t scratch, tmp = 0; > + int r, i; > + > + r = amdgpu_gfx_scratch_get(adev, &scratch); > + if (r) { > + DRM_ERROR("Failed to get scratch reg (%d).\n", r); > + return r; > + } > + WREG32(scratch, 0xCAFEDEAD); > + > + r = amdgpu_ring_alloc(kiq_ring, 10); > + if (r) { > + DRM_ERROR("Failed to lock KIQ (%d).\n", r); > + amdgpu_gfx_scratch_free(adev, scratch); > + return r; > + } > + > + /* unmap queues */ > + amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_UNMAP_QUEUES, > 4)); > + amdgpu_ring_write(kiq_ring, /* Q_sel: 0, vmid: 0, engine: 0, num_Q: > 1 */ > + > PACKET3_UNMAP_QUEUES_ACTION(1) | /* RESET_QUEUES */ > + > PACKET3_UNMAP_QUEUES_QUEUE_SEL(0) | > + > PACKET3_UNMAP_QUEUES_ENGINE_SEL(0) | > + > PACKET3_UNMAP_QUEUES_NUM_QUEUES(1)); > + amdgpu_ring_write(kiq_ring, > PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(ring->doorbell_index)); > + amdgpu_ring_write(kiq_ring, 0); > + amdgpu_ring_write(kiq_ring, 0); > + amdgpu_ring_write(kiq_ring, 0); > + /* write to scratch for completion */ > + amdgpu_ring_write(kiq_ring, > PACKET3(PACKET3_SET_UCONFIG_REG, 1)); > + amdgpu_ring_write(kiq_ring, (scratch - > PACKET3_SET_UCONFIG_REG_START)); > + amdgpu_ring_write(kiq_ring, 0xDEADBEEF); > + amdgpu_ring_commit(kiq_ring); > + > + for (i = 0; i < adev->usec_timeout; i++) { > + tmp = RREG32(scratch); > + if (tmp == 0xDEADBEEF) > + break; > + DRM_UDELAY(1); > + } > + if (i >= adev->usec_timeout) { > + DRM_ERROR("KCQ disabled failed > (scratch(0x%04X)=0x%08X)\n", 
scratch, tmp); > + r = -EINVAL; > + } > + amdgpu_gfx_scratch_free(adev, scratch); > + return r; > +} > + > + > static int gfx_v9_0_hw_fini(void *handle) { > struct amdgpu_device *adev = (struct amdgpu_device *)handle; @@ > -2902,6 +2954,18 @@ static int gfx_v9_0_hw_fini(void *handle) > amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0); > amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0); > if (amdgpu_sriov_vf(adev)) { > + uint32_t value; > + int i; > + > + /* disable KCQ to avoid CPC touch memory not valid anymore > */ > + for (i = 0; i < adev->gfx.num_compute_rings; i++) > + gfx_v9_0_kcq_disable(&adev->gfx.kiq.ring, &adev- > >gfx.compute_ring[i]); > + > + /* disable KIQ & HIQ */ > + value = RREG32(mmRLC_CP_SCHEDULERS); > + value = REG_SET_FIELD(value, RLC_CP_SCHEDULERS, > scheduler0, 0); > + value = REG_SET_FIELD(value, RLC_CP_SCHEDULERS, > scheduler1, 0); > + WREG32(mmRLC_CP_SCHEDULERS,value); > pr_debug("For SRIOV client, shouldn't do anything.\n"); > return 0; > } > -- > 2.7.4 > > _______________________________________________ > amd-gfx mailing list > amd-gfx at lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/amd-gfx