I tried to write a patch based on the patch of Tuikov, Luben. Inspired by Luben, here is the patch: From 1980d8f1ed44fb9a84a5ea1f6e2edd2bc25c629a Mon Sep 17 00:00:00 2001 From: changzhu <Changfeng.Zhu@xxxxxxx> Date: Thu, 10 Oct 2019 11:02:33 +0800 Subject: [PATCH] drm/amdgpu: add dummy read by engines for some GCVM status registers The GRBM register interface is now capable of bursting 1 cycle per register wr->wr, wr->rd much faster than previous multicycle per transaction done interface. This has caused a problem where status registers requiring HW to update have a 1 cycle delay, due to the register update having to go through GRBM. SW may operate on an incorrect value if they write a register and immediately check the corresponding status register. Registers requiring HW to clear or set fields may be delayed by 1 cycle. For example, 1. write VM_INVALIDATE_ENG0_REQ mask = 5a 2. read VM_INVALIDATE_ENG0_ACK till the ack is same as the request mask = 5a a. HW will reset VM_INVALIDATE_ENG0_ACK = 0 until invalidation is complete 3. write VM_INVALIDATE_ENG0_REQ mask = 5a 4. read VM_INVALIDATE_ENG0_ACK till the ack is same as the request mask = 5a a. First read of VM_INVALIDATE_ENG0_ACK = 5a instead of 0 b. Second read of VM_INVALIDATE_ENG0_ACK = 0 because the remote GRBM h/w register takes one extra cycle to be cleared c. In this case, SW will see a false ACK if they exit on first read Affected registers (only GC variant) | Recommended Dummy Read --------------------------------------+---------------------------- VM_INVALIDATE_ENG*_ACK | VM_INVALIDATE_ENG*_REQ VM_L2_STATUS | VM_L2_STATUS VM_L2_PROTECTION_FAULT_STATUS | VM_L2_PROTECTION_FAULT_STATUS VM_L2_PROTECTION_FAULT_ADDR_HI/LO32 | VM_L2_PROTECTION_FAULT_ADDR_HI/LO32 VM_L2_IH_LOG_BUSY | VM_L2_IH_LOG_BUSY MC_VM_L2_PERFCOUNTER_HI/LO | MC_VM_L2_PERFCOUNTER_HI/LO ATC_L2_PERFCOUNTER_HI/LO | ATC_L2_PERFCOUNTER_HI/LO ATC_L2_PERFCOUNTER2_HI/LO | ATC_L2_PERFCOUNTER2_HI/LO It also needs dummy read by engines for these gc registers. 
Change-Id: Ie028f37eb789966d4593984bd661b248ebeb1ac3 Signed-off-by: changzhu <Changfeng.Zhu@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 5 +++++ drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 2 ++ drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 2 ++ drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 4 ++++ drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 18 ++++++++++++++++++ 5 files changed, 31 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c index 4b3f58dbf36f..c2fbf6087ecf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c @@ -392,6 +392,11 @@ void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring, uint32_t ref, uint32_t mask) { amdgpu_ring_emit_wreg(ring, reg0, ref); + + /* wait for a cycle to reset vm_inv_eng0_ack */ + if (ring->funcs->vmhub == AMDGPU_GFXHUB_0) + amdgpu_ring_emit_rreg(ring, reg0); + amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask); } diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index ef1975a5323a..104c47734316 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -5155,6 +5155,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = { .patch_cond_exec = gfx_v10_0_ring_emit_patch_cond_exec, .preempt_ib = gfx_v10_0_ring_preempt_ib, .emit_tmz = gfx_v10_0_ring_emit_tmz, + .emit_rreg = gfx_v10_0_ring_emit_rreg, .emit_wreg = gfx_v10_0_ring_emit_wreg, .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait, }; @@ -5188,6 +5189,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = { .test_ib = gfx_v10_0_ring_test_ib, .insert_nop = amdgpu_ring_insert_nop, .pad_ib = amdgpu_ring_generic_pad_ib, + .emit_rreg = gfx_v10_0_ring_emit_rreg, .emit_wreg = gfx_v10_0_ring_emit_wreg, .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait, }; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 
2f03bf533d41..d00b53de0fdc 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -6253,6 +6253,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = { .init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec, .patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec, .emit_tmz = gfx_v9_0_ring_emit_tmz, + .emit_rreg = gfx_v9_0_ring_emit_rreg, .emit_wreg = gfx_v9_0_ring_emit_wreg, .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait, .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait, @@ -6289,6 +6290,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = { .insert_nop = amdgpu_ring_insert_nop, .pad_ib = amdgpu_ring_generic_pad_ib, .set_priority = gfx_v9_0_ring_set_priority_compute, + .emit_rreg = gfx_v9_0_ring_emit_rreg, .emit_wreg = gfx_v9_0_ring_emit_wreg, .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait, .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait, diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index 3b00bce14cfb..dce6b651da1f 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -346,6 +346,10 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring, amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req); + /* wait for a cycle to reset vm_inv_eng0_ack */ + if (ring->funcs->vmhub == AMDGPU_GFXHUB_0) + amdgpu_ring_emit_rreg(ring, hub->vm_inv_eng0_req + eng); + /* wait for the invalidate to complete */ amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng, 1 << vmid, 1 << vmid); diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c index 3460c00f3eaa..baaa33467882 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c @@ -38,6 +38,7 @@ #include "navi10_sdma_pkt_open.h" #include "nbio_v2_3.h" #include "sdma_v5_0.h" +#include "nvd.h" MODULE_FIRMWARE("amdgpu/navi10_sdma.bin"); 
MODULE_FIRMWARE("amdgpu/navi10_sdma1.bin"); @@ -1147,6 +1148,22 @@ static void sdma_v5_0_ring_emit_vm_flush(struct amdgpu_ring *ring, amdgpu_gmc_emit_flush_gpu_tlb(ring, vmid, pd_addr); } +static void sdma_v5_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg) +{ + struct amdgpu_device *adev = ring->adev; + + amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4)); + amdgpu_ring_write(ring, 0 | /* src: register*/ + (5 << 8) | /* dst: memory */ + (1 << 20)); /* write confirm */ + amdgpu_ring_write(ring, reg); + amdgpu_ring_write(ring, 0); + amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr + + adev->virt.reg_val_offs * 4)); + amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr + + adev->virt.reg_val_offs * 4)); +} + static void sdma_v5_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg, uint32_t val) { @@ -1597,6 +1614,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = { .test_ib = sdma_v5_0_ring_test_ib, .insert_nop = sdma_v5_0_ring_insert_nop, .pad_ib = sdma_v5_0_ring_pad_ib, + .emit_rreg = sdma_v5_0_ring_emit_rreg, .emit_wreg = sdma_v5_0_ring_emit_wreg, .emit_reg_wait = sdma_v5_0_ring_emit_reg_wait, .init_cond_exec = sdma_v5_0_ring_init_cond_exec, -- 2.17.1 Could someone give some suggestions about it? BR, Changfeng. 
-----Original Message----- From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Huang, Ray Sent: Friday, October 25, 2019 5:26 PM To: Tuikov, Luben <Luben.Tuikov@xxxxxxx> Cc: Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Pelloux-prayer, Pierre-eric <Pierre-eric.Pelloux-prayer@xxxxxxx>; Koenig, Christian <Christian.Koenig@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx Subject: Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay On Thu, Oct 24, 2019 at 09:16:55PM +0000, Tuikov, Luben wrote: > The GRBM interface is now capable of bursting 1-cycle op per register, > a WRITE followed by another WRITE, or a WRITE followed by a READ--much > faster than previous multi-cycle per completed-transaction interface. > This causes a problem, whereby status registers requiring a read/write > by hardware, have a 1-cycle delay, due to the register update having > to go through GRBM interface. > > This patch adds this delay. > > A one cycle read op is added after updating the invalidate request and > before reading the invalidate-ACK status. > > See also commit > 534991731cb5fa94b5519957646cf849ca10d17d. 
> > Signed-off-by: Luben Tuikov <luben.tuikov@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++-- > drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 ++-- > drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++ > drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 8 ++++++++ > drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +- > 5 files changed, 22 insertions(+), 5 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > index ac43b1af69e3..0042868dbd53 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = { > 5 + /* COND_EXEC */ > 7 + /* PIPELINE_SYNC */ > SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 + > - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 + > + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 + > 2 + /* VM_FLUSH */ > 8 + /* FENCE for VM_FLUSH */ > 20 + /* GDS switch */ > @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = { > 5 + /* hdp invalidate */ > 7 + /* gfx_v10_0_ring_emit_pipeline_sync */ > SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 + > - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 + > + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 + > 2 + /* gfx_v10_0_ring_emit_vm_flush */ > 8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */ > .emit_ib_size = 7, /* gfx_v10_0_ring_emit_ib_compute */ > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > index 9fe95e7693d5..9a7a717208de 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = { > 5 + /* COND_EXEC */ > 7 + /* PIPELINE_SYNC */ > SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 + > - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 + > + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 + > 2 + /* VM_FLUSH */ > 8 + /* FENCE for VM_FLUSH */ > 20 + /* GDS switch */ > @@ -6271,7 
+6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = { > 5 + /* hdp invalidate */ > 7 + /* gfx_v9_0_ring_emit_pipeline_sync */ > SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 + > - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 + > + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 + > 2 + /* gfx_v9_0_ring_emit_vm_flush */ > 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */ > .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */ > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c > b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c > index 6e1b25bd1fe7..100d526e9a42 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c > @@ -346,6 +346,15 @@ static uint64_t > gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring, > > amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req); > > + /* Insert a dummy read to delay one cycle before the ACK > + * inquiry. > + */ > + if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA || > + ring->funcs->type == AMDGPU_RING_TYPE_GFX || > + ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE) > + amdgpu_ring_emit_reg_wait(ring, > + hub->vm_inv_eng0_req + eng, 0, 0); > + > /* wait for the invalidate to complete */ > amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng, > 1 << vmid, 1 << vmid); > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > index 9f2a893871ec..8f3097e45299 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring, > amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid), > upper_32_bits(pd_addr)); > > + /* Insert a dummy read to delay one cycle before the ACK > + * inquiry. 
> + */ > + if (ring->funcs->type == AMDGPU_RING_TYPE_GFX || > + ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE) > + amdgpu_ring_emit_reg_wait(ring, > + hub->vm_inv_eng0_req + eng, 0, 0); The workaround should be to add a dummy read (one cycle delay) after we write VM_INVALIDATE_ENGx_REQ and before we poll the VM_INVALIDATE_ENGx_ACK. If you add it here, it cannot resolve the issue. I think you should implement the dummy read in the function below: amdgpu_ring_emit_reg_write_reg_wait(). Thanks, Ray > + > amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng, > hub->vm_inv_eng0_ack + eng, > req, 1 << vmid); > diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c > b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c > index b8fdb192f6d6..0c41b4fdc58b 100644 > --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c > @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = { > 6 + /* sdma_v5_0_ring_emit_pipeline_sync */ > /* sdma_v5_0_ring_emit_vm_flush */ > SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 + > - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 + > + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 + > 10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */ > .emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */ > .emit_ib = sdma_v5_0_ring_emit_ib, > -- > 2.23.0.385.gbc12974a89 > > _______________________________________________ > amd-gfx mailing list > amd-gfx@xxxxxxxxxxxxxxxxxxxxx > https://lists.freedesktop.org/mailman/listinfo/amd-gfx _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Attachment:
0001-drm-amdgpu-add-dummy-read-by-engines-for-some-GCVM-s.patch
Description: 0001-drm-amdgpu-add-dummy-read-by-engines-for-some-GCVM-s.patch
_______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx