Previously we always inserted 128 NOPs behind vm_flush, which may push the DMA frame size above 256 DWs and get it automatically aligned up to 512 DWs. Now we calculate how many DWs have already been inserted after vm_flush and pad NOPs for the rest, up to 128 DWs, before emit_ib. That way each submit only takes 256 DWs.

v2: drop the 128-NOP insertion in gfx_v8_0_ring_emit_vm_flush, and subtract the DWs emitted between vm_flush and emit_ib from the estimated frame size, since we already consider that vm_flush takes 128 + 19 DWs.

Change-Id: Iac198e16f35b071476ba7bd48ab338223f6fe650
Signed-off-by: Monk Liu <Monk.Liu at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c   |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 25 ++++++++++++++++++++-----
 3 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
index 9129b8c..e91f227 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
@@ -165,6 +165,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 		patch_offset = amdgpu_ring_init_cond_exec(ring);
 
 	need_ctx_switch = ring->current_ctx != fence_ctx;
+	ring->dws_between_vm_ib = 0; /* clear before recalculate */
 	if (vm) {
 		r = amdgpu_vm_flush(ring, job);
 		if (r) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index c813cbe..1dbe600 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -173,6 +173,7 @@ struct amdgpu_ring {
 #if defined(CONFIG_DEBUG_FS)
 	struct dentry *ent;
 #endif
+	u32 dws_between_vm_ib;
 };
 
 int amdgpu_ring_alloc(struct amdgpu_ring *ring, unsigned ndw);
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 5f37313..5e8e4eb 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -5670,6 +5670,8 @@ static void gfx_v8_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
 	amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].oa);
 	amdgpu_ring_write(ring, 0);
 	amdgpu_ring_write(ring, (1 << (oa_size + oa_base)) - (1 << oa_base));
+
+	ring->dws_between_vm_ib += 20;
 }
 
 static uint32_t wave_read_ind(struct amdgpu_device *adev, uint32_t simd, uint32_t wave, uint32_t address)
@@ -6489,6 +6491,8 @@ static void gfx_v8_0_ring_emit_hdp_flush(struct amdgpu_ring *ring)
 	amdgpu_ring_write(ring, ref_and_mask);
 	amdgpu_ring_write(ring, ref_and_mask);
 	amdgpu_ring_write(ring, 0x20); /* poll interval */
+
+	ring->dws_between_vm_ib += 7;
 }
 
 static void gfx_v8_0_ring_emit_vgt_flush(struct amdgpu_ring *ring)
@@ -6500,6 +6504,8 @@ static void gfx_v8_0_ring_emit_vgt_flush(struct amdgpu_ring *ring)
 
 	amdgpu_ring_write(ring, PACKET3(PACKET3_EVENT_WRITE, 0));
 	amdgpu_ring_write(ring, EVENT_TYPE(VGT_FLUSH) | EVENT_INDEX(0));
+
+	ring->dws_between_vm_ib += 4;
 }
 
 
@@ -6573,6 +6579,7 @@ static void gfx_v8_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64 addr,
 	amdgpu_ring_write(ring, lower_32_bits(seq));
 	amdgpu_ring_write(ring, upper_32_bits(seq));
 
+	ring->dws_between_vm_ib += 6;
 }
 
 static void gfx_v8_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
@@ -6636,8 +6643,6 @@ static void gfx_v8_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
 		/* sync PFP to ME, otherwise we might get invalid PFP reads */
 		amdgpu_ring_write(ring, PACKET3(PACKET3_PFP_SYNC_ME, 0));
 		amdgpu_ring_write(ring, 0x0);
-		/* GFX8 emits 128 dw nop to prevent CE access VM before vm_flush finish */
-		amdgpu_ring_insert_nop(ring, 128);
 	}
 }
 
@@ -6711,9 +6716,11 @@ static void gfx_v8_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags)
 {
 	uint32_t dw2 = 0;
 
-	if (amdgpu_sriov_vf(ring->adev))
+	if (amdgpu_sriov_vf(ring->adev)) {
 		gfx_v8_0_ring_emit_ce_meta_init(ring,
 			(flags & AMDGPU_VM_DOMAIN) ? AMDGPU_CSA_VADDR : ring->adev->virt.csa_vmid0_addr);
+		ring->dws_between_vm_ib += 8;
+	}
 
 	dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
 	if (flags & AMDGPU_HAVE_CTX_SWITCH) {
@@ -6739,10 +6746,17 @@ static void gfx_v8_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags)
 	amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
 	amdgpu_ring_write(ring, dw2);
 	amdgpu_ring_write(ring, 0);
+	ring->dws_between_vm_ib += 3;
 
-	if (amdgpu_sriov_vf(ring->adev))
+	if (amdgpu_sriov_vf(ring->adev)) {
 		gfx_v8_0_ring_emit_de_meta_init(ring,
 			(flags & AMDGPU_VM_DOMAIN) ? AMDGPU_CSA_VADDR : ring->adev->virt.csa_vmid0_addr);
+		ring->dws_between_vm_ib += 21;
+	}
+
+	/* We need to pad some NOPs before emit_ib to prevent CE run ahead of
+	 * vm_flush, which may trigger VM fault. */
+	amdgpu_ring_insert_nop(ring, 128 - ring->dws_between_vm_ib);
 }
 
 static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
@@ -7018,7 +7032,8 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
 		7 + /* gfx_v8_0_ring_emit_pipeline_sync */
 		128 + 19 + /* gfx_v8_0_ring_emit_vm_flush */
 		2 + /* gfx_v8_ring_emit_sb */
-		3 + 4 + 29, /* gfx_v8_ring_emit_cntxcntl including vgt flush/meta-data */
+		3 + 4 + 29 - /* gfx_v8_ring_emit_cntxcntl including vgt flush/meta-data */
+		20 - 7 - 6 - 3 - 4 - 29, /* no need to count gds/hdp_flush/vm_flush fence/cntx_cntl/vgt_flush/meta-data anymore */
 	.emit_ib_size =	4, /* gfx_v8_0_ring_emit_ib_gfx */
 	.emit_ib = gfx_v8_0_ring_emit_ib_gfx,
 	.emit_fence = gfx_v8_0_ring_emit_fence_gfx,
-- 
2.7.4
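[Note, not part of the patch] A minimal sketch of the accounting idea described above, assuming the patch's scheme: every packet emitted between vm_flush and emit_ib adds its size in DWs to ring->dws_between_vm_ib, and cntxcntl then pads NOPs for whatever remains of the fixed 128-DW window. The helper name nops_to_pad and the 65-DW example value are hypothetical, used only for illustration; they are not amdgpu API.

#include <assert.h>
#include <stdio.h>

/* Hypothetical helper mirroring the padding done in gfx_v8_ring_emit_cntxcntl:
 * the DWs already emitted after vm_flush are subtracted from the 128-DW
 * window and the remainder is filled with NOPs.  This assumes, as the patch
 * does, that the packets in that window never exceed 128 DWs. */
static unsigned int nops_to_pad(unsigned int dws_between_vm_ib)
{
	assert(dws_between_vm_ib <= 128);
	return 128u - dws_between_vm_ib;
}

int main(void)
{
	/* Example only: if, say, 65 DWs of packets were emitted after
	 * vm_flush, 63 NOP DWs are padded, so vm_flush plus padding still
	 * occupy exactly 128 DWs in front of the IB. */
	printf("pad %u NOP DWs\n", nops_to_pad(65));
	return 0;
}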