For GFX8 we originally used a double SWITCH_BUFFER to prevent the CE
from running ahead of the DE, which avoids a VM fault in case the VM
flush has not finished yet. But the double SWITCH_BUFFER costs
performance, and world switch preemption requires that only one
SWITCH_BUFFER be emitted, at the end of the DMA frame.

To prevent a CE VM fault without the double SWITCH_BUFFER, insert
128 DW of NOPs after the VM flush: from the hardware's perspective the
CE can run at most 128 DW ahead of the DE, so the CE cannot advance
past the NOPs into the next frame before the DE has completed the
flush. Even without a context switch, using SWITCH_BUFFER is
encouraged, since it gives better CE performance.

Change-Id: Ifdabf23f97e74156c1f66dddd0918ea7ffcddb20
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
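As a side note (not part of the diff below): the end-of-frame hook added
here boils down to an optional-callback pattern, where only rings whose
funcs table fills in .emit_switch_buffer get the trailing SWITCH_BUFFER.
The following is a minimal, self-contained sketch of that pattern with
invented names (toy structs, printf instead of ring writes), not actual
amdgpu code:

#include <stdio.h>

struct ring;

struct ring_funcs {
	/* left NULL by IP blocks that need no trailing SWITCH_BUFFER */
	void (*emit_switch_buffer)(struct ring *ring);
};

struct ring {
	const struct ring_funcs *funcs;
	const char *name;
};

static void gfx8_emit_sb(struct ring *ring)
{
	printf("%s: SWITCH_BUFFER at end of DMA frame\n", ring->name);
}

static const struct ring_funcs gfx8_funcs = {
	.emit_switch_buffer = gfx8_emit_sb,
};

static const struct ring_funcs sdma_funcs = {
	/* no .emit_switch_buffer, so nothing extra is emitted */
};

static void ib_schedule(struct ring *ring)
{
	/* ... emit IBs, fence, etc. ... */
	if (ring->funcs->emit_switch_buffer)	/* same NULL check as the patch */
		ring->funcs->emit_switch_buffer(ring);
	/* ... commit the ring ... */
}

int main(void)
{
	struct ring gfx  = { &gfx8_funcs, "gfx" };
	struct ring sdma = { &sdma_funcs, "sdma" };

	ib_schedule(&gfx);	/* emits the SB */
	ib_schedule(&sdma);	/* callback is NULL, silently skipped */
	return 0;
}

This keeps the SB policy per-IP: GFX8 opts in through its ring funcs
table while every other ring is untouched.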
 drivers/gpu/drm/amd/amdgpu/amdgpu.h    |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c |  8 ++++++++
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c  | 28 +++++++++++++---------------
 3 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index cb0098a..a935831 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -338,6 +338,7 @@ struct amdgpu_ring_funcs {
 	void (*end_use)(struct amdgpu_ring *ring);
 	void (*emit_wreg) (struct amdgpu_ring *ring, uint32_t offset, uint32_t val);
 	void (*emit_rreg) (struct amdgpu_ring *ring, uint32_t offset);
+	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
 };
 
 /*
@@ -2372,6 +2373,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
 #define amdgpu_ring_emit_hdp_invalidate(r) (r)->funcs->emit_hdp_invalidate((r))
 #define amdgpu_ring_emit_wreg(r, i, v) (r)->funcs->emit_wreg((r), (i), (v))
 #define amdgpu_ring_emit_rreg(r, i) (r)->funcs->emit_rreg((r), (i))
+#define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
 #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
 #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))
 #define amdgpu_ring_patch_cond_exec(r,o) (r)->funcs->patch_cond_exec((r),(o))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
index a31d7ef..4e5b2f3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
@@ -210,6 +210,14 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 		amdgpu_ring_patch_cond_exec(ring, patch_offset);
 
 	ring->current_ctx = ctx;
+
+	/* Insert one SB at the end of the DMA frame.
+	 * For now only the GFX8 IP block needs this; CI does not
+	 * insert an SB here, but instead emits a double
+	 * SWITCH_BUFFER after the VM flush and pipeline sync.
+	 */
+	if (ring->funcs->emit_switch_buffer)
+		amdgpu_ring_emit_switch_buffer(ring);
 	amdgpu_ring_commit(ring);
 	return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index dfa2288..41a2ef1 100755
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -5936,12 +5936,6 @@ static void gfx_v8_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
 {
 	u32 header, control = 0;
 
-	/* insert SWITCH_BUFFER packet before first IB in the ring frame */
-	if (ctx_switch) {
-		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
-		amdgpu_ring_write(ring, 0);
-	}
-
 	if (ib->flags & AMDGPU_IB_FLAG_CE)
 		header = PACKET3(PACKET3_INDIRECT_BUFFER_CONST, 2);
 	else
@@ -6013,11 +6007,9 @@ static void gfx_v8_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
 	amdgpu_ring_write(ring, 4); /* poll interval */
 
 	if (usepfp) {
-		/* synce CE with ME to prevent CE fetch CEIB before context switch done */
-		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
-		amdgpu_ring_write(ring, 0);
-		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
-		amdgpu_ring_write(ring, 0);
+		/* sync PFP to ME, otherwise we might get invalid PFP reads */
+		amdgpu_ring_write(ring, PACKET3(PACKET3_PFP_SYNC_ME, 0));
+		amdgpu_ring_write(ring, 0x0);
 	}
 }
 
@@ -6065,11 +6057,10 @@ static void gfx_v8_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
 		/* sync PFP to ME, otherwise we might get invalid PFP reads */
 		amdgpu_ring_write(ring, PACKET3(PACKET3_PFP_SYNC_ME, 0));
 		amdgpu_ring_write(ring, 0x0);
-		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
-		amdgpu_ring_write(ring, 0);
-		amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
-		amdgpu_ring_write(ring, 0);
 	}
+
+	/* GFX8 emits 128 dw nop to prevent CE accessing VM before vm_flush finishes */
+	amdgpu_ring_insert_nop(ring, 128);
 }
 static u32 gfx_v8_0_ring_get_rptr_compute(struct amdgpu_ring *ring)
 {
@@ -6170,6 +6161,12 @@ static void gfx_v8_0_ring_emit_wreg_kiq(struct amdgpu_ring *ring, u32 idx, u32 v
 	amdgpu_ring_write(ring, val);
 }
 
+static void gfx_v8_ring_emit_sb(struct amdgpu_ring *ring)
+{
+	amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
+	amdgpu_ring_write(ring, 0);
+}
+
 static void gfx_v8_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
 						 enum amdgpu_interrupt_state state)
 {
@@ -6477,6 +6474,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
 	.test_ib = gfx_v8_0_ring_test_ib,
 	.insert_nop = amdgpu_ring_insert_nop,
 	.pad_ib = amdgpu_ring_generic_pad_ib,
+	.emit_switch_buffer = gfx_v8_ring_emit_sb,
 };
 
 static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {
-- 
1.9.1
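A further illustrative sketch (again not amdgpu code; the real helper is
amdgpu_ring_insert_nop() in amdgpu_ring.c): the 128-DW pad is nothing
more than writing the ring's NOP packet 128 times. Since the CE may
fetch at most 128 DW ahead of the DE, the pad guarantees the CE cannot
reach any post-flush command before the DE has executed the VM flush.
The toy ring and NOP encoding below are placeholders:

#include <stdint.h>
#include <stdio.h>

#define TOY_RING_SIZE 256
#define TOY_NOP 0x80000000u	/* placeholder, not the real PACKET3 NOP */

struct toy_ring {
	uint32_t buf[TOY_RING_SIZE];
	uint32_t wptr;
};

static void toy_ring_write(struct toy_ring *ring, uint32_t v)
{
	ring->buf[ring->wptr++ % TOY_RING_SIZE] = v;
}

/* one NOP per DW; count = 128 gives the required CE/DE safety gap */
static void toy_ring_insert_nop(struct toy_ring *ring, uint32_t count)
{
	uint32_t i;

	for (i = 0; i < count; i++)
		toy_ring_write(ring, TOY_NOP);
}

int main(void)
{
	struct toy_ring ring = { .wptr = 0 };

	/* ... the vm_flush packets would be written here ... */
	toy_ring_insert_nop(&ring, 128);
	printf("wrote a pad of %u DWs after the flush\n", ring.wptr);
	return 0;
}

The trade-off, per the commit message: 128 DWs of ring space per frame
is cheaper than keeping the second SWITCH_BUFFER.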