From: Nicolai Hähnle <nicolai.haehnle@xxxxxxx> Bring the code in line with what the radeon module does. Without this change, the fence following the IB may be signalled to the CPU even though some data written by shaders may not have been written back yet. This change fixes the OpenGL CTS test GL45-CTS.gtf32.GL3Tests.packed_pixels.packed_pixels_pbo on Verde. Signed-off-by: Nicolai Hähnle <nicolai.haehnle@amd.com> -- So, I'm not too familiar with the details of these syncs, but the radeon module effectively does: - IB - SURFACE_SYNC (vm_id of IB) - SURFACE_SYNC (vm_id == 0) - EVENT_WRITE_EOP (kernel fence) While amdgpu now does (with this change): - IB - SURFACE_SYNC (vm_id of IB) <-- this was missing - SURFACE_SYNC (vm_id == 0) - EVENT_WRITE_EOP (kernel fence) - SURFACE_SYNC (vm_id == 0) - EVENT_WRITE_EOP (user fence) It seems like at least the second SURFACE_SYNC (vm_id == 0) should be redundant, so the question is whether the SURFACE_SYNC (vm_id == 0) should be rearranged somehow, perhaps also added to the IB emission. But for better bisectability, that should probably be a separate change. 
Cheers, Nicolai --- drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c index 03d2a0a..c4f444d 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c @@ -1922,20 +1922,35 @@ static void gfx_v6_0_ring_emit_ib(struct amdgpu_ring *ring, control |= ib->length_dw | (vm_id << 24); amdgpu_ring_write(ring, header); amdgpu_ring_write(ring, #ifdef __BIG_ENDIAN (2 << 0) | #endif (ib->gpu_addr & 0xFFFFFFFC)); amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr) & 0xFFFF); amdgpu_ring_write(ring, control); + + if (!(ib->flags & AMDGPU_IB_FLAG_CE)) { + /* flush read cache over gart for this vmid */ + amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1)); + amdgpu_ring_write(ring, (mmCP_COHER_CNTL2 - PACKET3_SET_CONFIG_REG_START)); + amdgpu_ring_write(ring, vm_id); + amdgpu_ring_write(ring, PACKET3(PACKET3_SURFACE_SYNC, 3)); + amdgpu_ring_write(ring, PACKET3_TCL1_ACTION_ENA | + PACKET3_TC_ACTION_ENA | + PACKET3_SH_KCACHE_ACTION_ENA | + PACKET3_SH_ICACHE_ACTION_ENA); + amdgpu_ring_write(ring, 0xFFFFFFFF); + amdgpu_ring_write(ring, 0); + amdgpu_ring_write(ring, 10); /* poll interval */ + } } /** * gfx_v6_0_ring_test_ib - basic ring IB test * * @ring: amdgpu_ring structure holding ring information * * Allocate an IB and execute it on the gfx ring (SI). * Provides a basic gfx ring test to verify that IBs are working. * Returns 0 on success, error on failure. 
@@ -3631,21 +3646,21 @@ static const struct amdgpu_ring_funcs gfx_v6_0_ring_funcs_gfx = { .get_rptr = gfx_v6_0_ring_get_rptr, .get_wptr = gfx_v6_0_ring_get_wptr, .set_wptr = gfx_v6_0_ring_set_wptr_gfx, .emit_frame_size = 5 + /* gfx_v6_0_ring_emit_hdp_flush */ 5 + /* gfx_v6_0_ring_emit_hdp_invalidate */ 14 + 14 + 14 + /* gfx_v6_0_ring_emit_fence x3 for user fence, vm fence */ 7 + 4 + /* gfx_v6_0_ring_emit_pipeline_sync */ 17 + 6 + /* gfx_v6_0_ring_emit_vm_flush */ 3 + 2, /* gfx_v6_ring_emit_cntxcntl including vgt flush */ - .emit_ib_size = 6, /* gfx_v6_0_ring_emit_ib */ + .emit_ib_size = 6 + 8, /* gfx_v6_0_ring_emit_ib */ .emit_ib = gfx_v6_0_ring_emit_ib, .emit_fence = gfx_v6_0_ring_emit_fence, .emit_pipeline_sync = gfx_v6_0_ring_emit_pipeline_sync, .emit_vm_flush = gfx_v6_0_ring_emit_vm_flush, .emit_hdp_flush = gfx_v6_0_ring_emit_hdp_flush, .emit_hdp_invalidate = gfx_v6_0_ring_emit_hdp_invalidate, .test_ring = gfx_v6_0_ring_test_ring, .test_ib = gfx_v6_0_ring_test_ib, .insert_nop = amdgpu_ring_insert_nop, .emit_cntxcntl = gfx_v6_ring_emit_cntxcntl, @@ -3657,21 +3672,21 @@ static const struct amdgpu_ring_funcs gfx_v6_0_ring_funcs_compute = { .nop = 0x80000000, .get_rptr = gfx_v6_0_ring_get_rptr, .get_wptr = gfx_v6_0_ring_get_wptr, .set_wptr = gfx_v6_0_ring_set_wptr_compute, .emit_frame_size = 5 + /* gfx_v6_0_ring_emit_hdp_flush */ 5 + /* gfx_v6_0_ring_emit_hdp_invalidate */ 7 + /* gfx_v6_0_ring_emit_pipeline_sync */ 17 + /* gfx_v6_0_ring_emit_vm_flush */ 14 + 14 + 14, /* gfx_v6_0_ring_emit_fence x3 for user fence, vm fence */ - .emit_ib_size = 6, /* gfx_v6_0_ring_emit_ib */ + .emit_ib_size = 6 + 8, /* gfx_v6_0_ring_emit_ib */ .emit_ib = gfx_v6_0_ring_emit_ib, .emit_fence = gfx_v6_0_ring_emit_fence, .emit_pipeline_sync = gfx_v6_0_ring_emit_pipeline_sync, .emit_vm_flush = gfx_v6_0_ring_emit_vm_flush, .emit_hdp_flush = gfx_v6_0_ring_emit_hdp_flush, .emit_hdp_invalidate = gfx_v6_0_ring_emit_hdp_invalidate, .test_ring = gfx_v6_0_ring_test_ring, .test_ib = 
gfx_v6_0_ring_test_ib, .insert_nop = amdgpu_ring_insert_nop, }; -- 2.9.3