From: Marek Olšák <marek.olsak@xxxxxxx> There is a new IB flag that enables this new behavior. Full invalidation is unnecessary for RELEASE_MEM and doesn't make sense when draw calls from two adjacent gfx IBs run in parallel. This will be the new default for Mesa. Signed-off-by: Marek Olšák <marek.olsak at amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 5 +++-- drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 8 ++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 4 +++- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 2 +- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 11 +++++++---- drivers/gpu/drm/amd/amdgpu/soc15d.h | 1 + include/uapi/drm/amdgpu_drm.h | 4 ++++ 7 files changed, 25 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index 97449e06a242..d09fcab2398f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c @@ -124,39 +124,40 @@ static u32 amdgpu_fence_read(struct amdgpu_ring *ring) /** * amdgpu_fence_emit - emit a fence on the requested ring * * @ring: ring the fence is associated with * @f: resulting fence object * * Emits a fence command on the requested ring (all asics). * Returns 0 on success, -ENOMEM on failure. */ -int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f) +int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, + unsigned flags) { struct amdgpu_device *adev = ring->adev; struct amdgpu_fence *fence; struct dma_fence *old, **ptr; uint32_t seq; fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_KERNEL); if (fence == NULL) return -ENOMEM; seq = ++ring->fence_drv.sync_seq; fence->ring = ring; dma_fence_init(&fence->base, &amdgpu_fence_ops, &ring->fence_drv.lock, adev->fence_context + ring->idx, seq); amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr, - seq, AMDGPU_FENCE_FLAG_INT); + seq, flags | AMDGPU_FENCE_FLAG_INT); ptr = &ring->fence_drv.fences[seq & ring->fence_drv.num_fences_mask]; /* This function can't be called concurrently anyway, otherwise * emitting the fence would mess up the hardware ring buffer. */ old = rcu_dereference_protected(*ptr, 1); if (old && !dma_fence_is_signaled(old)) { DRM_INFO("rcu slot is busy\n"); dma_fence_wait(old, false); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c index 311589e02d17..f70eeed9ed76 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c @@ -120,20 +120,21 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs, struct dma_fence **f) { struct amdgpu_device *adev = ring->adev; struct amdgpu_ib *ib = &ibs[0]; struct dma_fence *tmp = NULL; bool skip_preamble, need_ctx_switch; unsigned patch_offset = ~0; struct amdgpu_vm *vm; uint64_t fence_ctx; uint32_t status = 0, alloc_size; + unsigned fence_flags = 0; unsigned i; int r = 0; bool need_pipe_sync = false; if (num_ibs == 0) return -EINVAL; /* ring tests don't use a job */ if (job) { @@ -220,36 +221,39 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs, } if (ring->funcs->emit_tmz) amdgpu_ring_emit_tmz(ring, false); #ifdef CONFIG_X86_64 if (!(adev->flags & AMD_IS_APU)) #endif amdgpu_asic_invalidate_hdp(adev, ring); - r = amdgpu_fence_emit(ring, f); + if (ib->flags & AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE) + fence_flags |= AMDGPU_FENCE_FLAG_TC_WB_ONLY; + + r = amdgpu_fence_emit(ring, f, fence_flags); if (r) { dev_err(adev->dev, "failed to emit fence (%d)\n", r); if (job && job->vmid) amdgpu_vmid_reset(adev, ring->funcs->vmhub, job->vmid); amdgpu_ring_undo(ring); return r; } if (ring->funcs->insert_end) ring->funcs->insert_end(ring); /* wrap the last IB with fence */ if (job && job->uf_addr) { amdgpu_ring_emit_fence(ring, job->uf_addr, job->uf_sequence, - AMDGPU_FENCE_FLAG_64BIT); + fence_flags | AMDGPU_FENCE_FLAG_64BIT); } if (patch_offset != ~0 && ring->funcs->patch_cond_exec) amdgpu_ring_patch_cond_exec(ring, patch_offset); ring->current_ctx = fence_ctx; if (vm && ring->funcs->emit_switch_buffer) amdgpu_ring_emit_switch_buffer(ring); amdgpu_ring_commit(ring); return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h index 1d0d250cbfdf..222052daedd1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h @@ -33,20 +33,21 @@ #define AMDGPU_MAX_COMPUTE_RINGS 8 #define AMDGPU_MAX_VCE_RINGS 3 #define AMDGPU_MAX_UVD_ENC_RINGS 2 /* some special values for the owner field */ #define AMDGPU_FENCE_OWNER_UNDEFINED ((void*)0ul) #define AMDGPU_FENCE_OWNER_VM ((void*)1ul) #define AMDGPU_FENCE_FLAG_64BIT (1 << 0) #define AMDGPU_FENCE_FLAG_INT (1 << 1) +#define AMDGPU_FENCE_FLAG_TC_WB_ONLY (1 << 2) enum amdgpu_ring_type { AMDGPU_RING_TYPE_GFX, AMDGPU_RING_TYPE_COMPUTE, AMDGPU_RING_TYPE_SDMA, AMDGPU_RING_TYPE_UVD, AMDGPU_RING_TYPE_VCE, AMDGPU_RING_TYPE_KIQ, AMDGPU_RING_TYPE_UVD_ENC, AMDGPU_RING_TYPE_VCN_DEC, @@ -81,21 +82,22 @@ int amdgpu_fence_driver_init(struct amdgpu_device *adev); void amdgpu_fence_driver_fini(struct amdgpu_device *adev); void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring); int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring, unsigned num_hw_submission); int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring, struct amdgpu_irq_src *irq_src, unsigned irq_type); void amdgpu_fence_driver_suspend(struct amdgpu_device *adev); void amdgpu_fence_driver_resume(struct amdgpu_device *adev); -int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence); +int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence, + unsigned flags); int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s); void amdgpu_fence_process(struct amdgpu_ring *ring); int amdgpu_fence_wait_empty(struct amdgpu_ring *ring); signed long amdgpu_fence_wait_polling(struct amdgpu_ring *ring, uint32_t wait_seq, signed long timeout); unsigned amdgpu_fence_count_emitted(struct amdgpu_ring *ring); /* * Rings. diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 24474294c92a..fe05351ea4d2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -620,21 +620,21 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, bool need_ if (vm_flush_needed) { trace_amdgpu_vm_flush(ring, job->vmid, job->vm_pd_addr); amdgpu_ring_emit_vm_flush(ring, job->vmid, job->vm_pd_addr); } if (pasid_mapping_needed) amdgpu_gmc_emit_pasid_mapping(ring, job->vmid, job->pasid); if (vm_flush_needed || pasid_mapping_needed) { - r = amdgpu_fence_emit(ring, &fence); + r = amdgpu_fence_emit(ring, &fence, 0); if (r) return r; } if (vm_flush_needed) { mutex_lock(&id_mgr->lock); dma_fence_put(id->last_flush); id->last_flush = dma_fence_get(fence); id->current_gpu_reset_count = atomic_read(&adev->gpu_reset_counter); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 9d39fd5b1822..5dea0d4c0af4 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -3767,27 +3767,30 @@ static void gfx_v9_0_ring_emit_ib_compute(struct amdgpu_ring *ring, lower_32_bits(ib->gpu_addr)); amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr)); amdgpu_ring_write(ring, control); } static void gfx_v9_0_ring_emit_fence(struct amdgpu_ring *ring, u64 addr, u64 seq, unsigned flags) { bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT; bool int_sel = flags & AMDGPU_FENCE_FLAG_INT; + bool writeback = flags & AMDGPU_FENCE_FLAG_TC_WB_ONLY; /* RELEASE_MEM - flush caches, send int */ amdgpu_ring_write(ring, PACKET3(PACKET3_RELEASE_MEM, 6)); - amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN | - EOP_TC_ACTION_EN | - EOP_TC_WB_ACTION_EN | - EOP_TC_MD_ACTION_EN | + amdgpu_ring_write(ring, ((writeback ? (EOP_TC_WB_ACTION_EN | + EOP_TC_NC_ACTION_EN) : + (EOP_TCL1_ACTION_EN | + EOP_TC_ACTION_EN | + EOP_TC_WB_ACTION_EN | + EOP_TC_MD_ACTION_EN)) | EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5))); amdgpu_ring_write(ring, DATA_SEL(write64bit ? 2 : 1) | INT_SEL(int_sel ? 2 : 0)); /* * the address should be Qword aligned if 64bit write, Dword * aligned if only send 32bit data low (discard data high) */ if (write64bit) BUG_ON(addr & 0x7); diff --git a/drivers/gpu/drm/amd/amdgpu/soc15d.h b/drivers/gpu/drm/amd/amdgpu/soc15d.h index 7f408f85fdb6..839a144c1645 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15d.h +++ b/drivers/gpu/drm/amd/amdgpu/soc15d.h @@ -152,20 +152,21 @@ * 4 - *S_PARTIAL_FLUSH */ #define PACKET3_RELEASE_MEM 0x49 #define EVENT_TYPE(x) ((x) << 0) #define EVENT_INDEX(x) ((x) << 8) #define EOP_TCL1_VOL_ACTION_EN (1 << 12) #define EOP_TC_VOL_ACTION_EN (1 << 13) /* L2 */ #define EOP_TC_WB_ACTION_EN (1 << 15) /* L2 */ #define EOP_TCL1_ACTION_EN (1 << 16) #define EOP_TC_ACTION_EN (1 << 17) /* L2 */ +#define EOP_TC_NC_ACTION_EN (1 << 19) #define EOP_TC_MD_ACTION_EN (1 << 21) /* L2 metadata */ #define DATA_SEL(x) ((x) << 29) /* 0 - discard * 1 - send low 32bit data * 2 - send 64bit data * 3 - send 64bit GPU counter value * 4 - send 64bit sys counter value */ #define INT_SEL(x) ((x) << 24) diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index 0087799962cf..f5901bd9c7d8 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -516,20 +516,24 @@ union drm_amdgpu_cs { /* This IB should be submitted to CE */ #define AMDGPU_IB_FLAG_CE (1<<0) /* Preamble flag, which means the IB could be dropped if no context switch */ #define AMDGPU_IB_FLAG_PREAMBLE (1<<1) /* Preempt flag, IB should set Pre_enb bit if PREEMPT flag detected */ #define AMDGPU_IB_FLAG_PREEMPT (1<<2) +/* The IB fence should do the L2 writeback but not invalidate any shader + * caches (L2/vL1/sL1/I$). */ +#define AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE (1 << 3) + struct drm_amdgpu_cs_chunk_ib { __u32 _pad; /** AMDGPU_IB_FLAG_* */ __u32 flags; /** Virtual address to begin IB execution */ __u64 va_start; /** Size of submission */ __u32 ib_bytes; /** HW IP to submit to */ __u32 ip_type; -- 2.15.1