On 03.04.2018 at 22:25, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
>
> There is a new IB flag that enables this new behavior.
> Full invalidation is unnecessary for RELEASE_MEM and doesn't make sense
> when draw calls from two adjacent gfx IBs run in parallel. This will be
> the new default for Mesa.
>
> v2: bump the version
>
> Signed-off-by: Marek Olšák <marek.olsak at amd.com>

Looks good to me, but I would split it into two patches: one that implements
everything in the common code, and a second that implements the handling in
gfx_v9_0.c and bumps the version number.

But that's only a nice-to-have; the patch is
Reviewed-by: Christian König <christian.koenig at amd.com> anyway.

Regards,
Christian.

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c   |  3 ++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c |  5 +++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c    |  8 ++++++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h  |  4 +++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c    |  2 +-
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c     | 11 +++++++----
>  drivers/gpu/drm/amd/amdgpu/soc15d.h       |  1 +
>  include/uapi/drm/amdgpu_drm.h             |  4 ++++
>  8 files changed, 27 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 43df7d2aebb4..0a45f5cceba7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -68,23 +68,24 @@
>   * - 3.16.0 - Add reserved vmid support
>   * - 3.17.0 - Add AMDGPU_NUM_VRAM_CPU_PAGE_FAULTS.
>   * - 3.18.0 - Export gpu always on cu bitmap
>   * - 3.19.0 - Add support for UVD MJPEG decode
>   * - 3.20.0 - Add support for local BOs
>   * - 3.21.0 - Add DRM_AMDGPU_FENCE_TO_HANDLE ioctl
>   * - 3.22.0 - Add DRM_AMDGPU_SCHED ioctl
>   * - 3.23.0 - Add query for VRAM lost counter
>   * - 3.24.0 - Add high priority compute support for gfx9
>   * - 3.25.0 - Add support for sensor query info (stable pstate sclk/mclk).
> + * - 3.26.0 - GFX9: Process AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE.
>   */
>  #define KMS_DRIVER_MAJOR 3
> -#define KMS_DRIVER_MINOR 25
> +#define KMS_DRIVER_MINOR 26
>  #define KMS_DRIVER_PATCHLEVEL 0
>
>  int amdgpu_vram_limit = 0;
>  int amdgpu_vis_vram_limit = 0;
>  int amdgpu_gart_size = -1; /* auto */
>  int amdgpu_gtt_size = -1; /* auto */
>  int amdgpu_moverate = -1; /* auto */
>  int amdgpu_benchmarking = 0;
>  int amdgpu_testing = 0;
>  int amdgpu_audio = -1;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index 97449e06a242..d09fcab2398f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -124,39 +124,40 @@ static u32 amdgpu_fence_read(struct amdgpu_ring *ring)
>
>  /**
>   * amdgpu_fence_emit - emit a fence on the requested ring
>   *
>   * @ring: ring the fence is associated with
>   * @f: resulting fence object
>   *
>   * Emits a fence command on the requested ring (all asics).
>   * Returns 0 on success, -ENOMEM on failure.
>   */
> -int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f)
> +int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
> +                      unsigned flags)
>  {
>          struct amdgpu_device *adev = ring->adev;
>          struct amdgpu_fence *fence;
>          struct dma_fence *old, **ptr;
>          uint32_t seq;
>
>          fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_KERNEL);
>          if (fence == NULL)
>                  return -ENOMEM;
>
>          seq = ++ring->fence_drv.sync_seq;
>          fence->ring = ring;
>          dma_fence_init(&fence->base, &amdgpu_fence_ops,
>                         &ring->fence_drv.lock,
>                         adev->fence_context + ring->idx,
>                         seq);
>          amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
> -                               seq, AMDGPU_FENCE_FLAG_INT);
> +                               seq, flags | AMDGPU_FENCE_FLAG_INT);
>
>          ptr = &ring->fence_drv.fences[seq & ring->fence_drv.num_fences_mask];
>          /* This function can't be called concurrently anyway, otherwise
>           * emitting the fence would mess up the hardware ring buffer.
>           */
>          old = rcu_dereference_protected(*ptr, 1);
>          if (old && !dma_fence_is_signaled(old)) {
>                  DRM_INFO("rcu slot is busy\n");
>                  dma_fence_wait(old, false);
>          }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> index 311589e02d17..f70eeed9ed76 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> @@ -120,20 +120,21 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>                         struct dma_fence **f)
>  {
>          struct amdgpu_device *adev = ring->adev;
>          struct amdgpu_ib *ib = &ibs[0];
>          struct dma_fence *tmp = NULL;
>          bool skip_preamble, need_ctx_switch;
>          unsigned patch_offset = ~0;
>          struct amdgpu_vm *vm;
>          uint64_t fence_ctx;
>          uint32_t status = 0, alloc_size;
> +        unsigned fence_flags = 0;
>
>          unsigned i;
>          int r = 0;
>          bool need_pipe_sync = false;
>
>          if (num_ibs == 0)
>                  return -EINVAL;
>
>          /* ring tests don't use a job */
>          if (job) {
> @@ -220,36 +221,39 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>          }
>
>          if (ring->funcs->emit_tmz)
>                  amdgpu_ring_emit_tmz(ring, false);
>
>  #ifdef CONFIG_X86_64
>          if (!(adev->flags & AMD_IS_APU))
>  #endif
>                  amdgpu_asic_invalidate_hdp(adev, ring);
>
> -        r = amdgpu_fence_emit(ring, f);
> +        if (ib->flags & AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE)
> +                fence_flags |= AMDGPU_FENCE_FLAG_TC_WB_ONLY;
> +
> +        r = amdgpu_fence_emit(ring, f, fence_flags);
>          if (r) {
>                  dev_err(adev->dev, "failed to emit fence (%d)\n", r);
>                  if (job && job->vmid)
>                          amdgpu_vmid_reset(adev, ring->funcs->vmhub, job->vmid);
>                  amdgpu_ring_undo(ring);
>                  return r;
>          }
>
>          if (ring->funcs->insert_end)
>                  ring->funcs->insert_end(ring);
>
>          /* wrap the last IB with fence */
>          if (job && job->uf_addr) {
>                  amdgpu_ring_emit_fence(ring, job->uf_addr, job->uf_sequence,
> -                                       AMDGPU_FENCE_FLAG_64BIT);
> +                                       fence_flags | AMDGPU_FENCE_FLAG_64BIT);
>          }
>
>          if (patch_offset != ~0 && ring->funcs->patch_cond_exec)
>                  amdgpu_ring_patch_cond_exec(ring, patch_offset);
>
>          ring->current_ctx = fence_ctx;
>          if (vm && ring->funcs->emit_switch_buffer)
>                  amdgpu_ring_emit_switch_buffer(ring);
>          amdgpu_ring_commit(ring);
>          return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 1d0d250cbfdf..222052daedd1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -33,20 +33,21 @@
>  #define AMDGPU_MAX_COMPUTE_RINGS 8
>  #define AMDGPU_MAX_VCE_RINGS 3
>  #define AMDGPU_MAX_UVD_ENC_RINGS 2
>
>  /* some special values for the owner field */
>  #define AMDGPU_FENCE_OWNER_UNDEFINED ((void*)0ul)
>  #define AMDGPU_FENCE_OWNER_VM ((void*)1ul)
>
>  #define AMDGPU_FENCE_FLAG_64BIT (1 << 0)
>  #define AMDGPU_FENCE_FLAG_INT (1 << 1)
> +#define AMDGPU_FENCE_FLAG_TC_WB_ONLY (1 << 2)
>
>  enum amdgpu_ring_type {
>          AMDGPU_RING_TYPE_GFX,
>          AMDGPU_RING_TYPE_COMPUTE,
>          AMDGPU_RING_TYPE_SDMA,
>          AMDGPU_RING_TYPE_UVD,
>          AMDGPU_RING_TYPE_VCE,
>          AMDGPU_RING_TYPE_KIQ,
>          AMDGPU_RING_TYPE_UVD_ENC,
>          AMDGPU_RING_TYPE_VCN_DEC,
> @@ -81,21 +82,22 @@ int amdgpu_fence_driver_init(struct amdgpu_device *adev);
>  void amdgpu_fence_driver_fini(struct amdgpu_device *adev);
>  void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring);
>
>  int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
>                                    unsigned num_hw_submission);
>  int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring,
>                                     struct amdgpu_irq_src *irq_src,
>                                     unsigned irq_type);
>  void amdgpu_fence_driver_suspend(struct amdgpu_device *adev);
>  void amdgpu_fence_driver_resume(struct amdgpu_device *adev);
> -int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence);
> +int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence,
> +                      unsigned flags);
>  int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s);
>  void amdgpu_fence_process(struct amdgpu_ring *ring);
>  int amdgpu_fence_wait_empty(struct amdgpu_ring *ring);
>  signed long amdgpu_fence_wait_polling(struct amdgpu_ring *ring,
>                                        uint32_t wait_seq,
>                                        signed long timeout);
>  unsigned amdgpu_fence_count_emitted(struct amdgpu_ring *ring);
>
>  /*
>   * Rings.
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 24474294c92a..fe05351ea4d2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -620,21 +620,21 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, bool need_
>
>          if (vm_flush_needed) {
>                  trace_amdgpu_vm_flush(ring, job->vmid, job->vm_pd_addr);
>                  amdgpu_ring_emit_vm_flush(ring, job->vmid, job->vm_pd_addr);
>          }
>
>          if (pasid_mapping_needed)
>                  amdgpu_gmc_emit_pasid_mapping(ring, job->vmid, job->pasid);
>
>          if (vm_flush_needed || pasid_mapping_needed) {
> -                r = amdgpu_fence_emit(ring, &fence);
> +                r = amdgpu_fence_emit(ring, &fence, 0);
>                  if (r)
>                          return r;
>          }
>
>          if (vm_flush_needed) {
>                  mutex_lock(&id_mgr->lock);
>                  dma_fence_put(id->last_flush);
>                  id->last_flush = dma_fence_get(fence);
>                  id->current_gpu_reset_count =
>                          atomic_read(&adev->gpu_reset_counter);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 9d39fd5b1822..5dea0d4c0af4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -3767,27 +3767,30 @@ static void gfx_v9_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
>                                            lower_32_bits(ib->gpu_addr));
>          amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr));
>          amdgpu_ring_write(ring, control);
>  }
>
>  static void gfx_v9_0_ring_emit_fence(struct amdgpu_ring *ring, u64 addr,
>                                       u64 seq, unsigned flags)
>  {
>          bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
>          bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
> +        bool writeback = flags & AMDGPU_FENCE_FLAG_TC_WB_ONLY;
>
>          /* RELEASE_MEM - flush caches, send int */
>          amdgpu_ring_write(ring, PACKET3(PACKET3_RELEASE_MEM, 6));
> -        amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
> -                                 EOP_TC_ACTION_EN |
> -                                 EOP_TC_WB_ACTION_EN |
> -                                 EOP_TC_MD_ACTION_EN |
> +        amdgpu_ring_write(ring, ((writeback ? (EOP_TC_WB_ACTION_EN |
> +                                               EOP_TC_NC_ACTION_EN) :
> +                                              (EOP_TCL1_ACTION_EN |
> +                                               EOP_TC_ACTION_EN |
> +                                               EOP_TC_WB_ACTION_EN |
> +                                               EOP_TC_MD_ACTION_EN)) |
>                                   EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) |
>                                   EVENT_INDEX(5)));
>          amdgpu_ring_write(ring, DATA_SEL(write64bit ? 2 : 1) | INT_SEL(int_sel ? 2 : 0));
>
>          /*
>           * the address should be Qword aligned if 64bit write, Dword
>           * aligned if only send 32bit data low (discard data high)
>           */
>          if (write64bit)
>                  BUG_ON(addr & 0x7);
> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15d.h b/drivers/gpu/drm/amd/amdgpu/soc15d.h
> index 7f408f85fdb6..839a144c1645 100644
> --- a/drivers/gpu/drm/amd/amdgpu/soc15d.h
> +++ b/drivers/gpu/drm/amd/amdgpu/soc15d.h
> @@ -152,20 +152,21 @@
>   * 4 - *S_PARTIAL_FLUSH
>   */
>  #define PACKET3_RELEASE_MEM 0x49
>  #define EVENT_TYPE(x) ((x) << 0)
>  #define EVENT_INDEX(x) ((x) << 8)
>  #define EOP_TCL1_VOL_ACTION_EN (1 << 12)
>  #define EOP_TC_VOL_ACTION_EN (1 << 13) /* L2 */
>  #define EOP_TC_WB_ACTION_EN (1 << 15) /* L2 */
>  #define EOP_TCL1_ACTION_EN (1 << 16)
>  #define EOP_TC_ACTION_EN (1 << 17) /* L2 */
> +#define EOP_TC_NC_ACTION_EN (1 << 19)
>  #define EOP_TC_MD_ACTION_EN (1 << 21) /* L2 metadata */
>
>  #define DATA_SEL(x) ((x) << 29)
>  /* 0 - discard
>   * 1 - send low 32bit data
>   * 2 - send 64bit data
>   * 3 - send 64bit GPU counter value
>   * 4 - send 64bit sys counter value
>   */
>  #define INT_SEL(x) ((x) << 24)
> diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
> index 0087799962cf..f5901bd9c7d8 100644
> --- a/include/uapi/drm/amdgpu_drm.h
> +++ b/include/uapi/drm/amdgpu_drm.h
> @@ -516,20 +516,24 @@ union drm_amdgpu_cs {
>
>  /* This IB should be submitted to CE */
>  #define AMDGPU_IB_FLAG_CE (1<<0)
>
>  /* Preamble flag, which means the IB could be dropped if no context switch */
>  #define AMDGPU_IB_FLAG_PREAMBLE (1<<1)
>
>  /* Preempt flag, IB should set Pre_enb bit if PREEMPT flag detected */
>  #define AMDGPU_IB_FLAG_PREEMPT (1<<2)
>
> +/* The IB fence should do the L2 writeback but not invalidate any shader
> + * caches (L2/vL1/sL1/I$). */
> +#define AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE (1 << 3)
> +
>  struct drm_amdgpu_cs_chunk_ib {
>          __u32 _pad;
>          /** AMDGPU_IB_FLAG_* */
>          __u32 flags;
>          /** Virtual address to begin IB execution */
>          __u64 va_start;
>          /** Size of submission */
>          __u32 ib_bytes;
>          /** HW IP to submit to */
>          __u32 ip_type;
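
For anyone wiring this up from userspace, here is a minimal, illustrative sketch
(not part of the patch; the helper name and the omitted drm_amdgpu_cs plumbing
are made up for the example) of how a client could fill the gfx IB chunk and
only request the new flag when the kernel advertises a DRM minor of 26 or newer,
which is when the flag starts being processed per the version bump above:

#include <stdint.h>
#include <string.h>
#include <drm/amdgpu_drm.h>   /* kernel UAPI header; include path may differ with libdrm */

/* Hypothetical helper: fill one gfx IB chunk and ask for the lighter
 * "TC writeback only" EOP cache policy when the kernel supports it. */
static void fill_gfx_ib_chunk(struct drm_amdgpu_cs_chunk_ib *ib,
                              uint64_t ib_va, uint32_t ib_bytes,
                              uint32_t drm_minor)
{
        memset(ib, 0, sizeof(*ib));
        ib->va_start = ib_va;        /* GPU VA of the uploaded IB */
        ib->ib_bytes = ib_bytes;     /* size of the IB in bytes */
        ib->ip_type  = AMDGPU_HW_IP_GFX;

        /* AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE is only processed by
         * kernels with DRM version 3.26+, so gate it on the minor. */
        if (drm_minor >= 26)
                ib->flags |= AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE;
}

This mirrors the intent stated in the commit message: a client like Mesa checks
the advertised DRM minor once at startup and then sets the flag on its gfx
submissions by default, getting the lighter RELEASE_MEM cache policy instead of
a full invalidation between adjacent gfx IBs.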