From: Christian König <christian.koenig@xxxxxxx>

The callback we installed for the SDMA update was actually pretty
horrible. Since we now have seq64, use that one and HW sequence writes
instead.

V2: (Shashank)
    - rebased on amd-drm-staging-next
    - changed amdgpu_seq64_gpu_addr

Cc: Christian König <christian.koenig@xxxxxxx>
Cc: Alex Deucher <alexander.deucher@xxxxxxx>
Cc: Felix Kuehling <felix.kuehling@xxxxxxx>
Cc: Rajneesh Bhardwaj <rajneesh.bhardwaj@xxxxxxx>
Signed-off-by: Christian König <christian.koenig@xxxxxxx>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c   | 14 ++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.h   |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c      | 79 ++++-----------------
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h      | 27 ++-----
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c  |  3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c   |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c |  5 ++
 7 files changed, 42 insertions(+), 89 deletions(-)
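For reviewers new to the seq64 scheme, the idea is that the page table
update itself publishes a monotonically increasing 64-bit sequence
number into CPU-visible memory (written by the SDMA engine via the user
fence, or directly by the CPU backend), and consumers decide whether a
TLB flush is needed by comparing that value against a cached copy. Below
is a minimal stand-alone userspace sketch of that pattern; the toy_*
names are illustrative stand-ins, not the amdgpu API, and C11 atomics
stand in for the kernel's atomic64_t and READ_ONCE():

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_vm {
        /* stands in for the seq64 slot the HW/CPU writes to */
        _Atomic uint64_t tlb_seq_slot;
        /* stands in for the producer-side counter (vm->tlb_seq) */
        _Atomic uint64_t tlb_seq;
};

/* producer: roughly what a commit does when needs_flush is set */
static void toy_commit(struct toy_vm *vm, bool needs_flush)
{
        if (needs_flush) {
                uint64_t seq = atomic_fetch_add(&vm->tlb_seq, 1) + 1;

                atomic_store(&vm->tlb_seq_slot, seq);
        }
}

/* consumer: models amdgpu_vm_tlb_seq() plus the caller's comparison */
static bool toy_needs_flush(struct toy_vm *vm, uint64_t *cached)
{
        uint64_t cur = atomic_load(&vm->tlb_seq_slot);

        if (cur == *cached)
                return false;
        *cached = cur;
        return true;
}

int main(void)
{
        struct toy_vm vm = { 0 };
        uint64_t cached = 0;

        toy_commit(&vm, true);
        printf("flush needed: %d\n", toy_needs_flush(&vm, &cached)); /* 1 */
        printf("flush needed: %d\n", toy_needs_flush(&vm, &cached)); /* 0 */
        toy_commit(&vm, false);
        printf("flush needed: %d\n", toy_needs_flush(&vm, &cached)); /* 0 */
        return 0;
}

The point of letting the hardware do the final store is that no CPU-side
completion callback (and no fence lifetime juggling) is needed anymore;
a plain read of the slot always reflects the last finished update.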
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c
index 3d0d56087d41..300dc79fa2b9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c
@@ -199,6 +199,20 @@ void amdgpu_seq64_free(struct amdgpu_device *adev, u64 va)
         __clear_bit(bit_pos, adev->seq64.used);
 }
 
+/**
+ * amdgpu_seq64_gpu_addr - Calculate GPU addr from va
+ *
+ * @adev: amdgpu_device pointer
+ * @va: virtual address in client address space
+ *
+ * Calculate the GART address for a VA.
+ */
+u64 amdgpu_seq64_gpu_addr(struct amdgpu_device *adev, u64 va)
+{
+        return va - amdgpu_seq64_get_va_base(adev) +
+                amdgpu_bo_gpu_offset(adev->seq64.sbo);
+}
+
 /**
  * amdgpu_seq64_fini - Cleanup seq64 driver
  *
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.h
index 4203b2ab318d..63e8ac0a2057 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.h
@@ -43,6 +43,7 @@ void amdgpu_seq64_free(struct amdgpu_device *adev, u64 gpu_addr);
 int amdgpu_seq64_map(struct amdgpu_device *adev, struct amdgpu_vm *vm,
                      struct amdgpu_bo_va **bo_va);
 void amdgpu_seq64_unmap(struct amdgpu_device *adev, struct amdgpu_fpriv *fpriv);
+u64 amdgpu_seq64_gpu_addr(struct amdgpu_device *adev, u64 va);
 
 #endif
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index ed4a8c5d26d7..0960e0a665d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -111,21 +111,6 @@ struct amdgpu_prt_cb {
         struct dma_fence_cb cb;
 };
 
-/**
- * struct amdgpu_vm_tlb_seq_struct - Helper to increment the TLB flush sequence
- */
-struct amdgpu_vm_tlb_seq_struct {
-        /**
-         * @vm: pointer to the amdgpu_vm structure to set the fence sequence on
-         */
-        struct amdgpu_vm *vm;
-
-        /**
-         * @cb: callback
-         */
-        struct dma_fence_cb cb;
-};
-
 /**
  * amdgpu_vm_set_pasid - manage pasid and vm ptr mapping
  *
@@ -862,23 +847,6 @@ int amdgpu_vm_update_pdes(struct amdgpu_device *adev,
         return r;
 }
 
-/**
- * amdgpu_vm_tlb_seq_cb - make sure to increment tlb sequence
- * @fence: unused
- * @cb: the callback structure
- *
- * Increments the tlb sequence to make sure that future CS execute a VM flush.
- */
-static void amdgpu_vm_tlb_seq_cb(struct dma_fence *fence,
-                                 struct dma_fence_cb *cb)
-{
-        struct amdgpu_vm_tlb_seq_struct *tlb_cb;
-
-        tlb_cb = container_of(cb, typeof(*tlb_cb), cb);
-        atomic64_inc(&tlb_cb->vm->tlb_seq);
-        kfree(tlb_cb);
-}
-
 /**
  * amdgpu_vm_update_range - update a range in the vm page table
  *
@@ -911,7 +879,6 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
                            struct dma_fence **fence)
 {
         struct amdgpu_vm_update_params params;
-        struct amdgpu_vm_tlb_seq_struct *tlb_cb;
         struct amdgpu_res_cursor cursor;
         enum amdgpu_sync_mode sync_mode;
         int r, idx;
@@ -919,12 +886,6 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
         if (!drm_dev_enter(adev_to_drm(adev), &idx))
                 return -ENODEV;
 
-        tlb_cb = kmalloc(sizeof(*tlb_cb), GFP_KERNEL);
-        if (!tlb_cb) {
-                r = -ENOMEM;
-                goto error_unlock;
-        }
-
         /* Vega20+XGMI where PTEs get inadvertently cached in L2 texture cache,
          * heavy-weight flush TLB unconditionally.
          */
@@ -942,6 +903,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
         params.immediate = immediate;
         params.pages_addr = pages_addr;
         params.unlocked = unlocked;
+        params.needs_flush = flush_tlb;
         params.allow_override = allow_override;
 
         /* Implicitly sync to command submissions in the same VM before
@@ -955,7 +917,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
         amdgpu_vm_eviction_lock(vm);
         if (vm->evicting) {
                 r = -EBUSY;
-                goto error_free;
+                goto error_unlock;
         }
 
         if (!unlocked && !dma_fence_is_signaled(vm->last_unlocked)) {
@@ -968,7 +930,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 
         r = vm->update_funcs->prepare(&params, resv, sync_mode);
         if (r)
-                goto error_free;
+                goto error_unlock;
 
         amdgpu_res_first(pages_addr ? NULL : res, offset,
                          (last - start + 1) * AMDGPU_GPU_PAGE_SIZE, &cursor);
@@ -1018,7 +980,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
                 tmp = start + num_entries;
                 r = amdgpu_vm_ptes_update(&params, start, tmp, addr, flags);
                 if (r)
-                        goto error_free;
+                        goto error_unlock;
 
                 amdgpu_res_next(&cursor, num_entries * AMDGPU_GPU_PAGE_SIZE);
                 start = tmp;
@@ -1026,22 +988,6 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 
         r = vm->update_funcs->commit(&params, fence);
 
-        if (flush_tlb || params.table_freed) {
-                tlb_cb->vm = vm;
-                if (fence && *fence &&
-                    !dma_fence_add_callback(*fence, &tlb_cb->cb,
-                                            amdgpu_vm_tlb_seq_cb)) {
-                        dma_fence_put(vm->last_tlb_flush);
-                        vm->last_tlb_flush = dma_fence_get(*fence);
-                } else {
-                        amdgpu_vm_tlb_seq_cb(NULL, &tlb_cb->cb);
-                }
-                tlb_cb = NULL;
-        }
-
-error_free:
-        kfree(tlb_cb);
-
 error_unlock:
         amdgpu_vm_eviction_unlock(vm);
         drm_dev_exit(idx);
@@ -2260,10 +2206,14 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
         INIT_WORK(&vm->pt_free_work, amdgpu_vm_pt_free_work);
         INIT_KFIFO(vm->faults);
 
-        r = amdgpu_vm_init_entities(adev, vm);
+        r = amdgpu_seq64_alloc(adev, &vm->tlb_seq_va, &vm->tlb_seq_cpu_addr);
         if (r)
                 return r;
 
+        r = amdgpu_vm_init_entities(adev, vm);
+        if (r)
+                goto error_free_seq;
+
         vm->pte_support_ats = false;
         vm->is_compute_context = false;
@@ -2283,7 +2233,6 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
         vm->last_update = dma_fence_get_stub();
         vm->last_unlocked = dma_fence_get_stub();
-        vm->last_tlb_flush = dma_fence_get_stub();
         vm->generation = 0;
 
         mutex_init(&vm->eviction_lock);
@@ -2322,10 +2271,11 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
         amdgpu_bo_unref(&root_bo);
 
 error_free_delayed:
-        dma_fence_put(vm->last_tlb_flush);
         dma_fence_put(vm->last_unlocked);
         amdgpu_vm_fini_entities(vm);
 
+error_free_seq:
+        amdgpu_seq64_free(adev, vm->tlb_seq_va);
         return r;
 }
@@ -2441,7 +2391,6 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
         struct amdgpu_bo_va_mapping *mapping, *tmp;
         bool prt_fini_needed = !!adev->gmc.gmc_funcs->set_prt;
         struct amdgpu_bo *root;
-        unsigned long flags;
         int i;
 
         amdgpu_amdkfd_gpuvm_destroy_cb(adev, vm);
@@ -2453,11 +2402,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
         amdgpu_vm_set_pasid(adev, vm, 0);
         dma_fence_wait(vm->last_unlocked, false);
         dma_fence_put(vm->last_unlocked);
-        dma_fence_wait(vm->last_tlb_flush, false);
-        /* Make sure that all fence callbacks have completed */
-        spin_lock_irqsave(vm->last_tlb_flush->lock, flags);
-        spin_unlock_irqrestore(vm->last_tlb_flush->lock, flags);
-        dma_fence_put(vm->last_tlb_flush);
+        amdgpu_seq64_free(adev, vm->tlb_seq_va);
 
         list_for_each_entry_safe(mapping, tmp, &vm->freed, list) {
                 if (mapping->flags & AMDGPU_PTE_PRT && prt_fini_needed) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 666698a57192..ac9380afcb69 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -247,9 +247,9 @@ struct amdgpu_vm_update_params {
         unsigned int num_dw_left;
 
         /**
-         * @table_freed: return true if page table is freed when updating
+         * @needs_flush: true whenever we need to invalidate the TLB
          */
-        bool table_freed;
+        bool needs_flush;
 
         /**
          * @allow_override: true for memory that is not uncached: allows MTYPE
@@ -328,9 +328,11 @@ struct amdgpu_vm {
         struct drm_sched_entity immediate;
         struct drm_sched_entity delayed;
 
-        /* Last finished delayed update */
+        /* Sequence number indicating necessary TLB flush */
         atomic64_t              tlb_seq;
-        struct dma_fence        *last_tlb_flush;
+        uint64_t                tlb_seq_va;
+        uint64_t                *tlb_seq_cpu_addr;
+
         atomic64_t              kfd_last_flushed_seq;
 
         /* How many times we had to re-generate the page tables */
@@ -549,22 +551,7 @@ int amdgpu_vm_pt_map_tables(struct amdgpu_device *adev, struct amdgpu_vm *vm);
  */
 static inline uint64_t amdgpu_vm_tlb_seq(struct amdgpu_vm *vm)
 {
-        unsigned long flags;
-        spinlock_t *lock;
-
-        /*
-         * Workaround to stop racing between the fence signaling and handling
-         * the cb. The lock is static after initially setting it up, just make
-         * sure that the dma_fence structure isn't freed up.
-         */
-        rcu_read_lock();
-        lock = vm->last_tlb_flush->lock;
-        rcu_read_unlock();
-
-        spin_lock_irqsave(lock, flags);
-        spin_unlock_irqrestore(lock, flags);
-
-        return atomic64_read(&vm->tlb_seq);
+        return READ_ONCE(*vm->tlb_seq_cpu_addr);
 }
 
 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
index 6e31621452de..0f8fd0acef7b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
@@ -108,7 +108,8 @@ static int amdgpu_vm_cpu_update(struct amdgpu_vm_update_params *p,
 static int amdgpu_vm_cpu_commit(struct amdgpu_vm_update_params *p,
                                 struct dma_fence **fence)
 {
-        /* Flush HDP */
+        if (p->needs_flush)
+                *p->vm->tlb_seq_cpu_addr = atomic64_inc_return(&p->vm->tlb_seq);
         mb();
         amdgpu_device_flush_hdp(p->adev, NULL);
         return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
index a160265ddc07..95dc0afdaffb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
@@ -1056,7 +1056,7 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params,
                 while (cursor.pfn < frag_start) {
                         /* Make sure previous mapping is freed */
                         if (cursor.entry->bo) {
-                                params->table_freed = true;
+                                params->needs_flush = true;
                                 amdgpu_vm_pt_free_dfs(adev, params->vm,
                                                       &cursor,
                                                       params->unlocked);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
index 349416e176a1..91cc3121fd4b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
@@ -126,6 +126,11 @@ static int amdgpu_vm_sdma_commit(struct amdgpu_vm_update_params *p,
 
         WARN_ON(ib->length_dw == 0);
         amdgpu_ring_pad_ib(ring, ib);
+        if (p->needs_flush) {
+                p->job->uf_addr = amdgpu_seq64_gpu_addr(p->adev,
+                                                        p->vm->tlb_seq_va);
+                p->job->uf_sequence = atomic64_inc_return(&p->vm->tlb_seq);
+        }
         WARN_ON(ib->length_dw > p->num_dw_left);
         f = amdgpu_job_submit(p->job);
-- 
2.43.2
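A closing note for reviewers on the new helper: amdgpu_seq64_gpu_addr()
is plain offset arithmetic. The seq64 region is mapped at a fixed base
VA in every client VM, so rebasing a slot's VA onto the GART offset of
the backing BO yields the address the SDMA user fence writes to. Here is
a tiny stand-alone sketch of that calculation; the toy_* names and all
constants are made up for illustration, not taken from the driver:

#include <assert.h>
#include <stdint.h>

/* made-up values, only to exercise the arithmetic */
#define TOY_SEQ64_VA_BASE 0x100000000000ULL /* base VA of the seq64 region */
#define TOY_SEQ64_BO_GPU  0x000000800000ULL /* GART offset of the backing BO */

/* same shape as amdgpu_seq64_gpu_addr(): rebase a region VA onto the BO */
static uint64_t toy_seq64_gpu_addr(uint64_t va)
{
        return va - TOY_SEQ64_VA_BASE + TOY_SEQ64_BO_GPU;
}

int main(void)
{
        /* a slot 0x40 bytes into the region sits 0x40 bytes into the BO */
        assert(toy_seq64_gpu_addr(TOY_SEQ64_VA_BASE + 0x40) ==
               TOY_SEQ64_BO_GPU + 0x40);
        return 0;
}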