[AMD Official Use Only - General]
Please feel free to use:
Reviewed-by: Shashank Sharma <shashank.sharma@xxxxxxx>
Regards
Shashank
From: Christian König <ckoenig.leichtzumerken@xxxxxxxxx>
Sent: Monday, February 26, 2024 3:45 PM To: Sharma, Shashank <Shashank.Sharma@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx <amd-gfx@xxxxxxxxxxxxxxxxxxxxx> Cc: Koenig, Christian <Christian.Koenig@xxxxxxx>; Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Kuehling, Felix <Felix.Kuehling@xxxxxxx>; Bhardwaj, Rajneesh <Rajneesh.Bhardwaj@xxxxxxx> Subject: Re: [PATCH v3 1/3] drm/amdgpu: replace TLB seq callback with HW seq Am 23.02.24 um 14:42 schrieb Shashank Sharma:
> From: Christian König <christian.koenig@xxxxxxx> > > The callback we installed for the SDMA update were actually pretty > horrible. since we now have seq64 use that one and HW seq writes > instead. > > V2:(Shashank) > - rebased on amd-drm-staging-next > - changed amdgpu_seq64_gpu_addr > > Cc: Christian König <christian.koenig@xxxxxxx> > Cc: Alex Deucher <alexander.deucher@xxxxxxx> > Cc: Felix Kuehling <felix.kuehling@xxxxxxx> > Cc: Rajneesh Bhardwaj <rajneesh.bhardwaj@xxxxxxx> > Signed-off-by: Christian König <christian.koenig@xxxxxxx> Shashank can I get an rb on this patch here so that I can push it to amd-staging-drm-next? The patch was basically just to double check if the seq64 code works as intended. Thanks, Christian. > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c | 14 ++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.h | 1 + > drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 79 ++++----------------- > drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 27 ++----- > drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c | 3 +- > drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 2 +- > drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c | 5 ++ > 7 files changed, 42 insertions(+), 89 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c > index 3d0d56087d41..300dc79fa2b9 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c > @@ -199,6 +199,20 @@ void amdgpu_seq64_free(struct amdgpu_device *adev, u64 va) > __clear_bit(bit_pos, adev->seq64.used); > } > > +/** > + * amdgpu_seq64_gpu_addr - Calculate GPU addr from va > + * > + * @adev: amdgpu_device pointer > + * @va: virtual address in client address space > + * > + * Calculate the GART address for a VA. > + */ > +u64 amdgpu_seq64_gpu_addr(struct amdgpu_device *adev, u64 va) > +{ > + return va - amdgpu_seq64_get_va_base(adev) + > + amdgpu_bo_gpu_offset(adev->seq64.sbo); > +} > + > /** > * amdgpu_seq64_fini - Cleanup seq64 driver > * > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.h > index 4203b2ab318d..63e8ac0a2057 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.h > @@ -43,6 +43,7 @@ void amdgpu_seq64_free(struct amdgpu_device *adev, u64 gpu_addr); > int amdgpu_seq64_map(struct amdgpu_device *adev, struct amdgpu_vm *vm, > struct amdgpu_bo_va **bo_va); > void amdgpu_seq64_unmap(struct amdgpu_device *adev, struct amdgpu_fpriv *fpriv); > +u64 amdgpu_seq64_gpu_addr(struct amdgpu_device *adev, u64 va); > > #endif > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > index ed4a8c5d26d7..0960e0a665d3 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > @@ -111,21 +111,6 @@ struct amdgpu_prt_cb { > struct dma_fence_cb cb; > }; > > -/** > - * struct amdgpu_vm_tlb_seq_struct - Helper to increment the TLB flush sequence > - */ > -struct amdgpu_vm_tlb_seq_struct { > - /** > - * @vm: pointer to the amdgpu_vm structure to set the fence sequence on > - */ > - struct amdgpu_vm *vm; > - > - /** > - * @cb: callback > - */ > - struct dma_fence_cb cb; > -}; > - > /** > * amdgpu_vm_set_pasid - manage pasid and vm ptr mapping > * > @@ -862,23 +847,6 @@ int amdgpu_vm_update_pdes(struct amdgpu_device *adev, > return r; > } > > -/** > - * amdgpu_vm_tlb_seq_cb - make sure to increment tlb sequence > - * @fence: unused > - * @cb: the callback structure > - * > - * Increments the tlb sequence to make sure that future CS execute a VM flush. > - */ > -static void amdgpu_vm_tlb_seq_cb(struct dma_fence *fence, > - struct dma_fence_cb *cb) > -{ > - struct amdgpu_vm_tlb_seq_struct *tlb_cb; > - > - tlb_cb = container_of(cb, typeof(*tlb_cb), cb); > - atomic64_inc(&tlb_cb->vm->tlb_seq); > - kfree(tlb_cb); > -} > - > /** > * amdgpu_vm_update_range - update a range in the vm page table > * > @@ -911,7 +879,6 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm, > struct dma_fence **fence) > { > struct amdgpu_vm_update_params params; > - struct amdgpu_vm_tlb_seq_struct *tlb_cb; > struct amdgpu_res_cursor cursor; > enum amdgpu_sync_mode sync_mode; > int r, idx; > @@ -919,12 +886,6 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm, > if (!drm_dev_enter(adev_to_drm(adev), &idx)) > return -ENODEV; > > - tlb_cb = kmalloc(sizeof(*tlb_cb), GFP_KERNEL); > - if (!tlb_cb) { > - r = -ENOMEM; > - goto error_unlock; > - } > - > /* Vega20+XGMI where PTEs get inadvertently cached in L2 texture cache, > * heavy-weight flush TLB unconditionally. > */ > @@ -942,6 +903,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm, > params.immediate = immediate; > params.pages_addr = pages_addr; > params.unlocked = unlocked; > + params.needs_flush = flush_tlb; > params.allow_override = allow_override; > > /* Implicitly sync to command submissions in the same VM before > @@ -955,7 +917,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm, > amdgpu_vm_eviction_lock(vm); > if (vm->evicting) { > r = -EBUSY; > - goto error_free; > + goto error_unlock; > } > > if (!unlocked && !dma_fence_is_signaled(vm->last_unlocked)) { > @@ -968,7 +930,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm, > > r = vm->update_funcs->prepare(¶ms, resv, sync_mode); > if (r) > - goto error_free; > + goto error_unlock; > > amdgpu_res_first(pages_addr ? NULL : res, offset, > (last - start + 1) * AMDGPU_GPU_PAGE_SIZE, &cursor); > @@ -1018,7 +980,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm, > tmp = start + num_entries; > r = amdgpu_vm_ptes_update(¶ms, start, tmp, addr, flags); > if (r) > - goto error_free; > + goto error_unlock; > > amdgpu_res_next(&cursor, num_entries * AMDGPU_GPU_PAGE_SIZE); > start = tmp; > @@ -1026,22 +988,6 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm, > > r = vm->update_funcs->commit(¶ms, fence); > > - if (flush_tlb || params.table_freed) { > - tlb_cb->vm = vm; > - if (fence && *fence && > - !dma_fence_add_callback(*fence, &tlb_cb->cb, > - amdgpu_vm_tlb_seq_cb)) { > - dma_fence_put(vm->last_tlb_flush); > - vm->last_tlb_flush = dma_fence_get(*fence); > - } else { > - amdgpu_vm_tlb_seq_cb(NULL, &tlb_cb->cb); > - } > - tlb_cb = NULL; > - } > - > -error_free: > - kfree(tlb_cb); > - > error_unlock: > amdgpu_vm_eviction_unlock(vm); > drm_dev_exit(idx); > @@ -2260,10 +2206,14 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, > INIT_WORK(&vm->pt_free_work, amdgpu_vm_pt_free_work); > INIT_KFIFO(vm->faults); > > - r = amdgpu_vm_init_entities(adev, vm); > + r = amdgpu_seq64_alloc(adev, &vm->tlb_seq_va, &vm->tlb_seq_cpu_addr); > if (r) > return r; > > + r = amdgpu_vm_init_entities(adev, vm); > + if (r) > + goto error_free_seq; > + > vm->pte_support_ats = false; > vm->is_compute_context = false; > > @@ -2283,7 +2233,6 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, > > vm->last_update = dma_fence_get_stub(); > vm->last_unlocked = dma_fence_get_stub(); > - vm->last_tlb_flush = dma_fence_get_stub(); > vm->generation = 0; > > mutex_init(&vm->eviction_lock); > @@ -2322,10 +2271,11 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, > amdgpu_bo_unref(&root_bo); > > error_free_delayed: > - dma_fence_put(vm->last_tlb_flush); > dma_fence_put(vm->last_unlocked); > amdgpu_vm_fini_entities(vm); > > +error_free_seq: > + amdgpu_seq64_free(adev, vm->tlb_seq_va); > return r; > } > > @@ -2441,7 +2391,6 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm) > struct amdgpu_bo_va_mapping *mapping, *tmp; > bool prt_fini_needed = !!adev->gmc.gmc_funcs->set_prt; > struct amdgpu_bo *root; > - unsigned long flags; > int i; > > amdgpu_amdkfd_gpuvm_destroy_cb(adev, vm); > @@ -2453,11 +2402,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm) > amdgpu_vm_set_pasid(adev, vm, 0); > dma_fence_wait(vm->last_unlocked, false); > dma_fence_put(vm->last_unlocked); > - dma_fence_wait(vm->last_tlb_flush, false); > - /* Make sure that all fence callbacks have completed */ > - spin_lock_irqsave(vm->last_tlb_flush->lock, flags); > - spin_unlock_irqrestore(vm->last_tlb_flush->lock, flags); > - dma_fence_put(vm->last_tlb_flush); > + amdgpu_seq64_free(adev, vm->tlb_seq_va); > > list_for_each_entry_safe(mapping, tmp, &vm->freed, list) { > if (mapping->flags & AMDGPU_PTE_PRT && prt_fini_needed) { > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > index 666698a57192..ac9380afcb69 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > @@ -247,9 +247,9 @@ struct amdgpu_vm_update_params { > unsigned int num_dw_left; > > /** > - * @table_freed: return true if page table is freed when updating > + * @needs_flush: true whenever we need to invalidate the TLB > */ > - bool table_freed; > + bool needs_flush; > > /** > * @allow_override: true for memory that is not uncached: allows MTYPE > @@ -328,9 +328,11 @@ struct amdgpu_vm { > struct drm_sched_entity immediate; > struct drm_sched_entity delayed; > > - /* Last finished delayed update */ > + /* Sequence number indicating necessary TLB flush */ > atomic64_t tlb_seq; > - struct dma_fence *last_tlb_flush; > + uint64_t tlb_seq_va; > + uint64_t *tlb_seq_cpu_addr; > + > atomic64_t kfd_last_flushed_seq; > > /* How many times we had to re-generate the page tables */ > @@ -549,22 +551,7 @@ int amdgpu_vm_pt_map_tables(struct amdgpu_device *adev, struct amdgpu_vm *vm); > */ > static inline uint64_t amdgpu_vm_tlb_seq(struct amdgpu_vm *vm) > { > - unsigned long flags; > - spinlock_t *lock; > - > - /* > - * Workaround to stop racing between the fence signaling and handling > - * the cb. The lock is static after initially setting it up, just make > - * sure that the dma_fence structure isn't freed up. > - */ > - rcu_read_lock(); > - lock = vm->last_tlb_flush->lock; > - rcu_read_unlock(); > - > - spin_lock_irqsave(lock, flags); > - spin_unlock_irqrestore(lock, flags); > - > - return atomic64_read(&vm->tlb_seq); > + return READ_ONCE(*vm->tlb_seq_cpu_addr); > } > > /* > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c > index 6e31621452de..0f8fd0acef7b 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c > @@ -108,7 +108,8 @@ static int amdgpu_vm_cpu_update(struct amdgpu_vm_update_params *p, > static int amdgpu_vm_cpu_commit(struct amdgpu_vm_update_params *p, > struct dma_fence **fence) > { > - /* Flush HDP */ > + if (p->needs_flush) > + *p->vm->tlb_seq_cpu_addr = atomic64_inc_return(&p->vm->tlb_seq); > mb(); > amdgpu_device_flush_hdp(p->adev, NULL); > return 0; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c > index a160265ddc07..95dc0afdaffb 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c > @@ -1056,7 +1056,7 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params, > while (cursor.pfn < frag_start) { > /* Make sure previous mapping is freed */ > if (cursor.entry->bo) { > - params->table_freed = true; > + params->needs_flush = true; > amdgpu_vm_pt_free_dfs(adev, params->vm, > &cursor, > params->unlocked); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c > index 349416e176a1..91cc3121fd4b 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c > @@ -126,6 +126,11 @@ static int amdgpu_vm_sdma_commit(struct amdgpu_vm_update_params *p, > > WARN_ON(ib->length_dw == 0); > amdgpu_ring_pad_ib(ring, ib); > + if (p->needs_flush) { > + p->job->uf_addr = amdgpu_seq64_gpu_addr(p->adev, > + p->vm->tlb_seq_va); > + p->job->uf_sequence = atomic64_inc_return(&p->vm->tlb_seq); > + } > WARN_ON(ib->length_dw > p->num_dw_left); > f = amdgpu_job_submit(p->job); > |