This patch:
- Attaches the TLB flush fence to the PT objects being freed
- Adds a new ptr in VM to save this last TLB flush fence
- Adds a new lock in VM to prevent out-of-context update of saved
  TLB flush fence
- Adds a new ptr in tlb_flush structure to save vm

The idea is to delay freeing of page table objects until we have the
respective TLB entries flushed.

V2: rebase

Cc: Christian König <Christian.Koenig@xxxxxxx>
Cc: Alex Deucher <alexander.deucher@xxxxxxx>
Cc: Felix Kuehling <felix.kuehling@xxxxxxx>
Cc: Rajneesh Bhardwaj <rajneesh.bhardwaj@xxxxxxx>
Signed-off-by: Shashank Sharma <shashank.sharma@xxxxxxx>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c        |  3 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h        |  4 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c     | 27 +++++++++++++++++++
 .../gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c  | 13 +++++++--
 4 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 67c690044b97..b0e81c249e3a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2245,6 +2245,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 	vm->generation = 0;
 
 	mutex_init(&vm->eviction_lock);
+	mutex_init(&vm->tlb_flush_lock);
 	vm->evicting = false;
 	vm->tlb_fence_context = dma_fence_context_alloc(1);
 
@@ -2360,7 +2361,9 @@ int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm)
 	}
 
 	dma_fence_put(vm->last_update);
+	dma_fence_put(vm->tlb_fence_last);
 	vm->last_update = dma_fence_get_stub();
+	vm->tlb_fence_last = dma_fence_get_stub();
 	vm->is_compute_context = true;
 
 	/* Free the shadow bo for compute VM */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 8e6fd25d07b7..b05bc586237f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -334,6 +334,10 @@ struct amdgpu_vm {
 	uint64_t		*tlb_seq_cpu_addr;
 	uint64_t		tlb_fence_context;
 
+	/* Ptr and lock to maintain tlb flush sync */
+	struct mutex		tlb_flush_lock;
+	struct dma_fence	*tlb_fence_last;
+
 	atomic64_t		kfd_last_flushed_seq;
 
 	/* How many times we had to re-generate the page tables */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
index 95dc0afdaffb..f1c4418c4d63 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
@@ -631,6 +631,18 @@ static int amdgpu_vm_pt_alloc(struct amdgpu_device *adev,
 	return r;
 }
 
+static inline
+void amdgpu_vm_attach_tlb_fence(struct amdgpu_bo *bo, struct dma_fence *fence)
+{
+	if (!bo || !fence)
+		return;
+
+	if (!dma_resv_reserve_fences(bo->tbo.base.resv, 1)) {
+		dma_resv_add_fence(bo->tbo.base.resv, fence,
+				   DMA_RESV_USAGE_BOOKKEEP);
+	}
+}
+
 /**
  * amdgpu_vm_pt_free - free one PD/PT
  *
@@ -638,6 +650,7 @@ static int amdgpu_vm_pt_alloc(struct amdgpu_device *adev,
  */
 static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry)
 {
+	struct amdgpu_vm *vm;
 	struct amdgpu_bo *shadow;
 
 	if (!entry->bo)
@@ -646,9 +659,23 @@ static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry)
 	entry->bo->vm_bo = NULL;
 	shadow = amdgpu_bo_shadowed(entry->bo);
 	if (shadow) {
+		vm = shadow->vm_bo->vm;
+
+		mutex_lock(&vm->tlb_flush_lock);
+		if (vm->tlb_fence_last)
+			amdgpu_vm_attach_tlb_fence(shadow, vm->tlb_fence_last);
+		mutex_unlock(&vm->tlb_flush_lock);
+
 		ttm_bo_set_bulk_move(&shadow->tbo, NULL);
 		amdgpu_bo_unref(&shadow);
 	}
+
+	vm = entry->vm;
+	mutex_lock(&vm->tlb_flush_lock);
+	if (vm->tlb_fence_last)
+		amdgpu_vm_attach_tlb_fence(entry->bo, vm->tlb_fence_last);
+	mutex_unlock(&vm->tlb_flush_lock);
+
 	ttm_bo_set_bulk_move(&entry->bo->tbo, NULL);
 
 	spin_lock(&entry->vm->status_lock);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
index 569681badd7c..54ec81d30034 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
@@ -31,6 +31,7 @@
 struct amdgpu_tlb_fence {
 	struct dma_fence	base;
 	struct amdgpu_device	*adev;
+	struct amdgpu_vm	*vm;
 	struct dma_fence	*dependency;
 	struct work_struct	work;
 	spinlock_t		lock;
@@ -51,6 +52,7 @@ static const char *amdgpu_tlb_fence_get_timeline_name(struct dma_fence *f)
 static void amdgpu_tlb_fence_work(struct work_struct *work)
 {
 	struct amdgpu_tlb_fence *f = container_of(work, typeof(*f), work);
+	struct amdgpu_vm *vm = f->vm;
 	int r;
 
 	r = amdgpu_gmc_flush_gpu_tlb_pasid(f->adev, f->pasid, 2, true, 0);
@@ -62,6 +64,10 @@ static void amdgpu_tlb_fence_work(struct work_struct *work)
 
 	dma_fence_signal(&f->base);
 	dma_fence_put(&f->base);
+
+	mutex_lock(&vm->tlb_flush_lock);
+	vm->tlb_fence_last = NULL;
+	mutex_unlock(&vm->tlb_flush_lock);
 }
 
 static const struct dma_fence_ops amdgpu_tlb_fence_ops = {
@@ -92,6 +98,7 @@ void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev, struct amdgpu_vm *vm
 	f->adev = adev;
 	f->dependency = *fence;
 	f->pasid = vm->pasid;
+	f->vm = vm;
 	INIT_WORK(&f->work, amdgpu_tlb_fence_work);
 	spin_lock_init(&f->lock);
 
@@ -99,8 +106,10 @@ void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev, struct amdgpu_vm *vm
 		       vm->tlb_fence_context, atomic64_read(&vm->tlb_seq));
 
 	/* TODO: We probably need a separate wq here */
-	dma_fence_get(&f->base);
-	schedule_work(&f->work);
+	mutex_lock(&vm->tlb_flush_lock);
+	vm->tlb_fence_last = dma_fence_get(&f->base);
+	mutex_unlock(&vm->tlb_flush_lock);
+	schedule_work(&f->work);
 
 	*fence = &f->base;
 }
-- 
2.43.0
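
A note on the core idiom, for readers skimming the diff: the patch keeps a
freed page table's backing memory alive by attaching the VM's last TLB flush
fence to the BO's reservation object with BOOKKEEP usage, so TTM's deferred
release waits for the flush to signal without stalling new GPU submissions.
A minimal sketch of that idiom follows; the helper name is hypothetical,
while dma_resv_reserve_fences() and dma_resv_add_fence() are the real calls
used by amdgpu_vm_attach_tlb_fence() above:

	#include <linux/dma-resv.h>
	#include <linux/dma-fence.h>

	/* Sketch: delay final release of a BO until @tlb_fence signals. */
	static void sketch_delay_free_until_flush(struct dma_resv *resv,
						  struct dma_fence *tlb_fence)
	{
		/* dma_resv_reserve_fences() returns 0 on success; on failure
		 * the BO is simply freed right away, as in the patch. */
		if (dma_resv_reserve_fences(resv, 1))
			return;

		/* A BOOKKEEP fence does not gate new GPU work; it only keeps
		 * the buffer alive until the TLB flush has completed. */
		dma_resv_add_fence(resv, tlb_fence, DMA_RESV_USAGE_BOOKKEEP);
	}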