On Thu, Mar 21, 2024 at 10:37 AM Christian König <ckoenig.leichtzumerken@xxxxxxxxx> wrote: > > Am 21.03.24 um 15:12 schrieb Tvrtko Ursulin: > > > > On 21/03/2024 12:43, Christian König wrote: > >> This reverts drm/amdgpu: fix ftrace event amdgpu_bo_move always move > >> on same heap. The basic problem here is that after the move the old > >> location is simply not available any more. > >> > >> Some fixes where suggested, but essentially we should call the move > >> notification before actually moving things because only this way we have > >> the correct order for DMA-buf and VM move notifications as well. > >> > >> Also rework the statistic handling so that we don't update the eviction > >> counter before the move. > >> > >> Signed-off-by: Christian König <christian.koenig@xxxxxxx> > > > > Don't forget: > > > > Fixes: 94aeb4117343 ("drm/amdgpu: fix ftrace event amdgpu_bo_move > > always move on same heap") > > Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3171 > > Ah, thanks. I already wanted to ask if there is any bug report about > that as well. Did this ever land? I don't see it anywhere. Alex > > Regards, > Christian. > > > > > ;) > > > > Regards, > > > > Tvrtko > > > >> --- > >> drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 15 +++---- > >> drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 4 +- > >> drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 48 ++++++++++++---------- > >> 3 files changed, 37 insertions(+), 30 deletions(-) > >> > >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c > >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c > >> index 425cebcc5cbf..eb7d824763b9 100644 > >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c > >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c > >> @@ -1245,19 +1245,20 @@ int amdgpu_bo_get_metadata(struct amdgpu_bo > >> *bo, void *buffer, > >> * amdgpu_bo_move_notify - notification about a memory move > >> * @bo: pointer to a buffer object > >> * @evict: if this move is evicting the buffer from the graphics > >> address space > >> + * @new_mem: new resource for backing the BO > >> * > >> * Marks the corresponding &amdgpu_bo buffer object as invalid, > >> also performs > >> * bookkeeping. > >> * TTM driver callback which is called when ttm moves a buffer. > >> */ > >> -void amdgpu_bo_move_notify(struct ttm_buffer_object *bo, bool evict) > >> +void amdgpu_bo_move_notify(struct ttm_buffer_object *bo, > >> + bool evict, > >> + struct ttm_resource *new_mem) > >> { > >> struct amdgpu_device *adev = amdgpu_ttm_adev(bo->bdev); > >> + struct ttm_resource *old_mem = bo->resource; > >> struct amdgpu_bo *abo; > >> - if (!amdgpu_bo_is_amdgpu_bo(bo)) > >> - return; > >> - > >> abo = ttm_to_amdgpu_bo(bo); > >> amdgpu_vm_bo_invalidate(adev, abo, evict); > >> @@ -1267,9 +1268,9 @@ void amdgpu_bo_move_notify(struct > >> ttm_buffer_object *bo, bool evict) > >> bo->resource->mem_type != TTM_PL_SYSTEM) > >> dma_buf_move_notify(abo->tbo.base.dma_buf); > >> - /* remember the eviction */ > >> - if (evict) > >> - atomic64_inc(&adev->num_evictions); > >> + /* move_notify is called before move happens */ > >> + trace_amdgpu_bo_move(abo, new_mem ? new_mem->mem_type : -1, > >> + old_mem ? old_mem->mem_type : -1); > >> } > >> void amdgpu_bo_get_memory(struct amdgpu_bo *bo, > >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h > >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h > >> index a3ea8a82db23..d28e21baef16 100644 > >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h > >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h > >> @@ -344,7 +344,9 @@ int amdgpu_bo_set_metadata (struct amdgpu_bo *bo, > >> void *metadata, > >> int amdgpu_bo_get_metadata(struct amdgpu_bo *bo, void *buffer, > >> size_t buffer_size, uint32_t *metadata_size, > >> uint64_t *flags); > >> -void amdgpu_bo_move_notify(struct ttm_buffer_object *bo, bool evict); > >> +void amdgpu_bo_move_notify(struct ttm_buffer_object *bo, > >> + bool evict, > >> + struct ttm_resource *new_mem); > >> void amdgpu_bo_release_notify(struct ttm_buffer_object *bo); > >> vm_fault_t amdgpu_bo_fault_reserve_notify(struct ttm_buffer_object > >> *bo); > >> void amdgpu_bo_fence(struct amdgpu_bo *bo, struct dma_fence *fence, > >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c > >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c > >> index a5ceec7820cf..460b23918bfc 100644 > >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c > >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c > >> @@ -471,14 +471,16 @@ static int amdgpu_bo_move(struct > >> ttm_buffer_object *bo, bool evict, > >> if (!old_mem || (old_mem->mem_type == TTM_PL_SYSTEM && > >> bo->ttm == NULL)) { > >> + amdgpu_bo_move_notify(bo, evict, new_mem); > >> ttm_bo_move_null(bo, new_mem); > >> - goto out; > >> + return 0; > >> } > >> if (old_mem->mem_type == TTM_PL_SYSTEM && > >> (new_mem->mem_type == TTM_PL_TT || > >> new_mem->mem_type == AMDGPU_PL_PREEMPT)) { > >> + amdgpu_bo_move_notify(bo, evict, new_mem); > >> ttm_bo_move_null(bo, new_mem); > >> - goto out; > >> + return 0; > >> } > >> if ((old_mem->mem_type == TTM_PL_TT || > >> old_mem->mem_type == AMDGPU_PL_PREEMPT) && > >> @@ -488,9 +490,10 @@ static int amdgpu_bo_move(struct > >> ttm_buffer_object *bo, bool evict, > >> return r; > >> amdgpu_ttm_backend_unbind(bo->bdev, bo->ttm); > >> + amdgpu_bo_move_notify(bo, evict, new_mem); > >> ttm_resource_free(bo, &bo->resource); > >> ttm_bo_assign_mem(bo, new_mem); > >> - goto out; > >> + return 0; > >> } > >> if (old_mem->mem_type == AMDGPU_PL_GDS || > >> @@ -502,8 +505,9 @@ static int amdgpu_bo_move(struct > >> ttm_buffer_object *bo, bool evict, > >> new_mem->mem_type == AMDGPU_PL_OA || > >> new_mem->mem_type == AMDGPU_PL_DOORBELL) { > >> /* Nothing to save here */ > >> + amdgpu_bo_move_notify(bo, evict, new_mem); > >> ttm_bo_move_null(bo, new_mem); > >> - goto out; > >> + return 0; > >> } > >> if (bo->type == ttm_bo_type_device && > >> @@ -515,22 +519,23 @@ static int amdgpu_bo_move(struct > >> ttm_buffer_object *bo, bool evict, > >> abo->flags &= ~AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; > >> } > >> - if (adev->mman.buffer_funcs_enabled) { > >> - if (((old_mem->mem_type == TTM_PL_SYSTEM && > >> - new_mem->mem_type == TTM_PL_VRAM) || > >> - (old_mem->mem_type == TTM_PL_VRAM && > >> - new_mem->mem_type == TTM_PL_SYSTEM))) { > >> - hop->fpfn = 0; > >> - hop->lpfn = 0; > >> - hop->mem_type = TTM_PL_TT; > >> - hop->flags = TTM_PL_FLAG_TEMPORARY; > >> - return -EMULTIHOP; > >> - } > >> + if (adev->mman.buffer_funcs_enabled && > >> + ((old_mem->mem_type == TTM_PL_SYSTEM && > >> + new_mem->mem_type == TTM_PL_VRAM) || > >> + (old_mem->mem_type == TTM_PL_VRAM && > >> + new_mem->mem_type == TTM_PL_SYSTEM))) { > >> + hop->fpfn = 0; > >> + hop->lpfn = 0; > >> + hop->mem_type = TTM_PL_TT; > >> + hop->flags = TTM_PL_FLAG_TEMPORARY; > >> + return -EMULTIHOP; > >> + } > >> + amdgpu_bo_move_notify(bo, evict, new_mem); > >> + if (adev->mman.buffer_funcs_enabled) > >> r = amdgpu_move_blit(bo, evict, new_mem, old_mem); > >> - } else { > >> + else > >> r = -ENODEV; > >> - } > >> if (r) { > >> /* Check that all memory is CPU accessible */ > >> @@ -545,11 +550,10 @@ static int amdgpu_bo_move(struct > >> ttm_buffer_object *bo, bool evict, > >> return r; > >> } > >> - trace_amdgpu_bo_move(abo, new_mem->mem_type, old_mem->mem_type); > >> -out: > >> - /* update statistics */ > >> + /* update statistics after the move */ > >> + if (evict) > >> + atomic64_inc(&adev->num_evictions); > >> atomic64_add(bo->base.size, &adev->num_bytes_moved); > >> - amdgpu_bo_move_notify(bo, evict); > >> return 0; > >> } > >> @@ -1551,7 +1555,7 @@ static int amdgpu_ttm_access_memory(struct > >> ttm_buffer_object *bo, > >> static void > >> amdgpu_bo_delete_mem_notify(struct ttm_buffer_object *bo) > >> { > >> - amdgpu_bo_move_notify(bo, false); > >> + amdgpu_bo_move_notify(bo, false, NULL); > >> } > >> static struct ttm_device_funcs amdgpu_bo_driver = { >