From: Tvrtko Ursulin <tursulin@xxxxxxxxxxx> Pipelined object migration will free up the old bo->resource, meaning the tracepoint added in 94aeb4117343 ("drm/amdgpu: fix ftrace event amdgpu_bo_move always move on same heap") will trigger an use after free when it dereferences the cached old_mem. Fix it by caching the memory type locally, which is the only thing tracepoint wants to know about. While at it convert the whole function to use the cached memory types for consistency. v2: * Fix compilation. Signed-off-by: Tvrtko Ursulin <tursulin@xxxxxxxxxxx> Fixes: 94aeb4117343 ("drm/amdgpu: fix ftrace event amdgpu_bo_move always move on same heap") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3171 Cc: Beyond Wang <Wang.Beyond@xxxxxxx> Cc: Christian König <christian.koenig@xxxxxxx> Cc: Alex Deucher <alexander.deucher@xxxxxxx> --- Beware this is a speculative fix for now based only on source code analysis and backtraces from 3171. It is also a bit on the churny side so I am happy to minimize it. But most importantly, given how I don't have any experience in amdgpu, I am looking for domain experts to either confirm or disprove my analysis. --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 47 ++++++++++++------------- 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 8722beba494e..81189aab5a04 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -452,10 +452,11 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict, struct amdgpu_device *adev; struct amdgpu_bo *abo; struct ttm_resource *old_mem = bo->resource; + uint32_t new_mem_type = new_mem->mem_type; + uint32_t old_mem_type; int r; - if (new_mem->mem_type == TTM_PL_TT || - new_mem->mem_type == AMDGPU_PL_PREEMPT) { + if (new_mem_type == TTM_PL_TT || new_mem_type == AMDGPU_PL_PREEMPT) { r = amdgpu_ttm_backend_bind(bo->bdev, bo->ttm, new_mem); if (r) return r; @@ -464,20 +465,18 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict, abo = ttm_to_amdgpu_bo(bo); adev = amdgpu_ttm_adev(bo->bdev); - if (!old_mem || (old_mem->mem_type == TTM_PL_SYSTEM && - bo->ttm == NULL)) { + if (!old_mem || (old_mem->mem_type == TTM_PL_SYSTEM && bo->ttm == NULL)) { ttm_bo_move_null(bo, new_mem); goto out; } - if (old_mem->mem_type == TTM_PL_SYSTEM && - (new_mem->mem_type == TTM_PL_TT || - new_mem->mem_type == AMDGPU_PL_PREEMPT)) { + old_mem_type = old_mem->mem_type; + if (old_mem_type == TTM_PL_SYSTEM && + (new_mem_type == TTM_PL_TT || new_mem_type == AMDGPU_PL_PREEMPT)) { ttm_bo_move_null(bo, new_mem); goto out; } - if ((old_mem->mem_type == TTM_PL_TT || - old_mem->mem_type == AMDGPU_PL_PREEMPT) && - new_mem->mem_type == TTM_PL_SYSTEM) { + if ((old_mem_type == TTM_PL_TT || old_mem_type == AMDGPU_PL_PREEMPT) && + new_mem_type == TTM_PL_SYSTEM) { r = ttm_bo_wait_ctx(bo, ctx); if (r) return r; @@ -488,22 +487,22 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict, goto out; } - if (old_mem->mem_type == AMDGPU_PL_GDS || - old_mem->mem_type == AMDGPU_PL_GWS || - old_mem->mem_type == AMDGPU_PL_OA || - old_mem->mem_type == AMDGPU_PL_DOORBELL || - new_mem->mem_type == AMDGPU_PL_GDS || - new_mem->mem_type == AMDGPU_PL_GWS || - new_mem->mem_type == AMDGPU_PL_OA || - new_mem->mem_type == AMDGPU_PL_DOORBELL) { + if (old_mem_type == AMDGPU_PL_GDS || + old_mem_type == AMDGPU_PL_GWS || + old_mem_type == AMDGPU_PL_OA || + old_mem_type == AMDGPU_PL_DOORBELL || + new_mem_type == AMDGPU_PL_GDS || + new_mem_type == AMDGPU_PL_GWS || + new_mem_type == AMDGPU_PL_OA || + new_mem_type == AMDGPU_PL_DOORBELL) { /* Nothing to save here */ ttm_bo_move_null(bo, new_mem); goto out; } if (bo->type == ttm_bo_type_device && - new_mem->mem_type == TTM_PL_VRAM && - old_mem->mem_type != TTM_PL_VRAM) { + new_mem_type == TTM_PL_VRAM && + old_mem_type != TTM_PL_VRAM) { /* amdgpu_bo_fault_reserve_notify will re-set this if the CPU * accesses the BO after it's moved. */ @@ -511,10 +510,8 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict, } if (adev->mman.buffer_funcs_enabled) { - if (((old_mem->mem_type == TTM_PL_SYSTEM && - new_mem->mem_type == TTM_PL_VRAM) || - (old_mem->mem_type == TTM_PL_VRAM && - new_mem->mem_type == TTM_PL_SYSTEM))) { + if (((old_mem_type == TTM_PL_SYSTEM && new_mem_type == TTM_PL_VRAM) || + (old_mem_type == TTM_PL_VRAM && new_mem_type == TTM_PL_SYSTEM))) { hop->fpfn = 0; hop->lpfn = 0; hop->mem_type = TTM_PL_TT; @@ -540,7 +537,7 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict, return r; } - trace_amdgpu_bo_move(abo, new_mem->mem_type, old_mem->mem_type); + trace_amdgpu_bo_move(abo, new_mem_type, old_mem_type); out: /* update statistics */ atomic64_add(bo->base.size, &adev->num_bytes_moved); -- 2.44.0