Am 20.03.24 um 18:12 schrieb Tvrtko Ursulin:
From: Tvrtko Ursulin <tursulin@xxxxxxxxxxx>
Pipelined object migration will free up the old bo->resource, meaning
the tracepoint added in 94aeb4117343 ("drm/amdgpu: fix ftrace event
amdgpu_bo_move always move on same heap") will trigger an use after free
when it dereferences the cached old_mem.
Fix it by caching the memory type locally, which is the only thing
tracepoint wants to know about.
While at it convert the whole function to use the cached memory types for
consistency.
v2:
* Fix compilation.
I do remember that we have fixed this before. Question is really why it
shows up again.
Going to investigate.
Thanks,
Christian.
Signed-off-by: Tvrtko Ursulin <tursulin@xxxxxxxxxxx>
Fixes: 94aeb4117343 ("drm/amdgpu: fix ftrace event amdgpu_bo_move always move on same heap")
Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3171
Cc: Beyond Wang <Wang.Beyond@xxxxxxx>
Cc: Christian König <christian.koenig@xxxxxxx>
Cc: Alex Deucher <alexander.deucher@xxxxxxx>
---
Beware this is a speculative fix for now based only on source code
analysis and backtraces from 3171. It is also a bit on the churny side so
I am happy to minimize it. But most importantly, given how I don't have
any experience in amdgpu, I am looking for domain experts to either
confirm or disprove my analysis.
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 47 ++++++++++++-------------
1 file changed, 22 insertions(+), 25 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 8722beba494e..81189aab5a04 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -452,10 +452,11 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict,
struct amdgpu_device *adev;
struct amdgpu_bo *abo;
struct ttm_resource *old_mem = bo->resource;
+ uint32_t new_mem_type = new_mem->mem_type;
+ uint32_t old_mem_type;
int r;
- if (new_mem->mem_type == TTM_PL_TT ||
- new_mem->mem_type == AMDGPU_PL_PREEMPT) {
+ if (new_mem_type == TTM_PL_TT || new_mem_type == AMDGPU_PL_PREEMPT) {
r = amdgpu_ttm_backend_bind(bo->bdev, bo->ttm, new_mem);
if (r)
return r;
@@ -464,20 +465,18 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict,
abo = ttm_to_amdgpu_bo(bo);
adev = amdgpu_ttm_adev(bo->bdev);
- if (!old_mem || (old_mem->mem_type == TTM_PL_SYSTEM &&
- bo->ttm == NULL)) {
+ if (!old_mem || (old_mem->mem_type == TTM_PL_SYSTEM && bo->ttm == NULL)) {
ttm_bo_move_null(bo, new_mem);
goto out;
}
- if (old_mem->mem_type == TTM_PL_SYSTEM &&
- (new_mem->mem_type == TTM_PL_TT ||
- new_mem->mem_type == AMDGPU_PL_PREEMPT)) {
+ old_mem_type = old_mem->mem_type;
+ if (old_mem_type == TTM_PL_SYSTEM &&
+ (new_mem_type == TTM_PL_TT || new_mem_type == AMDGPU_PL_PREEMPT)) {
ttm_bo_move_null(bo, new_mem);
goto out;
}
- if ((old_mem->mem_type == TTM_PL_TT ||
- old_mem->mem_type == AMDGPU_PL_PREEMPT) &&
- new_mem->mem_type == TTM_PL_SYSTEM) {
+ if ((old_mem_type == TTM_PL_TT || old_mem_type == AMDGPU_PL_PREEMPT) &&
+ new_mem_type == TTM_PL_SYSTEM) {
r = ttm_bo_wait_ctx(bo, ctx);
if (r)
return r;
@@ -488,22 +487,22 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict,
goto out;
}
- if (old_mem->mem_type == AMDGPU_PL_GDS ||
- old_mem->mem_type == AMDGPU_PL_GWS ||
- old_mem->mem_type == AMDGPU_PL_OA ||
- old_mem->mem_type == AMDGPU_PL_DOORBELL ||
- new_mem->mem_type == AMDGPU_PL_GDS ||
- new_mem->mem_type == AMDGPU_PL_GWS ||
- new_mem->mem_type == AMDGPU_PL_OA ||
- new_mem->mem_type == AMDGPU_PL_DOORBELL) {
+ if (old_mem_type == AMDGPU_PL_GDS ||
+ old_mem_type == AMDGPU_PL_GWS ||
+ old_mem_type == AMDGPU_PL_OA ||
+ old_mem_type == AMDGPU_PL_DOORBELL ||
+ new_mem_type == AMDGPU_PL_GDS ||
+ new_mem_type == AMDGPU_PL_GWS ||
+ new_mem_type == AMDGPU_PL_OA ||
+ new_mem_type == AMDGPU_PL_DOORBELL) {
/* Nothing to save here */
ttm_bo_move_null(bo, new_mem);
goto out;
}
if (bo->type == ttm_bo_type_device &&
- new_mem->mem_type == TTM_PL_VRAM &&
- old_mem->mem_type != TTM_PL_VRAM) {
+ new_mem_type == TTM_PL_VRAM &&
+ old_mem_type != TTM_PL_VRAM) {
/* amdgpu_bo_fault_reserve_notify will re-set this if the CPU
* accesses the BO after it's moved.
*/
@@ -511,10 +510,8 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict,
}
if (adev->mman.buffer_funcs_enabled) {
- if (((old_mem->mem_type == TTM_PL_SYSTEM &&
- new_mem->mem_type == TTM_PL_VRAM) ||
- (old_mem->mem_type == TTM_PL_VRAM &&
- new_mem->mem_type == TTM_PL_SYSTEM))) {
+ if (((old_mem_type == TTM_PL_SYSTEM && new_mem_type == TTM_PL_VRAM) ||
+ (old_mem_type == TTM_PL_VRAM && new_mem_type == TTM_PL_SYSTEM))) {
hop->fpfn = 0;
hop->lpfn = 0;
hop->mem_type = TTM_PL_TT;
@@ -540,7 +537,7 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict,
return r;
}
- trace_amdgpu_bo_move(abo, new_mem->mem_type, old_mem->mem_type);
+ trace_amdgpu_bo_move(abo, new_mem_type, old_mem_type);
out:
/* update statistics */
atomic64_add(bo->base.size, &adev->num_bytes_moved);