On 8/27/2024 7:42 PM, Christian König wrote: > This was only used as workaround for recovering the page tables after > VRAM was lost and is no longer necessary after the function > amdgpu_vm_bo_reset_state_machine() started to do the same. > > Compute never used shadows either, so the only proplematic case left is > SVM and that is most likely not recoverable in any way when VRAM is > lost. > > Signed-off-by: Christian König <christian.koenig@xxxxxxx> This patch works fine on GC 9.4.3 SOCs. Acked-by: Lijo Lazar <lijo.lazar@xxxxxxx> Alex or someone else may take a closer look. Thanks, Lijo > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 4 - > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 87 +-------------------- > drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 67 +--------------- > drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 21 ----- > drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 17 ---- > drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 56 +------------ > drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c | 19 +---- > 7 files changed, 6 insertions(+), 265 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index e8c284aea1f2..e2cf77a93a0f 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -1082,10 +1082,6 @@ struct amdgpu_device { > > struct amdgpu_virt virt; > > - /* link all shadow bo */ > - struct list_head shadow_list; > - struct mutex shadow_list_lock; > - > /* record hw reset is performed */ > bool has_hw_reset; > u8 reset_magic[AMDGPU_RESET_MAGIC_NUM]; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index da06705f0026..33a939571f89 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -4107,9 +4107,6 @@ int amdgpu_device_init(struct amdgpu_device *adev, > spin_lock_init(&adev->mm_stats.lock); > spin_lock_init(&adev->wb.lock); > > - INIT_LIST_HEAD(&adev->shadow_list); > - mutex_init(&adev->shadow_list_lock); > - > INIT_LIST_HEAD(&adev->reset_list); > > INIT_LIST_HEAD(&adev->ras_list); > @@ -5029,80 +5026,6 @@ static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) > return 0; > } > > -/** > - * amdgpu_device_recover_vram - Recover some VRAM contents > - * > - * @adev: amdgpu_device pointer > - * > - * Restores the contents of VRAM buffers from the shadows in GTT. Used to > - * restore things like GPUVM page tables after a GPU reset where > - * the contents of VRAM might be lost. > - * > - * Returns: > - * 0 on success, negative error code on failure. > - */ > -static int amdgpu_device_recover_vram(struct amdgpu_device *adev) > -{ > - struct dma_fence *fence = NULL, *next = NULL; > - struct amdgpu_bo *shadow; > - struct amdgpu_bo_vm *vmbo; > - long r = 1, tmo; > - > - if (amdgpu_sriov_runtime(adev)) > - tmo = msecs_to_jiffies(8000); > - else > - tmo = msecs_to_jiffies(100); > - > - dev_info(adev->dev, "recover vram bo from shadow start\n"); > - mutex_lock(&adev->shadow_list_lock); > - list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { > - /* If vm is compute context or adev is APU, shadow will be NULL */ > - if (!vmbo->shadow) > - continue; > - shadow = vmbo->shadow; > - > - /* No need to recover an evicted BO */ > - if (!shadow->tbo.resource || > - shadow->tbo.resource->mem_type != TTM_PL_TT || > - shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || > - shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) > - continue; > - > - r = amdgpu_bo_restore_shadow(shadow, &next); > - if (r) > - break; > - > - if (fence) { > - tmo = dma_fence_wait_timeout(fence, false, tmo); > - dma_fence_put(fence); > - fence = next; > - if (tmo == 0) { > - r = -ETIMEDOUT; > - break; > - } else if (tmo < 0) { > - r = tmo; > - break; > - } > - } else { > - fence = next; > - } > - } > - mutex_unlock(&adev->shadow_list_lock); > - > - if (fence) > - tmo = dma_fence_wait_timeout(fence, false, tmo); > - dma_fence_put(fence); > - > - if (r < 0 || tmo <= 0) { > - dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); > - return -EIO; > - } > - > - dev_info(adev->dev, "recover vram bo from shadow done\n"); > - return 0; > -} > - > - > /** > * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf > * > @@ -5165,12 +5088,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, > if (r) > return r; > > - if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { > + if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) > amdgpu_inc_vram_lost(adev); > - r = amdgpu_device_recover_vram(adev); > - } > - if (r) > - return r; > > /* need to be called during full access so we can't do it later like > * bare-metal does. > @@ -5569,9 +5488,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, > } > } > > - if (!r) > - r = amdgpu_device_recover_vram(tmp_adev); > - else > + if (r) > tmp_adev->asic_reset_res = r; > } > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c > index ff1a69243699..cc9506fb0cc7 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c > @@ -76,24 +76,6 @@ static void amdgpu_bo_user_destroy(struct ttm_buffer_object *tbo) > amdgpu_bo_destroy(tbo); > } > > -static void amdgpu_bo_vm_destroy(struct ttm_buffer_object *tbo) > -{ > - struct amdgpu_device *adev = amdgpu_ttm_adev(tbo->bdev); > - struct amdgpu_bo *shadow_bo = ttm_to_amdgpu_bo(tbo), *bo; > - struct amdgpu_bo_vm *vmbo; > - > - bo = shadow_bo->parent; > - vmbo = to_amdgpu_bo_vm(bo); > - /* in case amdgpu_device_recover_vram got NULL of bo->parent */ > - if (!list_empty(&vmbo->shadow_list)) { > - mutex_lock(&adev->shadow_list_lock); > - list_del_init(&vmbo->shadow_list); > - mutex_unlock(&adev->shadow_list_lock); > - } > - > - amdgpu_bo_destroy(tbo); > -} > - > /** > * amdgpu_bo_is_amdgpu_bo - check if the buffer object is an &amdgpu_bo > * @bo: buffer object to be checked > @@ -107,8 +89,7 @@ static void amdgpu_bo_vm_destroy(struct ttm_buffer_object *tbo) > bool amdgpu_bo_is_amdgpu_bo(struct ttm_buffer_object *bo) > { > if (bo->destroy == &amdgpu_bo_destroy || > - bo->destroy == &amdgpu_bo_user_destroy || > - bo->destroy == &amdgpu_bo_vm_destroy) > + bo->destroy == &amdgpu_bo_user_destroy) > return true; > > return false; > @@ -718,52 +699,6 @@ int amdgpu_bo_create_vm(struct amdgpu_device *adev, > return r; > } > > -/** > - * amdgpu_bo_add_to_shadow_list - add a BO to the shadow list > - * > - * @vmbo: BO that will be inserted into the shadow list > - * > - * Insert a BO to the shadow list. > - */ > -void amdgpu_bo_add_to_shadow_list(struct amdgpu_bo_vm *vmbo) > -{ > - struct amdgpu_device *adev = amdgpu_ttm_adev(vmbo->bo.tbo.bdev); > - > - mutex_lock(&adev->shadow_list_lock); > - list_add_tail(&vmbo->shadow_list, &adev->shadow_list); > - vmbo->shadow->parent = amdgpu_bo_ref(&vmbo->bo); > - vmbo->shadow->tbo.destroy = &amdgpu_bo_vm_destroy; > - mutex_unlock(&adev->shadow_list_lock); > -} > - > -/** > - * amdgpu_bo_restore_shadow - restore an &amdgpu_bo shadow > - * > - * @shadow: &amdgpu_bo shadow to be restored > - * @fence: dma_fence associated with the operation > - * > - * Copies a buffer object's shadow content back to the object. > - * This is used for recovering a buffer from its shadow in case of a gpu > - * reset where vram context may be lost. > - * > - * Returns: > - * 0 for success or a negative error code on failure. > - */ > -int amdgpu_bo_restore_shadow(struct amdgpu_bo *shadow, struct dma_fence **fence) > - > -{ > - struct amdgpu_device *adev = amdgpu_ttm_adev(shadow->tbo.bdev); > - struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring; > - uint64_t shadow_addr, parent_addr; > - > - shadow_addr = amdgpu_bo_gpu_offset(shadow); > - parent_addr = amdgpu_bo_gpu_offset(shadow->parent); > - > - return amdgpu_copy_buffer(ring, shadow_addr, parent_addr, > - amdgpu_bo_size(shadow), NULL, fence, > - true, false, 0); > -} > - > /** > * amdgpu_bo_kmap - map an &amdgpu_bo buffer object > * @bo: &amdgpu_bo buffer object to be mapped > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h > index 44ffd9a03dce..717e47b46167 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h > @@ -136,8 +136,6 @@ struct amdgpu_bo_user { > > struct amdgpu_bo_vm { > struct amdgpu_bo bo; > - struct amdgpu_bo *shadow; > - struct list_head shadow_list; > struct amdgpu_vm_bo_base entries[]; > }; > > @@ -275,22 +273,6 @@ static inline bool amdgpu_bo_encrypted(struct amdgpu_bo *bo) > return bo->flags & AMDGPU_GEM_CREATE_ENCRYPTED; > } > > -/** > - * amdgpu_bo_shadowed - check if the BO is shadowed > - * > - * @bo: BO to be tested. > - * > - * Returns: > - * NULL if not shadowed or else return a BO pointer. > - */ > -static inline struct amdgpu_bo *amdgpu_bo_shadowed(struct amdgpu_bo *bo) > -{ > - if (bo->tbo.type == ttm_bo_type_kernel) > - return to_amdgpu_bo_vm(bo)->shadow; > - > - return NULL; > -} > - > bool amdgpu_bo_is_amdgpu_bo(struct ttm_buffer_object *bo); > void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain); > > @@ -347,9 +329,6 @@ u64 amdgpu_bo_gpu_offset(struct amdgpu_bo *bo); > u64 amdgpu_bo_gpu_offset_no_check(struct amdgpu_bo *bo); > void amdgpu_bo_get_memory(struct amdgpu_bo *bo, > struct amdgpu_mem_stats *stats); > -void amdgpu_bo_add_to_shadow_list(struct amdgpu_bo_vm *vmbo); > -int amdgpu_bo_restore_shadow(struct amdgpu_bo *shadow, > - struct dma_fence **fence); > uint32_t amdgpu_bo_get_preferred_domain(struct amdgpu_device *adev, > uint32_t domain); > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > index ad2e469548c9..3464a7a880f0 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > @@ -465,7 +465,6 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm, > { > uint64_t new_vm_generation = amdgpu_vm_generation(adev, vm); > struct amdgpu_vm_bo_base *bo_base; > - struct amdgpu_bo *shadow; > struct amdgpu_bo *bo; > int r; > > @@ -486,16 +485,10 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm, > spin_unlock(&vm->status_lock); > > bo = bo_base->bo; > - shadow = amdgpu_bo_shadowed(bo); > > r = validate(param, bo); > if (r) > return r; > - if (shadow) { > - r = validate(param, shadow); > - if (r) > - return r; > - } > > if (bo->tbo.type != ttm_bo_type_kernel) { > amdgpu_vm_bo_moved(bo_base); > @@ -2129,10 +2122,6 @@ void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev, > { > struct amdgpu_vm_bo_base *bo_base; > > - /* shadow bo doesn't have bo base, its validation needs its parent */ > - if (bo->parent && (amdgpu_bo_shadowed(bo->parent) == bo)) > - bo = bo->parent; > - > for (bo_base = bo->vm_bo; bo_base; bo_base = bo_base->next) { > struct amdgpu_vm *vm = bo_base->vm; > > @@ -2466,7 +2455,6 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, > root_bo = amdgpu_bo_ref(&root->bo); > r = amdgpu_bo_reserve(root_bo, true); > if (r) { > - amdgpu_bo_unref(&root->shadow); > amdgpu_bo_unref(&root_bo); > goto error_free_delayed; > } > @@ -2558,11 +2546,6 @@ int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm) > vm->last_update = dma_fence_get_stub(); > vm->is_compute_context = true; > > - /* Free the shadow bo for compute VM */ > - amdgpu_bo_unref(&to_amdgpu_bo_vm(vm->root.bo)->shadow); > - > - goto unreserve_bo; > - > unreserve_bo: > amdgpu_bo_unreserve(vm->root.bo); > return r; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c > index e39d6e7643bf..c8e0b8cfd336 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c > @@ -383,14 +383,6 @@ int amdgpu_vm_pt_clear(struct amdgpu_device *adev, struct amdgpu_vm *vm, > if (r) > return r; > > - if (vmbo->shadow) { > - struct amdgpu_bo *shadow = vmbo->shadow; > - > - r = ttm_bo_validate(&shadow->tbo, &shadow->placement, &ctx); > - if (r) > - return r; > - } > - > if (!drm_dev_enter(adev_to_drm(adev), &idx)) > return -ENODEV; > > @@ -448,10 +440,7 @@ int amdgpu_vm_pt_create(struct amdgpu_device *adev, struct amdgpu_vm *vm, > int32_t xcp_id) > { > struct amdgpu_bo_param bp; > - struct amdgpu_bo *bo; > - struct dma_resv *resv; > unsigned int num_entries; > - int r; > > memset(&bp, 0, sizeof(bp)); > > @@ -484,42 +473,7 @@ int amdgpu_vm_pt_create(struct amdgpu_device *adev, struct amdgpu_vm *vm, > if (vm->root.bo) > bp.resv = vm->root.bo->tbo.base.resv; > > - r = amdgpu_bo_create_vm(adev, &bp, vmbo); > - if (r) > - return r; > - > - bo = &(*vmbo)->bo; > - if (vm->is_compute_context || (adev->flags & AMD_IS_APU)) { > - (*vmbo)->shadow = NULL; > - return 0; > - } > - > - if (!bp.resv) > - WARN_ON(dma_resv_lock(bo->tbo.base.resv, > - NULL)); > - resv = bp.resv; > - memset(&bp, 0, sizeof(bp)); > - bp.size = amdgpu_vm_pt_size(adev, level); > - bp.domain = AMDGPU_GEM_DOMAIN_GTT; > - bp.flags = AMDGPU_GEM_CREATE_CPU_GTT_USWC; > - bp.type = ttm_bo_type_kernel; > - bp.resv = bo->tbo.base.resv; > - bp.bo_ptr_size = sizeof(struct amdgpu_bo); > - bp.xcp_id_plus1 = xcp_id + 1; > - > - r = amdgpu_bo_create(adev, &bp, &(*vmbo)->shadow); > - > - if (!resv) > - dma_resv_unlock(bo->tbo.base.resv); > - > - if (r) { > - amdgpu_bo_unref(&bo); > - return r; > - } > - > - amdgpu_bo_add_to_shadow_list(*vmbo); > - > - return 0; > + return amdgpu_bo_create_vm(adev, &bp, vmbo); > } > > /** > @@ -569,7 +523,6 @@ static int amdgpu_vm_pt_alloc(struct amdgpu_device *adev, > return 0; > > error_free_pt: > - amdgpu_bo_unref(&pt->shadow); > amdgpu_bo_unref(&pt_bo); > return r; > } > @@ -581,17 +534,10 @@ static int amdgpu_vm_pt_alloc(struct amdgpu_device *adev, > */ > static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry) > { > - struct amdgpu_bo *shadow; > - > if (!entry->bo) > return; > > entry->bo->vm_bo = NULL; > - shadow = amdgpu_bo_shadowed(entry->bo); > - if (shadow) { > - ttm_bo_set_bulk_move(&shadow->tbo, NULL); > - amdgpu_bo_unref(&shadow); > - } > ttm_bo_set_bulk_move(&entry->bo->tbo, NULL); > > spin_lock(&entry->vm->status_lock); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c > index 9b748d7058b5..390432a22ddd 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c > @@ -35,16 +35,7 @@ > */ > static int amdgpu_vm_sdma_map_table(struct amdgpu_bo_vm *table) > { > - int r; > - > - r = amdgpu_ttm_alloc_gart(&table->bo.tbo); > - if (r) > - return r; > - > - if (table->shadow) > - r = amdgpu_ttm_alloc_gart(&table->shadow->tbo); > - > - return r; > + return amdgpu_ttm_alloc_gart(&table->bo.tbo); > } > > /* Allocate a new job for @count PTE updates */ > @@ -273,17 +264,13 @@ static int amdgpu_vm_sdma_update(struct amdgpu_vm_update_params *p, > > if (!p->pages_addr) { > /* set page commands needed */ > - if (vmbo->shadow) > - amdgpu_vm_sdma_set_ptes(p, vmbo->shadow, pe, addr, > - count, incr, flags); > amdgpu_vm_sdma_set_ptes(p, bo, pe, addr, count, > incr, flags); > return 0; > } > > /* copy commands needed */ > - ndw -= p->adev->vm_manager.vm_pte_funcs->copy_pte_num_dw * > - (vmbo->shadow ? 2 : 1); > + ndw -= p->adev->vm_manager.vm_pte_funcs->copy_pte_num_dw; > > /* for padding */ > ndw -= 7; > @@ -298,8 +285,6 @@ static int amdgpu_vm_sdma_update(struct amdgpu_vm_update_params *p, > pte[i] |= flags; > } > > - if (vmbo->shadow) > - amdgpu_vm_sdma_copy_ptes(p, vmbo->shadow, pe, nptes); > amdgpu_vm_sdma_copy_ptes(p, bo, pe, nptes); > > pe += nptes * 8;