On Mon, 2021-07-19 at 13:51 +0200, Christian König wrote:
> This way we finally fix the problem that new resource are
> not immediately evict-able after allocation.
>
> That has caused numerous problems including OOM on GDS handling
> and not being able to use TTM as general resource manager.
>
> Signed-off-by: Christian König <christian.koenig@xxxxxxx>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  |   8 +-
>  drivers/gpu/drm/i915/gem/i915_gem_ttm.c |   2 +-
>  drivers/gpu/drm/ttm/ttm_bo.c            | 101 ++-----------------
>  drivers/gpu/drm/ttm/ttm_bo_util.c       |   1 -
>  drivers/gpu/drm/ttm/ttm_device.c        |   4 +-
>  drivers/gpu/drm/ttm/ttm_resource.c      | 127 ++++++++++++++++++++++++
>  include/drm/ttm/ttm_bo_api.h            |  16 ---
>  include/drm/ttm/ttm_bo_driver.h         |  29 +----
>  include/drm/ttm/ttm_resource.h          |  35 +++++++
>  9 files changed, 177 insertions(+), 146 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 18246b5b6ee3..4b178a74b4e0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -643,12 +643,12 @@ void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
>
>          if (vm->bulk_moveable) {
>                  spin_lock(&adev->mman.bdev.lru_lock);
> -                ttm_bo_bulk_move_lru_tail(&vm->lru_bulk_move);
> +                ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
>                  spin_unlock(&adev->mman.bdev.lru_lock);
>                  return;
>          }
>
> -        memset(&vm->lru_bulk_move, 0, sizeof(vm->lru_bulk_move));
> +        ttm_lru_bulk_move_init(&vm->lru_bulk_move);
>
>          spin_lock(&adev->mman.bdev.lru_lock);
>          list_for_each_entry(bo_base, &vm->idle, vm_status) {
> @@ -658,11 +658,9 @@ void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
>                  if (!bo->parent)
>                          continue;
>
> -                ttm_bo_move_to_lru_tail(&bo->tbo, bo->tbo.resource,
> -                                        &vm->lru_bulk_move);
> +                ttm_bo_move_to_lru_tail(&bo->tbo, &vm->lru_bulk_move);
>                  if (shadow)
>                          ttm_bo_move_to_lru_tail(&shadow->tbo,
> -                                                shadow->tbo.resource,
>                                                  &vm->lru_bulk_move);
>          }
>          spin_unlock(&adev->mman.bdev.lru_lock);
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> index bf33724bed5c..b38eef37f1c8 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> @@ -472,7 +472,7 @@ static void i915_ttm_adjust_lru(struct drm_i915_gem_object *obj)
>                  bo->priority = I915_TTM_PRIO_NO_PAGES;
>          }
>
> -        ttm_bo_move_to_lru_tail(bo, bo->resource, NULL);
> +        ttm_bo_move_to_lru_tail(bo, NULL);
>          spin_unlock(&bo->bdev->lru_lock);
>  }
>
> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> index 5a2dc712c632..09a62ad06b9d 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> @@ -69,95 +69,15 @@ static void ttm_bo_mem_space_debug(struct ttm_buffer_object *bo,
>          }
>  }
>
> -static void ttm_bo_del_from_lru(struct ttm_buffer_object *bo)
> -{
> -        struct ttm_device *bdev = bo->bdev;
> -
> -        list_del_init(&bo->lru);
> -
> -        if (bdev->funcs->del_from_lru_notify)
> -                bdev->funcs->del_from_lru_notify(bo);
> -}
> -
> -static void ttm_bo_bulk_move_set_pos(struct ttm_lru_bulk_move_pos *pos,
> -                                     struct ttm_buffer_object *bo)
> -{
> -        if (!pos->first)
> -                pos->first = bo;
> -        pos->last = bo;
> -}
> -
>  void ttm_bo_move_to_lru_tail(struct ttm_buffer_object *bo,
> -                             struct ttm_resource *mem,
>                               struct ttm_lru_bulk_move *bulk)
>  {
> -        struct ttm_device *bdev = bo->bdev;
> -        struct ttm_resource_manager *man;
> -
> -        if (!bo->deleted)
> -                dma_resv_assert_held(bo->base.resv);
> -
> -        if (bo->pin_count) {
> -                ttm_bo_del_from_lru(bo);
> -                return;
> -        }
> -
> -        man = ttm_manager_type(bdev, mem->mem_type);
> -        list_move_tail(&bo->lru, &man->lru[bo->priority]);
> -
> -        if (bdev->funcs->del_from_lru_notify)
> -                bdev->funcs->del_from_lru_notify(bo);
> -
> -        if (bulk && !bo->pin_count) {
> -                switch (bo->resource->mem_type) {
> -                case TTM_PL_TT:
> -                        ttm_bo_bulk_move_set_pos(&bulk->tt[bo->priority], bo);
> -                        break;
> +        dma_resv_assert_held(bo->base.resv);
>
> -                case TTM_PL_VRAM:
> -                        ttm_bo_bulk_move_set_pos(&bulk->vram[bo->priority], bo);
> -                        break;
> -                }
> -        }
> +        ttm_resource_move_to_lru_tail(bo->resource, bulk);
>  }
>  EXPORT_SYMBOL(ttm_bo_move_to_lru_tail);
>
> -void ttm_bo_bulk_move_lru_tail(struct ttm_lru_bulk_move *bulk)
> -{
> -        unsigned i;
> -
> -        for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
> -                struct ttm_lru_bulk_move_pos *pos = &bulk->tt[i];
> -                struct ttm_resource_manager *man;
> -
> -                if (!pos->first)
> -                        continue;
> -
> -                dma_resv_assert_held(pos->first->base.resv);
> -                dma_resv_assert_held(pos->last->base.resv);
> -
> -                man = ttm_manager_type(pos->first->bdev, TTM_PL_TT);
> -                list_bulk_move_tail(&man->lru[i], &pos->first->lru,
> -                                    &pos->last->lru);
> -        }
> -
> -        for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
> -                struct ttm_lru_bulk_move_pos *pos = &bulk->vram[i];
> -                struct ttm_resource_manager *man;
> -
> -                if (!pos->first)
> -                        continue;
> -
> -                dma_resv_assert_held(pos->first->base.resv);
> -                dma_resv_assert_held(pos->last->base.resv);
> -
> -                man = ttm_manager_type(pos->first->bdev, TTM_PL_VRAM);
> -                list_bulk_move_tail(&man->lru[i], &pos->first->lru,
> -                                    &pos->last->lru);
> -        }
> -}
> -EXPORT_SYMBOL(ttm_bo_bulk_move_lru_tail);
> -
>  static int ttm_bo_handle_move_mem(struct ttm_buffer_object *bo,
>                                    struct ttm_resource *mem, bool evict,
>                                    struct ttm_operation_ctx *ctx,
> @@ -339,7 +259,6 @@ static int ttm_bo_cleanup_refs(struct ttm_buffer_object *bo,
>                  return ret;
>          }
>
> -        ttm_bo_del_from_lru(bo);
>          list_del_init(&bo->ddestroy);
>          spin_unlock(&bo->bdev->lru_lock);
>          ttm_bo_cleanup_memtype_use(bo);
> @@ -440,7 +359,7 @@ static void ttm_bo_release(struct kref *kref)
>                   */
>                  if (bo->pin_count) {
>                          bo->pin_count = 0;
> -                        ttm_bo_move_to_lru_tail(bo, bo->resource, NULL);
> +                        ttm_resource_move_to_lru_tail(bo->resource, NULL);
>                  }
>
>                  kref_init(&bo->kref);
> @@ -453,7 +372,6 @@ static void ttm_bo_release(struct kref *kref)
>          }
>
>          spin_lock(&bo->bdev->lru_lock);
> -        ttm_bo_del_from_lru(bo);
>          list_del(&bo->ddestroy);
>          spin_unlock(&bo->bdev->lru_lock);
>
> @@ -667,15 +585,17 @@ int ttm_mem_evict_first(struct ttm_device *bdev,
>                          struct ww_acquire_ctx *ticket)
>  {
>          struct ttm_buffer_object *bo = NULL, *busy_bo = NULL;
> +        struct ttm_resource *res;
>          bool locked = false;
>          unsigned i;
>          int ret;
>
>          spin_lock(&bdev->lru_lock);
>          for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
> -                list_for_each_entry(bo, &man->lru[i], lru) {
> +                list_for_each_entry(res, &man->lru[i], lru) {
>                          bool busy;
>
> +                        bo = res->bo;

Follow-up to the previous review: what happens here if someone now reassigns @res->bo and then kills @bo? At least it's not immediately clear what's protecting against that. Isn't a kref_get_unless_zero() on the bo needed here, with res->bo being assigned (and properly cleared on bo destruction) under the lru_lock when needed?

Admittedly, as you pointed out earlier, we can't kref_put() the bo under the lru lock, but (if all else fails) one could perhaps defer the put to a worker, or move the bo to the lru tail and drop the lru lock iff kref_put() may hit a zero refcount.

/Thomas
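P.S. To make the above a bit more concrete, below is roughly the kind of guard I have in mind for that loop. Completely untested sketch; ttm_res_get_bo_unless_zero() is a made-up name, and it assumes res->bo is only ever assigned and cleared while holding the lru_lock:

/*
 * Hypothetical helper, not part of the patch: return a referenced bo for
 * @res, or NULL if the bo is already on its way out.  Caller holds the
 * lru_lock; res->bo must only be assigned/cleared under that same lock
 * for this check to mean anything.
 */
static struct ttm_buffer_object *
ttm_res_get_bo_unless_zero(struct ttm_resource *res)
{
        struct ttm_buffer_object *bo = res->bo;

        if (!bo || !kref_get_unless_zero(&bo->kref))
                return NULL;

        return bo;
}

and then in the loop from your hunk above:

                list_for_each_entry(res, &man->lru[i], lru) {
                        bool busy;

                        bo = ttm_res_get_bo_unless_zero(res);
                        if (!bo)
                                continue;

                        /*
                         * ... existing eviction checks on @bo; the matching
                         * ttm_bo_put() then has to happen after the lru_lock
                         * is dropped, or be deferred to a worker, since the
                         * final free can't run under the spinlock.
                         */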