On Tue, Mar 16, 2021 at 1:03 PM Christian König
<ckoenig.leichtzumerken@xxxxxxxxx> wrote:
>
> Am 16.03.21 um 10:35 schrieb Daniel Vetter:
> > On Mon, Mar 15, 2021 at 05:04:22PM +0100, Christian König wrote:
> >> Instead of having a global lock.
> >>
> >> Signed-off-by: Christian König <christian.koenig@xxxxxxx>
> > I guess per zone lru lock is a lot more work since then we need to handle
> > ordering and ABBA deadlocks? s/zone/mem region/ I think is the proper ttm
> > lingo.
>
> Making the LRU per resource manager is the long term goal, yes.
>
> My key idea so far is that we make bo->mem a pointer and then move the
> LRU handling into the resource object instead of the BO.
>
> The resource object then just references the BO, so that we can
> figure out which BO to evict or which fence to wait for to free up a
> resource.

Hm yeah, that could work out fairly nicely, both from a locking and a
refcounting point of view. And maybe we could then use entirely
free-standing mem objects instead of ghost objects? That's a part of
ttm I don't grok, and it has always looked a bit like a hack to me.
For these ghost mem objects you'd then only need the lru plus a
dma_fence_wait for eviction (grab a fence ref under the lru lock, then
drop the lru lock for the wait), and no dma_resv_lock at all. (Two
rough sketches of these ideas are appended at the end of this mail.)
-Daniel

> Regards,
> Christian.
>
> > -Daniel
> >
> >> ---
> >>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c |  8 ++---
> >>  drivers/gpu/drm/qxl/qxl_release.c      |  5 +--
> >>  drivers/gpu/drm/ttm/ttm_bo.c           | 49 ++++++++++++--------------
> >>  drivers/gpu/drm/ttm/ttm_device.c       | 12 +++----
> >>  drivers/gpu/drm/ttm/ttm_execbuf_util.c |  8 ++---
> >>  drivers/gpu/drm/ttm/ttm_resource.c     |  9 +++--
> >>  include/drm/ttm/ttm_bo_driver.h        |  4 +--
> >>  include/drm/ttm/ttm_device.h           |  4 +--
> >>  8 files changed, 43 insertions(+), 56 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> >> index 9d19078246c8..ae18c0e32347 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> >> @@ -638,15 +638,15 @@ void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
> >>          struct amdgpu_vm_bo_base *bo_base;
> >>
> >>          if (vm->bulk_moveable) {
> >> -                spin_lock(&ttm_glob.lru_lock);
> >> +                spin_lock(&adev->mman.bdev.lru_lock);
> >>                  ttm_bo_bulk_move_lru_tail(&vm->lru_bulk_move);
> >> -                spin_unlock(&ttm_glob.lru_lock);
> >> +                spin_unlock(&adev->mman.bdev.lru_lock);
> >>                  return;
> >>          }
> >>
> >>          memset(&vm->lru_bulk_move, 0, sizeof(vm->lru_bulk_move));
> >>
> >> -        spin_lock(&ttm_glob.lru_lock);
> >> +        spin_lock(&adev->mman.bdev.lru_lock);
> >>          list_for_each_entry(bo_base, &vm->idle, vm_status) {
> >>                  struct amdgpu_bo *bo = bo_base->bo;
> >>
> >> @@ -660,7 +660,7 @@ void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
> >>                                          &bo->shadow->tbo.mem,
> >>                                          &vm->lru_bulk_move);
> >>          }
> >> -        spin_unlock(&ttm_glob.lru_lock);
> >> +        spin_unlock(&adev->mman.bdev.lru_lock);
> >>
> >>          vm->bulk_moveable = true;
> >>  }
> >> diff --git a/drivers/gpu/drm/qxl/qxl_release.c b/drivers/gpu/drm/qxl/qxl_release.c
> >> index f5845c96d414..b19f2f00b215 100644
> >> --- a/drivers/gpu/drm/qxl/qxl_release.c
> >> +++ b/drivers/gpu/drm/qxl/qxl_release.c
> >> @@ -426,16 +426,13 @@ void qxl_release_fence_buffer_objects(struct qxl_release *release)
> >>                         release->id | 0xf0000000, release->base.seqno);
> >>          trace_dma_fence_emit(&release->base);
> >>
> >> -        spin_lock(&ttm_glob.lru_lock);
> >> -
> >>          list_for_each_entry(entry, &release->bos, head) {
> >>                  bo = entry->bo;
> >>
> >>                  dma_resv_add_shared_fence(bo->base.resv, &release->base);
> >> -                ttm_bo_move_to_lru_tail(bo, &bo->mem, NULL);
> >> +                ttm_bo_move_to_lru_tail_unlocked(bo);
> >>                  dma_resv_unlock(bo->base.resv);
> >>          }
> >> -        spin_unlock(&ttm_glob.lru_lock);
> >>          ww_acquire_fini(&release->ticket);
> >>  }
> >>
> >> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> >> index a1be88be357b..a8103c8718a3 100644
> >> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> >> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> >> @@ -242,9 +242,9 @@ static int ttm_bo_individualize_resv(struct ttm_buffer_object *bo)
> >>           * reference it any more. The only tricky case is the trylock on
> >>           * the resv object while holding the lru_lock.
> >>           */
> >> -        spin_lock(&ttm_glob.lru_lock);
> >> +        spin_lock(&bo->bdev->lru_lock);
> >>          bo->base.resv = &bo->base._resv;
> >> -        spin_unlock(&ttm_glob.lru_lock);
> >> +        spin_unlock(&bo->bdev->lru_lock);
> >>          }
> >>
> >>          return r;
> >> @@ -303,7 +303,7 @@ static int ttm_bo_cleanup_refs(struct ttm_buffer_object *bo,
> >>
> >>                  if (unlock_resv)
> >>                          dma_resv_unlock(bo->base.resv);
> >> -                spin_unlock(&ttm_glob.lru_lock);
> >> +                spin_unlock(&bo->bdev->lru_lock);
> >>
> >>                  lret = dma_resv_wait_timeout_rcu(resv, true, interruptible,
> >>                                                   30 * HZ);
> >> @@ -313,7 +313,7 @@ static int ttm_bo_cleanup_refs(struct ttm_buffer_object *bo,
> >>                  else if (lret == 0)
> >>                          return -EBUSY;
> >>
> >> -                spin_lock(&ttm_glob.lru_lock);
> >> +                spin_lock(&bo->bdev->lru_lock);
> >>                  if (unlock_resv && !dma_resv_trylock(bo->base.resv)) {
> >>                          /*
> >>                           * We raced, and lost, someone else holds the reservation now,
> >> @@ -323,7 +323,7 @@ static int ttm_bo_cleanup_refs(struct ttm_buffer_object *bo,
> >>                           * delayed destruction would succeed, so just return success
> >>                           * here.
> >>                           */
> >> -                        spin_unlock(&ttm_glob.lru_lock);
> >> +                        spin_unlock(&bo->bdev->lru_lock);
> >>                          return 0;
> >>                  }
> >>                  ret = 0;
> >> @@ -332,13 +332,13 @@ static int ttm_bo_cleanup_refs(struct ttm_buffer_object *bo,
> >>          if (ret || unlikely(list_empty(&bo->ddestroy))) {
> >>                  if (unlock_resv)
> >>                          dma_resv_unlock(bo->base.resv);
> >> -                spin_unlock(&ttm_glob.lru_lock);
> >> +                spin_unlock(&bo->bdev->lru_lock);
> >>                  return ret;
> >>          }
> >>
> >>          ttm_bo_del_from_lru(bo);
> >>          list_del_init(&bo->ddestroy);
> >> -        spin_unlock(&ttm_glob.lru_lock);
> >> +        spin_unlock(&bo->bdev->lru_lock);
> >>          ttm_bo_cleanup_memtype_use(bo);
> >>
> >>          if (unlock_resv)
> >> @@ -355,13 +355,12 @@ static int ttm_bo_cleanup_refs(struct ttm_buffer_object *bo,
> >>   */
> >>  bool ttm_bo_delayed_delete(struct ttm_device *bdev, bool remove_all)
> >>  {
> >> -        struct ttm_global *glob = &ttm_glob;
> >>          struct list_head removed;
> >>          bool empty;
> >>
> >>          INIT_LIST_HEAD(&removed);
> >>
> >> -        spin_lock(&glob->lru_lock);
> >> +        spin_lock(&bdev->lru_lock);
> >>          while (!list_empty(&bdev->ddestroy)) {
> >>                  struct ttm_buffer_object *bo;
> >>
> >> @@ -372,24 +371,24 @@ bool ttm_bo_delayed_delete(struct ttm_device *bdev, bool remove_all)
> >>                          continue;
> >>
> >>                  if (remove_all || bo->base.resv != &bo->base._resv) {
> >> -                        spin_unlock(&glob->lru_lock);
> >> +                        spin_unlock(&bdev->lru_lock);
> >>                          dma_resv_lock(bo->base.resv, NULL);
> >>
> >> -                        spin_lock(&glob->lru_lock);
> >> +                        spin_lock(&bdev->lru_lock);
> >>                          ttm_bo_cleanup_refs(bo, false, !remove_all, true);
> >>
> >>                  } else if (dma_resv_trylock(bo->base.resv)) {
> >>                          ttm_bo_cleanup_refs(bo, false, !remove_all, true);
> >>                  } else {
> >> -                        spin_unlock(&glob->lru_lock);
> >> +                        spin_unlock(&bdev->lru_lock);
> >>                  }
> >>
> >>                  ttm_bo_put(bo);
> >> -                spin_lock(&glob->lru_lock);
> >> +                spin_lock(&bdev->lru_lock);
> >>          }
> >>          list_splice_tail(&removed, &bdev->ddestroy);
> >>          empty = list_empty(&bdev->ddestroy);
> >> -        spin_unlock(&glob->lru_lock);
> >> +        spin_unlock(&bdev->lru_lock);
> >>
> >>          return empty;
> >>  }
> >> @@ -424,7 +423,7 @@ static void ttm_bo_release(struct kref *kref)
> >>                  ttm_bo_flush_all_fences(bo);
> >>                  bo->deleted = true;
> >>
> >> -                spin_lock(&ttm_glob.lru_lock);
> >> +                spin_lock(&bo->bdev->lru_lock);
> >>
> >>                  /*
> >>                   * Make pinned bos immediately available to
> >> @@ -438,17 +437,17 @@ static void ttm_bo_release(struct kref *kref)
> >>
> >>                  kref_init(&bo->kref);
> >>                  list_add_tail(&bo->ddestroy, &bdev->ddestroy);
> >> -                spin_unlock(&ttm_glob.lru_lock);
> >> +                spin_unlock(&bo->bdev->lru_lock);
> >>
> >>                  schedule_delayed_work(&bdev->wq,
> >>                                        ((HZ / 100) < 1) ? 1 : HZ / 100);
> >>                  return;
> >>          }
> >>
> >> -        spin_lock(&ttm_glob.lru_lock);
> >> +        spin_lock(&bo->bdev->lru_lock);
> >>          ttm_bo_del_from_lru(bo);
> >>          list_del(&bo->ddestroy);
> >> -        spin_unlock(&ttm_glob.lru_lock);
> >> +        spin_unlock(&bo->bdev->lru_lock);
> >>
> >>          ttm_bo_cleanup_memtype_use(bo);
> >>          dma_resv_unlock(bo->base.resv);
> >> @@ -622,7 +621,7 @@ int ttm_mem_evict_first(struct ttm_device *bdev,
> >>          unsigned i;
> >>          int ret;
> >>
> >> -        spin_lock(&ttm_glob.lru_lock);
> >> +        spin_lock(&bdev->lru_lock);
> >>          for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
> >>                  list_for_each_entry(bo, &man->lru[i], lru) {
> >>                          bool busy;
> >> @@ -659,7 +658,7 @@ int ttm_mem_evict_first(struct ttm_device *bdev,
> >>          if (!bo) {
> >>                  if (busy_bo && !ttm_bo_get_unless_zero(busy_bo))
> >>                          busy_bo = NULL;
> >> -                spin_unlock(&ttm_glob.lru_lock);
> >> +                spin_unlock(&bdev->lru_lock);
> >>                  ret = ttm_mem_evict_wait_busy(busy_bo, ctx, ticket);
> >>                  if (busy_bo)
> >>                          ttm_bo_put(busy_bo);
> >> @@ -673,7 +672,7 @@ int ttm_mem_evict_first(struct ttm_device *bdev,
> >>                  return ret;
> >>          }
> >>
> >> -        spin_unlock(&ttm_glob.lru_lock);
> >> +        spin_unlock(&bdev->lru_lock);
> >>
> >>          ret = ttm_bo_evict(bo, ctx);
> >>          if (locked)
> >> @@ -773,10 +772,9 @@ static int ttm_bo_mem_placement(struct ttm_buffer_object *bo,
> >>          mem->mem_type = place->mem_type;
> >>          mem->placement = place->flags;
> >>
> >> -        spin_lock(&ttm_glob.lru_lock);
> >> +        spin_lock(&bo->bdev->lru_lock);
> >>          ttm_bo_move_to_lru_tail(bo, mem, NULL);
> >> -        spin_unlock(&ttm_glob.lru_lock);
> >> -
> >> +        spin_unlock(&bo->bdev->lru_lock);
> >>          return 0;
> >>  }
> >>
> >> @@ -1160,7 +1158,6 @@ EXPORT_SYMBOL(ttm_bo_wait);
> >>  int ttm_bo_swapout(struct ttm_buffer_object *bo, struct ttm_operation_ctx *ctx,
> >>                     gfp_t gfp_flags)
> >>  {
> >> -        struct ttm_global *glob = &ttm_glob;
> >>          bool locked;
> >>          int ret;
> >>
> >> @@ -1185,7 +1182,7 @@ int ttm_bo_swapout(struct ttm_buffer_object *bo, struct ttm_operation_ctx *ctx,
> >>
> >>          ttm_bo_del_from_lru(bo);
> >>          /* TODO: Cleanup the locking */
> >> -        spin_unlock(&glob->lru_lock);
> >> +        spin_unlock(&bo->bdev->lru_lock);
> >>
> >>          /*
> >>           * Move to system cached
> >> diff --git a/drivers/gpu/drm/ttm/ttm_device.c b/drivers/gpu/drm/ttm/ttm_device.c
> >> index 2c280fb1e992..924d892109e8 100644
> >> --- a/drivers/gpu/drm/ttm/ttm_device.c
> >> +++ b/drivers/gpu/drm/ttm/ttm_device.c
> >> @@ -81,7 +81,6 @@ static int ttm_global_init(void)
> >>          ttm_pool_mgr_init(num_pages * 50 / 100);
> >>          ttm_tt_mgr_init();
> >>
> >> -        spin_lock_init(&glob->lru_lock);
> >>          glob->dummy_read_page = alloc_page(__GFP_ZERO | GFP_DMA32);
> >>
> >>          if (unlikely(glob->dummy_read_page == NULL)) {
> >> @@ -125,13 +124,12 @@ EXPORT_SYMBOL(ttm_global_swapout);
> >>  long ttm_device_swapout(struct ttm_device *bdev, struct ttm_operation_ctx *ctx,
> >>                          gfp_t gfp_flags)
> >>  {
> >> -        struct ttm_global *glob = &ttm_glob;
> >>          struct ttm_resource_manager *man;
> >>          struct ttm_buffer_object *bo;
> >>          unsigned i, j;
> >>          int ret;
> >>
> >> -        spin_lock(&glob->lru_lock);
> >> +        spin_lock(&bdev->lru_lock);
> >>          for (i = TTM_PL_SYSTEM; i < TTM_NUM_MEM_TYPES; ++i) {
> >>                  man = ttm_manager_type(bdev, i);
> >>                  if (!man || !man->use_tt)
> >> @@ -156,7 +154,7 @@ long ttm_device_swapout(struct ttm_device *bdev, struct ttm_operation_ctx *ctx,
> >>                          }
> >>                  }
> >>          }
> >> -        spin_unlock(&glob->lru_lock);
> >> +        spin_unlock(&bdev->lru_lock);
> >>          return 0;
> >>  }
> >>  EXPORT_SYMBOL(ttm_device_swapout);
> >> @@ -223,6 +221,7 @@ int ttm_device_init(struct ttm_device *bdev, struct ttm_device_funcs *funcs,
> >>
> >>          bdev->vma_manager = vma_manager;
> >>          INIT_DELAYED_WORK(&bdev->wq, ttm_device_delayed_workqueue);
> >> +        spin_lock_init(&bdev->lru_lock);
> >>          INIT_LIST_HEAD(&bdev->ddestroy);
> >>          bdev->dev_mapping = mapping;
> >>          mutex_lock(&ttm_global_mutex);
> >> @@ -235,7 +234,6 @@ EXPORT_SYMBOL(ttm_device_init);
> >>
> >>  void ttm_device_fini(struct ttm_device *bdev)
> >>  {
> >> -        struct ttm_global *glob = &ttm_glob;
> >>          struct ttm_resource_manager *man;
> >>          unsigned i;
> >>
> >> @@ -252,11 +250,11 @@ void ttm_device_fini(struct ttm_device *bdev)
> >>          if (ttm_bo_delayed_delete(bdev, true))
> >>                  pr_debug("Delayed destroy list was clean\n");
> >>
> >> -        spin_lock(&glob->lru_lock);
> >> +        spin_lock(&bdev->lru_lock);
> >>          for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i)
> >>                  if (list_empty(&man->lru[0]))
> >>                          pr_debug("Swap list %d was clean\n", i);
> >> -        spin_unlock(&glob->lru_lock);
> >> +        spin_unlock(&bdev->lru_lock);
> >>
> >>          ttm_pool_fini(&bdev->pool);
> >>          ttm_global_release();
> >> diff --git a/drivers/gpu/drm/ttm/ttm_execbuf_util.c b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
> >> index 690ab97d52b7..071c48d672c6 100644
> >> --- a/drivers/gpu/drm/ttm/ttm_execbuf_util.c
> >> +++ b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
> >> @@ -51,14 +51,12 @@ void ttm_eu_backoff_reservation(struct ww_acquire_ctx *ticket,
> >>          if (list_empty(list))
> >>                  return;
> >>
> >> -        spin_lock(&ttm_glob.lru_lock);
> >>          list_for_each_entry(entry, list, head) {
> >>                  struct ttm_buffer_object *bo = entry->bo;
> >>
> >> -                ttm_bo_move_to_lru_tail(bo, &bo->mem, NULL);
> >> +                ttm_bo_move_to_lru_tail_unlocked(bo);
> >>                  dma_resv_unlock(bo->base.resv);
> >>          }
> >> -        spin_unlock(&ttm_glob.lru_lock);
> >>
> >>          if (ticket)
> >>                  ww_acquire_fini(ticket);
> >> @@ -154,7 +152,6 @@ void ttm_eu_fence_buffer_objects(struct ww_acquire_ctx *ticket,
> >>          if (list_empty(list))
> >>                  return;
> >>
> >> -        spin_lock(&ttm_glob.lru_lock);
> >>          list_for_each_entry(entry, list, head) {
> >>                  struct ttm_buffer_object *bo = entry->bo;
> >>
> >> @@ -162,10 +159,9 @@ void ttm_eu_fence_buffer_objects(struct ww_acquire_ctx *ticket,
> >>                          dma_resv_add_shared_fence(bo->base.resv, fence);
> >>                  else
> >>                          dma_resv_add_excl_fence(bo->base.resv, fence);
> >> -                ttm_bo_move_to_lru_tail(bo, &bo->mem, NULL);
> >> +                ttm_bo_move_to_lru_tail_unlocked(bo);
> >>                  dma_resv_unlock(bo->base.resv);
> >>          }
> >> -        spin_unlock(&ttm_glob.lru_lock);
> >>          if (ticket)
> >>                  ww_acquire_fini(ticket);
> >>  }
> >> diff --git a/drivers/gpu/drm/ttm/ttm_resource.c b/drivers/gpu/drm/ttm/ttm_resource.c
> >> index ed1672a9f332..04f2eef653ab 100644
> >> --- a/drivers/gpu/drm/ttm/ttm_resource.c
> >> +++ b/drivers/gpu/drm/ttm/ttm_resource.c
> >> @@ -91,7 +91,6 @@ int ttm_resource_manager_evict_all(struct ttm_device *bdev,
> >>                  .no_wait_gpu = false,
> >>                  .force_alloc = true
> >>          };
> >> -        struct ttm_global *glob = &ttm_glob;
> >>          struct dma_fence *fence;
> >>          int ret;
> >>          unsigned i;
> >> @@ -100,18 +99,18 @@ int ttm_resource_manager_evict_all(struct ttm_device *bdev,
> >>           * Can't use standard list traversal since we're unlocking.
> >>           */
> >>
> >> -        spin_lock(&glob->lru_lock);
> >> +        spin_lock(&bdev->lru_lock);
> >>          for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
> >>                  while (!list_empty(&man->lru[i])) {
> >> -                        spin_unlock(&glob->lru_lock);
> >> +                        spin_unlock(&bdev->lru_lock);
> >>                          ret = ttm_mem_evict_first(bdev, man, NULL, &ctx,
> >>                                                    NULL);
> >>                          if (ret)
> >>                                  return ret;
> >> -                        spin_lock(&glob->lru_lock);
> >> +                        spin_lock(&bdev->lru_lock);
> >>                  }
> >>          }
> >> -        spin_unlock(&glob->lru_lock);
> >> +        spin_unlock(&bdev->lru_lock);
> >>
> >>          spin_lock(&man->move_lock);
> >>          fence = dma_fence_get(man->move);
> >> diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
> >> index d007feef7676..dbccac957f8f 100644
> >> --- a/include/drm/ttm/ttm_bo_driver.h
> >> +++ b/include/drm/ttm/ttm_bo_driver.h
> >> @@ -180,9 +180,9 @@ static inline int ttm_bo_reserve_slowpath(struct ttm_buffer_object *bo,
> >>  static inline void
> >>  ttm_bo_move_to_lru_tail_unlocked(struct ttm_buffer_object *bo)
> >>  {
> >> -        spin_lock(&ttm_glob.lru_lock);
> >> +        spin_lock(&bo->bdev->lru_lock);
> >>          ttm_bo_move_to_lru_tail(bo, &bo->mem, NULL);
> >> -        spin_unlock(&ttm_glob.lru_lock);
> >> +        spin_unlock(&bo->bdev->lru_lock);
> >>  }
> >>
> >>  static inline void ttm_bo_assign_mem(struct ttm_buffer_object *bo,
> >> diff --git a/include/drm/ttm/ttm_device.h b/include/drm/ttm/ttm_device.h
> >> index cda6efb4c34b..bae56d29e8ff 100644
> >> --- a/include/drm/ttm/ttm_device.h
> >> +++ b/include/drm/ttm/ttm_device.h
> >> @@ -56,7 +56,6 @@ extern struct ttm_global {
> >>           */
> >>
> >>          struct page *dummy_read_page;
> >> -        spinlock_t lru_lock;
> >>
> >>          /**
> >>           * Protected by ttm_global_mutex.
> >> @@ -277,8 +276,9 @@ struct ttm_device {
> >>          struct ttm_pool pool;
> >>
> >>          /*
> >> -         * Protected by the global:lru lock.
> >> +         * Protection for the per manager LRU and ddestroy lists.
> >>           */
> >> +        spinlock_t lru_lock;
> >>          struct list_head ddestroy;
> >>
> >>          /*
> >> --
> >> 2.25.1
> >>
> >> _______________________________________________
> >> dri-devel mailing list
> >> dri-devel@xxxxxxxxxxxxxxxxxxxxx
> >> https://lists.freedesktop.org/mailman/listinfo/dri-devel
>

--
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
dri-devel mailing list
dri-devel@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/dri-devel
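
Appendix: two rough sketches of the ideas discussed above. Neither is
part of the patch, and every name introduced below is hypothetical.

First, the ordering problem Daniel alludes to for per mem region
(per manager) LRU locks. This patch deliberately keeps one lru_lock per
device; the struct and function below are made up purely to show why a
per-manager lock is more work: moving an entry between two managers
needs both locks, so they must always be taken in a fixed order (here by
address) to avoid ABBA deadlocks.

#include <linux/list.h>
#include <linux/minmax.h>       /* swap() */
#include <linux/spinlock.h>

/* Hypothetical per-manager LRU state; not in the patch above. */
struct mgr_lru_sketch {
        spinlock_t lru_lock;
        struct list_head lru;
};

/* Move @entry between managers without ABBA: lock in address order. */
static void lru_move_between_managers(struct mgr_lru_sketch *from,
                                      struct mgr_lru_sketch *to,
                                      struct list_head *entry)
{
        spinlock_t *first = &from->lru_lock;
        spinlock_t *second = &to->lru_lock;

        if (from == to) {
                /* Same manager, only one lock needed. */
                spin_lock(first);
                list_move_tail(entry, &to->lru);
                spin_unlock(first);
                return;
        }

        if (first > second)
                swap(first, second);

        spin_lock(first);
        /* Tell lockdep the second lock of the same class is intended. */
        spin_lock_nested(second, SINGLE_DEPTH_NESTING);
        list_move_tail(entry, &to->lru);
        spin_unlock(second);
        spin_unlock(first);
}

Every caller pair then agrees on the same global order regardless of
transfer direction, which is the whole trick.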
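
Second, Daniel's eviction idea for free-standing mem/resource objects:
take a fence reference under the per-device lru_lock this patch
introduces, drop the lock, and wait, with no dma_resv_lock involved.
struct ttm_resource_sketch, its fence field and ttm_resource_evict_wait()
are invented for illustration and assume the post-patch ttm_device with
bdev->lru_lock.

#include <linux/dma-fence.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <drm/ttm/ttm_device.h>

/* Hypothetical free-standing resource: it sits on a device LRU list and
 * remembers the fence of its last user; bdev->lru_lock protects both. */
struct ttm_resource_sketch {
        struct list_head lru;
        struct dma_fence *fence;
};

/* Wait for the last user of @res before reusing its backing storage.
 * The reference taken under the lock keeps the fence alive after the
 * lock is dropped, so the potentially long wait runs unlocked. */
static int ttm_resource_evict_wait(struct ttm_device *bdev,
                                   struct ttm_resource_sketch *res)
{
        struct dma_fence *fence;
        int ret = 0;

        spin_lock(&bdev->lru_lock);
        fence = dma_fence_get(res->fence);      /* NULL-safe */
        spin_unlock(&bdev->lru_lock);

        if (fence) {
                ret = dma_fence_wait(fence, true);      /* interruptible */
                dma_fence_put(fence);
        }
        return ret;
}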