Using Abaqus to torture the amdgpu driver enough times runs into the deadlock on locking the first busy BO. The caller then returns EAGAIN, and eventually dm_plane_helper_prepare_fb pops up the pin-failed error message. For this case, can we extend patch #7 to treat EAGAIN like ERESTARTSYS, which would filter out the annoying error message?

Thanks,
Prike

-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@xxxxxxxxx>
Sent: Thursday, May 23, 2019 7:04 PM
To: Zhou, David(ChunMing) <David1.Zhou@xxxxxxx>; Olsak, Marek <Marek.Olsak@xxxxxxx>; Zhou, David(ChunMing) <David1.Zhou@xxxxxxx>; Liang, Prike <Prike.Liang@xxxxxxx>; dri-devel@xxxxxxxxxxxxxxxxxxxxx; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Subject: Re: [PATCH 06/10] drm/ttm: fix busy memory to fail other user v10

On 23.05.19 at 12:24, zhoucm1 wrote:
>
> On 2019-05-22 20:59, Christian König wrote:
>> BOs on the LRU might be blocked during command submission and cause
>> OOM situations.
>>
>> Avoid this by blocking for the first busy BO not locked by the same
>> ticket as the BO we are searching space for.
>>
>> v10: completely start over with the patch since we didn't handle a
>> whole bunch of corner cases.
>>
>> Signed-off-by: Christian König <christian.koenig@xxxxxxx>
>> ---
>>   drivers/gpu/drm/ttm/ttm_bo.c | 77 ++++++++++++++++++++++++++++++------
>>   1 file changed, 66 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
>> index 4c6389d849ed..861facac33d4 100644
>> --- a/drivers/gpu/drm/ttm/ttm_bo.c
>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
>> @@ -771,32 +771,72 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
>>    * b. Otherwise, trylock it.
>>    */
>>   static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
>> -                                        struct ttm_operation_ctx *ctx, bool *locked)
>> +                                        struct ttm_operation_ctx *ctx, bool *locked,
>> +                                        bool *busy)
>>   {
>>           bool ret = false;
>>
>> -        *locked = false;
>>           if (bo->resv == ctx->resv) {
>>                   reservation_object_assert_held(bo->resv);
>>                   if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
>>                       || !list_empty(&bo->ddestroy))
>>                           ret = true;
>> +                *locked = false;
>> +                if (busy)
>> +                        *busy = false;
>>           } else {
>> -                *locked = reservation_object_trylock(bo->resv);
>> -                ret = *locked;
>> +                ret = reservation_object_trylock(bo->resv);
>> +                *locked = ret;
>> +                if (busy)
>> +                        *busy = !ret;
>>           }
>>
>>           return ret;
>>   }
>>
>> +/**
>> + * ttm_mem_evict_wait_busy - wait for a busy BO to become available
>> + *
>> + * @busy_bo: BO which couldn't be locked with trylock
>> + * @ctx: operation context
>> + * @ticket: acquire ticket
>> + *
>> + * Try to lock a busy buffer object to avoid failing eviction.
>> + */
>> +static int ttm_mem_evict_wait_busy(struct ttm_buffer_object *busy_bo,
>> +                                   struct ttm_operation_ctx *ctx,
>> +                                   struct ww_acquire_ctx *ticket)
>> +{
>> +        int r;
>> +
>> +        if (!busy_bo || !ticket)
>> +                return -EBUSY;
>> +
>> +        if (ctx->interruptible)
>> +                r = reservation_object_lock_interruptible(busy_bo->resv,
>> +                                                          ticket);
>> +        else
>> +                r = reservation_object_lock(busy_bo->resv, ticket);
>> +
>> +        /*
>> +         * TODO: It would be better to keep the BO locked until allocation is at
>> +         * least tried one more time, but that would mean a much larger rework
>> +         * of TTM.
>> +         */
>> +        if (!r)
>> +                reservation_object_unlock(busy_bo->resv);
>> +
>> +        return r == -EDEADLK ? -EAGAIN : r;
>> +}
>> +
>>   static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>>                                  uint32_t mem_type,
>>                                  const struct ttm_place *place,
>> -                                struct ttm_operation_ctx *ctx)
>> +                                struct ttm_operation_ctx *ctx,
>> +                                struct ww_acquire_ctx *ticket)
>>   {
>> +        struct ttm_buffer_object *bo = NULL, *busy_bo = NULL;
>>           struct ttm_bo_global *glob = bdev->glob;
>>           struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>> -        struct ttm_buffer_object *bo = NULL;
>>           bool locked = false;
>>           unsigned i;
>>           int ret;
>> @@ -804,8 +844,15 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>>           spin_lock(&glob->lru_lock);
>>           for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>                   list_for_each_entry(bo, &man->lru[i], lru) {
>> -                        if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked))
>> +                        bool busy;
>> +
>> +                        if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
>> +                                                            &busy)) {
>> +                                if (busy && !busy_bo &&
>> +                                    bo->resv->lock.ctx != ticket)
>> +                                        busy_bo = bo;
>>                                   continue;
>> +                        }
>>
>>                           if (place && !bdev->driver->eviction_valuable(bo,
>>                                                                         place)) {
>> @@ -824,8 +871,13 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>>           }
>>
>>           if (!bo) {
>> +                if (busy_bo)
>> +                        ttm_bo_get(busy_bo);
>>                   spin_unlock(&glob->lru_lock);
>> -                return -EBUSY;
>> +                ret = ttm_mem_evict_wait_busy(busy_bo, ctx, ticket);
> If you rely on EAGAIN, why do you still try to lock busy_bo? Any
> negative effect if we directly return EAGAIN without trying the lock?

Yeah, that would burn a lot of CPU cycles because we would essentially busy wait for the BO to become unlocked.

When we only return in case of a deadlock, the other thread can continue with its eviction while we reacquire all locks during EAGAIN handling.

Even directly unlocking the BO as I do here is a bit questionable. But I couldn't get the original logic of finding a new BO to evict to work correctly; that's why I have the TODO comment in the function itself as well.

Christian.

>
> -David
>> +                if (busy_bo)
>> +                        ttm_bo_put(busy_bo);
>> +                return ret;
>>           }
>>
>>           kref_get(&bo->list_kref);
>> @@ -911,7 +963,8 @@ static int ttm_bo_mem_force_space(struct ttm_buffer_object *bo,
>>                           return ret;
>>                   if (mem->mm_node)
>>                           break;
>> -                ret = ttm_mem_evict_first(bdev, mem->mem_type, place, ctx);
>> +                ret = ttm_mem_evict_first(bdev, mem->mem_type, place, ctx,
>> +                                          bo->resv->lock.ctx);
>>                   if (unlikely(ret != 0))
>>                           return ret;
>>           } while (1);
>> @@ -1426,7 +1479,8 @@ static int ttm_bo_force_list_clean(struct ttm_bo_device *bdev,
>>           for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>                   while (!list_empty(&man->lru[i])) {
>>                           spin_unlock(&glob->lru_lock);
>> -                        ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx);
>> +                        ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx,
>> +                                                  NULL);
>>                           if (ret)
>>                                   return ret;
>>                           spin_lock(&glob->lru_lock);
>> @@ -1797,7 +1851,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob, struct ttm_operation_ctx *ctx)
>>           spin_lock(&glob->lru_lock);
>>           for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>                   list_for_each_entry(bo, &glob->swap_lru[i], swap) {
>> -                        if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked)) {
>> +                        if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
>> +                                                           NULL)) {
>>                                   ret = 0;
>>                                   break;
>>                           }
>> --
>> 2.17.1
>>
>
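A note on the back-off semantics discussed above: the heart of the patch is the EDEADLK -> EAGAIN translation in ttm_mem_evict_wait_busy(). Since reservation objects are built on ww_mutex (the patch itself reads bo->resv->lock.ctx), the pattern can be shown as a minimal standalone sketch; the helper name below is illustrative and not part of the patch:

#include <linux/errno.h>
#include <linux/ww_mutex.h>

/*
 * Minimal sketch of the wait-then-back-off pattern: block once on a
 * contended lock, then drop it again immediately.  EDEADLK from the
 * wait/wound protocol is translated to EAGAIN so the caller knows to
 * release all reservations it holds and restart the whole operation,
 * instead of busy waiting on the contended BO.
 */
static int example_wait_on_contended(struct ww_mutex *contended,
                                     struct ww_acquire_ctx *ticket,
                                     bool interruptible)
{
        int r;

        if (interruptible)
                r = ww_mutex_lock_interruptible(contended, ticket);
        else
                r = ww_mutex_lock(contended, ticket);

        /* We only wanted to wait for the current holder, not keep the lock. */
        if (!r)
                ww_mutex_unlock(contended);

        return r == -EDEADLK ? -EAGAIN : r;
}

The EAGAIN filtering Prike asks for at the top of the thread would then be a small change on the amdgpu display side. Neither dm_plane_helper_prepare_fb() nor patch #7 is shown in this thread, so the following is only a guess at the shape of that change, with the amdgpu_bo_pin() call standing in for the real call site:

        r = amdgpu_bo_pin(rbo, domain);
        if (unlikely(r != 0)) {
                /*
                 * Like ERESTARTSYS, EAGAIN means "back off and retry",
                 * not a real failure, so don't print the pin-failed
                 * message for it.
                 */
                if (r != -ERESTARTSYS && r != -EAGAIN)
                        DRM_ERROR("Failed to pin framebuffer with error %d\n", r);
                return r;
        }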