Thanks Christian, will refine the code per your suggestion. Best Wishes, Emily Deng > -----Original Message----- > From: Christian König [mailto:ckoenig.leichtzumerken at gmail.com] > Sent: Wednesday, April 04, 2018 6:02 PM > To: Deng, Emily <Emily.Deng at amd.com>; amd-gfx at lists.freedesktop.org > Cc: Liu, Monk <Monk.Liu at amd.com> > Subject: Re: [PATCH] drm/gpu-sched: fix force APP kill hang(v2) > > Am 04.04.2018 um 11:37 schrieb Emily Deng: > > issue: > > there are VMC page faults occurring if force APP kill during 3dmark test, > > the cause is in entity_fini we manually signal all those jobs in > > entity's queue which confuses the sync/dep > > mechanism: > > > > 1)page fault occurred in sdma's clear job which operates on shadow > > buffer, and shadow buffer's Gart table is cleaned by ttm_bo_release > > since the fence in its reservation was fake signaled by entity_fini() > > under the case of SIGKILL received. > > > > 2)page fault occurred in gfx's job because during the lifetime of gfx > > job we manually fake signal all jobs from its entity in entity_fini(), > > thus the unmapping/clear PTE job depending on those result fences is > > satisfied and sdma starts clearing the PTE and leads to a GFX page fault. > > > > fix: > > 1)should at least wait for all jobs already scheduled to complete in > > entity_fini() if SIGKILL is the case. > > > > 2)if a fence is signaled and tries to clear some entity's dependency, > > should set this entity guilty to prevent its job from really running since the > > dependency is fake signaled. > > > > v2: > > splitting drm_sched_entity_fini() into two functions: > > 1)The first one does the waiting, removes the entity from the > > runqueue and returns an error when the process was killed. > > 2)The second one then goes over the entity, installs itself as completion > > signal for the remaining jobs and signals all jobs with an error code. > > First of all this patch won't work like this. 
You need to call the first part > before the VM teardown in amdgpu_driver_postclose_kms() and the second > part after the VM teardown. > > Then please come up with a better name than fini1 and fini2, something like > drm_sched_entity_cleanup() or something like this. > > Then I think it is harmless to call the _cleanup() function from the > _fini() function and only change the places where it makes sense to > explicitely call _cleanup(). > > E.g. add a new function amdgpu_ctx_mgr_cleanup() which calls the > cleanup() for each still open ctx and then modify the order in > amdgpu_driver_postclose_kms(). > > Regards, > Christian. > > > > > Signed-off-by: Monk Liu <Monk.Liu at amd.com> > > Signed-off-by: Emily Deng <Emily.Deng at amd.com> > > --- > > drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 13 +++++- > > drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 4 +- > > drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c | 3 +- > > drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c | 3 +- > > drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 8 +++- > > drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 6 ++- > > drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c | 3 +- > > drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c | 3 +- > > drivers/gpu/drm/scheduler/gpu_scheduler.c | 67 > +++++++++++++++++++++++++------ > > include/drm/gpu_scheduler.h | 7 +++- > > 10 files changed, 93 insertions(+), 24 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c > > index 09d35051..69dadf9 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c > > @@ -104,8 +104,13 @@ static int amdgpu_ctx_init(struct amdgpu_device > > *adev, > > > > failed: > > for (j = 0; j < i; j++) > > - drm_sched_entity_fini(&adev->rings[j]->sched, > > + drm_sched_entity_fini1(&adev->rings[j]->sched, > > &ctx->rings[j].entity); > > + > > + for (j = 0; j < i; j++) > > + drm_sched_entity_fini2(&adev->rings[j]->sched, > > + &ctx->rings[j].entity); > > + > > kfree(ctx->fences); > 
> ctx->fences = NULL; > > return r; > > @@ -126,7 +131,11 @@ static void amdgpu_ctx_fini(struct amdgpu_ctx > *ctx) > > ctx->fences = NULL; > > > > for (i = 0; i < adev->num_rings; i++) > > - drm_sched_entity_fini(&adev->rings[i]->sched, > > + drm_sched_entity_fini1(&adev->rings[i]->sched, > > + &ctx->rings[i].entity); > > + > > + for (i = 0; i < adev->num_rings; i++) > > + drm_sched_entity_fini2(&adev->rings[i]->sched, > > &ctx->rings[i].entity); > > > > amdgpu_queue_mgr_fini(adev, &ctx->queue_mgr); diff --git > > a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c > > index d2ab404..83d46bb 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c > > @@ -131,8 +131,10 @@ static int amdgpu_ttm_global_init(struct > amdgpu_device *adev) > > static void amdgpu_ttm_global_fini(struct amdgpu_device *adev) > > { > > if (adev->mman.mem_global_referenced) { > > - drm_sched_entity_fini(adev->mman.entity.sched, > > + drm_sched_entity_fini1(adev->mman.entity.sched, > > &adev->mman.entity); > > + drm_sched_entity_fini2(adev->mman.entity.sched, > > + &adev->mman.entity); > > mutex_destroy(&adev->mman.gtt_window_lock); > > drm_global_item_unref(&adev->mman.bo_global_ref.ref); > > drm_global_item_unref(&adev->mman.mem_global_ref); > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c > > index 627542b..74a6953 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c > > @@ -277,7 +277,8 @@ int amdgpu_uvd_sw_fini(struct amdgpu_device > *adev) > > int i; > > kfree(adev->uvd.saved_bo); > > > > - drm_sched_entity_fini(&adev->uvd.ring.sched, &adev->uvd.entity); > > + drm_sched_entity_fini1(&adev->uvd.ring.sched, &adev->uvd.entity); > > + drm_sched_entity_fini2(&adev->uvd.ring.sched, &adev->uvd.entity); > > > > amdgpu_bo_free_kernel(&adev->uvd.vcpu_bo, > > &adev->uvd.gpu_addr, > > diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c > > index a33804b..4166d26 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c > > @@ -212,7 +212,8 @@ int amdgpu_vce_sw_fini(struct amdgpu_device > *adev) > > if (adev->vce.vcpu_bo == NULL) > > return 0; > > > > - drm_sched_entity_fini(&adev->vce.ring[0].sched, &adev->vce.entity); > > + drm_sched_entity_fini1(&adev->vce.ring[0].sched, &adev- > >vce.entity); > > + drm_sched_entity_fini2(&adev->vce.ring[0].sched, &adev- > >vce.entity); > > > > amdgpu_bo_free_kernel(&adev->vce.vcpu_bo, &adev- > >vce.gpu_addr, > > (void **)&adev->vce.cpu_addr); > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c > > index 58e4953..268c0c3 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c > > @@ -129,9 +129,13 @@ int amdgpu_vcn_sw_fini(struct amdgpu_device > > *adev) > > > > kfree(adev->vcn.saved_bo); > > > > - drm_sched_entity_fini(&adev->vcn.ring_dec.sched, &adev- > >vcn.entity_dec); > > + drm_sched_entity_fini1(&adev->vcn.ring_dec.sched, > > +&adev->vcn.entity_dec); > > > > - drm_sched_entity_fini(&adev->vcn.ring_enc[0].sched, &adev- > >vcn.entity_enc); > > + drm_sched_entity_fini1(&adev->vcn.ring_enc[0].sched, > > +&adev->vcn.entity_enc); > > + > > + drm_sched_entity_fini2(&adev->vcn.ring_dec.sched, > > +&adev->vcn.entity_dec); > > + > > + drm_sched_entity_fini2(&adev->vcn.ring_enc[0].sched, > > +&adev->vcn.entity_enc); > > > > amdgpu_bo_free_kernel(&adev->vcn.vcpu_bo, > > &adev->vcn.gpu_addr, > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > > index 2447429..084a39d 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > > @@ -2456,7 +2456,8 @@ int amdgpu_vm_init(struct amdgpu_device > *adev, struct amdgpu_vm *vm, > > 
vm->root.base.bo = NULL; > > > > error_free_sched_entity: > > - drm_sched_entity_fini(&ring->sched, &vm->entity); > > + drm_sched_entity_fini1(&ring->sched, &vm->entity); > > + drm_sched_entity_fini2(&ring->sched, &vm->entity); > > > > return r; > > } > > @@ -2520,7 +2521,8 @@ void amdgpu_vm_fini(struct amdgpu_device > *adev, struct amdgpu_vm *vm) > > spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, > flags); > > } > > > > - drm_sched_entity_fini(vm->entity.sched, &vm->entity); > > + drm_sched_entity_fini1(vm->entity.sched, &vm->entity); > > + drm_sched_entity_fini2(vm->entity.sched, &vm->entity); > > > > if (!RB_EMPTY_ROOT(&vm->va.rb_root)) { > > dev_err(adev->dev, "still active bo inside vm\n"); diff --git > > a/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c > > b/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c > > index f26f515..3c57a6e9 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c > > +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c > > @@ -469,7 +469,8 @@ static int uvd_v6_0_sw_fini(void *handle) > > return r; > > > > if (uvd_v6_0_enc_support(adev)) { > > - drm_sched_entity_fini(&adev->uvd.ring_enc[0].sched, > &adev->uvd.entity_enc); > > + drm_sched_entity_fini1(&adev->uvd.ring_enc[0].sched, > &adev->uvd.entity_enc); > > + drm_sched_entity_fini2(&adev->uvd.ring_enc[0].sched, > > +&adev->uvd.entity_enc); > > > > for (i = 0; i < adev->uvd.num_enc_rings; ++i) > > amdgpu_ring_fini(&adev->uvd.ring_enc[i]); > > diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c > > b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c > > index 280c082..a422acf 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c > > +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c > > @@ -472,7 +472,8 @@ static int uvd_v7_0_sw_fini(void *handle) > > if (r) > > return r; > > > > - drm_sched_entity_fini(&adev->uvd.ring_enc[0].sched, &adev- > >uvd.entity_enc); > > + drm_sched_entity_fini1(&adev->uvd.ring_enc[0].sched, &adev- > >uvd.entity_enc); > > + drm_sched_entity_fini2(&adev->uvd.ring_enc[0].sched, > > 
+&adev->uvd.entity_enc); > > > > for (i = 0; i < adev->uvd.num_enc_rings; ++i) > > amdgpu_ring_fini(&adev->uvd.ring_enc[i]); > > diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c > > b/drivers/gpu/drm/scheduler/gpu_scheduler.c > > index 310275e..abeb3ff 100644 > > --- a/drivers/gpu/drm/scheduler/gpu_scheduler.c > > +++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c > > @@ -136,6 +136,8 @@ int drm_sched_entity_init(struct > drm_gpu_scheduler *sched, > > entity->rq = rq; > > entity->sched = sched; > > entity->guilty = guilty; > > + entity->fini_status = 0; > > + entity->finished = NULL; > > > > spin_lock_init(&entity->rq_lock); > > spin_lock_init(&entity->queue_lock); > > @@ -197,19 +199,29 @@ static bool drm_sched_entity_is_ready(struct > drm_sched_entity *entity) > > return true; > > } > > > > +static void drm_sched_entity_fini_job_cb(struct dma_fence *f, > > + struct dma_fence_cb *cb) > > +{ > > + struct drm_sched_job *job = container_of(cb, struct drm_sched_job, > > + finish_cb); > > + drm_sched_fence_finished(job->s_fence); > > + WARN_ON(job->s_fence->parent); > > + dma_fence_put(&job->s_fence->finished); > > + job->sched->ops->free_job(job); > > +} > > + > > /** > > * Destroy a context entity > > * > > * @sched Pointer to scheduler instance > > * @entity The pointer to a valid scheduler entity > > * > > - * Cleanup and free the allocated resources. > > + * Splitting drm_sched_entity_fini() into two functions, The first > > + one is does the waiting, > > + * removes the entity from the runqueue and returns an error when the > process was killed. 
> > */ > > -void drm_sched_entity_fini(struct drm_gpu_scheduler *sched, > > +void drm_sched_entity_fini1(struct drm_gpu_scheduler *sched, > > struct drm_sched_entity *entity) > > { > > - int r; > > - > > if (!drm_sched_entity_is_initialized(sched, entity)) > > return; > > /** > > @@ -217,13 +229,27 @@ void drm_sched_entity_fini(struct > drm_gpu_scheduler *sched, > > * queued IBs or discard them on SIGKILL > > */ > > if ((current->flags & PF_SIGNALED) && current->exit_code == SIGKILL) > > - r = -ERESTARTSYS; > > + entity->fini_status = -ERESTARTSYS; > > else > > - r = wait_event_killable(sched->job_scheduled, > > + entity->fini_status = wait_event_killable(sched- > >job_scheduled, > > drm_sched_entity_is_idle(entity)); > > drm_sched_entity_set_rq(entity, NULL); > > - if (r) { > > +} > > + > > +/** > > + * Destroy a context entity > > + * > > + * @sched Pointer to scheduler instance > > + * @entity The pointer to a valid scheduler entity > > + * > > + * The second one then goes over the entity and signals all jobs with an > error code. > > + */ > > +void drm_sched_entity_fini2(struct drm_gpu_scheduler *sched, > > + struct drm_sched_entity *entity) { > > + if (entity->fini_status) { > > struct drm_sched_job *job; > > + int r; > > > > /* Park the kernel for a moment to make sure it isn't > processing > > * our enity. 
> > @@ -241,14 +267,21 @@ void drm_sched_entity_fini(struct > drm_gpu_scheduler *sched, > > struct drm_sched_fence *s_fence = job->s_fence; > > drm_sched_fence_scheduled(s_fence); > > dma_fence_set_error(&s_fence->finished, -ESRCH); > > - drm_sched_fence_finished(s_fence); > > - WARN_ON(s_fence->parent); > > - dma_fence_put(&s_fence->finished); > > - sched->ops->free_job(job); > > + r = dma_fence_add_callback(entity->finished, &job->finish_cb, > > + drm_sched_entity_fini_job_cb); > > + if (r == -ENOENT) > > + drm_sched_entity_fini_job_cb(entity->finished, &job->finish_cb); > > + else if (r) > > + DRM_ERROR("fence add callback failed (%d)\n", r); > > + } > > + if(entity->finished){ > > + dma_fence_put(entity->finished); > > + entity->finished = NULL; > > } > > } > > } > > -EXPORT_SYMBOL(drm_sched_entity_fini); > > +EXPORT_SYMBOL(drm_sched_entity_fini1); > > +EXPORT_SYMBOL(drm_sched_entity_fini2); > > > > static void drm_sched_entity_wakeup(struct dma_fence *f, struct > dma_fence_cb *cb) > > { > > @@ -530,6 +563,11 @@ void drm_sched_job_recovery(struct > drm_gpu_scheduler *sched) > > spin_unlock(&sched->job_list_lock); > > fence = sched->ops->run_job(s_job); > > atomic_inc(&sched->hw_rq_count); > > + > > + if(s_job->entity->finished) > > + dma_fence_put(s_job->entity->finished); > > + s_job->entity->finished = dma_fence_get(&s_fence- > >finished); > > + > > if (fence) { > > s_fence->parent = dma_fence_get(fence); > > r = dma_fence_add_callback(fence, &s_fence->cb, > @@ -556,6 +594,7 > > @@ int drm_sched_job_init(struct drm_sched_job *job, > > void *owner) > > { > > job->sched = sched; > > + job->entity = entity; > > job->s_priority = entity->rq - sched->sched_rq; > > job->s_fence = drm_sched_fence_create(entity, owner); > > if (!job->s_fence) > > @@ -669,6 +708,10 @@ static int drm_sched_main(void *param) > > fence = sched->ops->run_job(sched_job); > > drm_sched_fence_scheduled(s_fence); > > > > + if(entity->finished) > > + dma_fence_put(entity->finished); > > + 
entity->finished = dma_fence_get(&s_fence->finished); > > + > > if (fence) { > > s_fence->parent = dma_fence_get(fence); > > r = dma_fence_add_callback(fence, &s_fence->cb, > diff --git > > a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index > > dfd54fb..e0a5a53 100644 > > --- a/include/drm/gpu_scheduler.h > > +++ b/include/drm/gpu_scheduler.h > > @@ -63,6 +63,8 @@ struct drm_sched_entity { > > struct dma_fence *dependency; > > struct dma_fence_cb cb; > > atomic_t *guilty; /* points to ctx's guilty */ > > + uint32_t fini_status; > > + struct dma_fence *finished; > > }; > > > > /** > > @@ -99,6 +101,7 @@ struct drm_sched_job { > > uint64_t id; > > atomic_t karma; > > enum drm_sched_priority s_priority; > > + struct drm_sched_entity *entity; > > }; > > > > static inline bool drm_sched_invalidate_job(struct drm_sched_job > > *s_job, @@ -148,7 +151,9 @@ int drm_sched_entity_init(struct > drm_gpu_scheduler *sched, > > struct drm_sched_entity *entity, > > struct drm_sched_rq *rq, > > uint32_t jobs, atomic_t *guilty); -void > > drm_sched_entity_fini(struct drm_gpu_scheduler *sched, > > +void drm_sched_entity_fini1(struct drm_gpu_scheduler *sched, > > + struct drm_sched_entity *entity); void > > +drm_sched_entity_fini2(struct drm_gpu_scheduler *sched, > > struct drm_sched_entity *entity); > > void drm_sched_entity_push_job(struct drm_sched_job *sched_job, > > struct drm_sched_entity *entity);