On 13.04.2018 at 09:01, Emily Deng wrote:
> issue:
> VMC page faults occur if the app is force-killed during a 3dmark
> test. The cause is that in entity_fini() we manually signal all
> jobs in the entity's queue, which confuses the sync/dependency
> mechanism:
>
> 1) A page fault occurred in SDMA's clear job, which operates on the
> shadow buffer: the shadow buffer's GART table was cleaned up by
> ttm_bo_release() because the fence in its reservation was fake
> signaled by entity_fini() after SIGKILL was received.
>
> 2) A page fault occurred in a GFX job because, during the lifetime
> of the GFX job, we manually fake signaled all jobs from its entity
> in entity_fini(). The unmapping/clear-PTE job that depends on those
> result fences was therefore satisfied, SDMA started clearing the
> PTEs, and that led to the GFX page fault.
>
> fix:
> 1) In entity_fini() we should at least wait for all already
> scheduled jobs to complete if SIGKILL is the case.
>
> 2) If a fence is signaled and tries to clear some entity's
> dependency, we should set that entity guilty to prevent its job
> from really running, since the dependency was fake signaled.
>
> v2:
> Split drm_sched_entity_fini() into two functions:
> 1) The first one does the waiting, removes the entity from the
> runqueue and returns an error when the process was killed.
> 2) The second one then goes over the entity, installs it as a
> completion signal for the remaining jobs and signals all jobs
> with an error code.
>
> v3:
> 1) Replace fini1 and fini2 with better names.
> 2) Call the first part before the VM teardown in
> amdgpu_driver_postclose_kms() and the second part
> after the VM teardown.
> 3) Keep the original function drm_sched_entity_fini() to
> refine the code.
>
> v4:
> 1) Rename entity->finished to entity->last_scheduled.
> 2) Rename drm_sched_entity_fini_job_cb() to
> drm_sched_entity_kill_jobs_cb().
> 3) Pass NULL to drm_sched_entity_kill_jobs_cb() if -ENOENT.
> 4) Change the type of entity->fini_status to "int".
> 5) Remove the check of entity->finished.
>
> Signed-off-by: Monk Liu <Monk.Liu at amd.com>
> Signed-off-by: Emily Deng <Emily.Deng at amd.com>

At least off hand that looks really good. Patch is
Reviewed-by: Christian König <christian.koenig at amd.com>.

Andrey, David, you guys also recently hacked on the scheduler, so can
you please take a look as well?

Thanks,
Christian.
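For readers jumping in: the patch keeps drm_sched_entity_fini() as a
wrapper and splits the work into two exported halves. A condensed
sketch of what each half does (comments mine, not the literal patch
code; see the diff below for the details):

	void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
				   struct drm_sched_entity *entity)
	{
		/* Part 1: wait for all jobs already pushed to the
		 * hardware (or give up right away on SIGKILL), remove
		 * the entity from its runqueue and record the result
		 * in entity->fini_status.
		 */
		drm_sched_entity_do_release(sched, entity);

		/* Part 2: mark every job still sitting in the
		 * entity's queue with -ESRCH and signal it only once
		 * entity->last_scheduled has completed, instead of
		 * fake signaling everything immediately.
		 */
		drm_sched_entity_cleanup(sched, entity);
	}

amdgpu calls the two halves separately so that the VM teardown can
happen in between.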
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h       |  2 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c   | 64 ++++++++++++++++++++++++----
>  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c   |  5 ++-
>  drivers/gpu/drm/scheduler/gpu_scheduler.c | 71 ++++++++++++++++++++++++++-----
>  include/drm/gpu_scheduler.h               |  7 +++
>  5 files changed, 128 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 5734871..b3d047d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -681,6 +681,8 @@ int amdgpu_ctx_ioctl(struct drm_device *dev, void *data,
>  int amdgpu_ctx_wait_prev_fence(struct amdgpu_ctx *ctx, unsigned ring_id);
>
>  void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr);
> +void amdgpu_ctx_mgr_entity_cleanup(struct amdgpu_ctx_mgr *mgr);
> +void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr);
>  void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr);
>
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> index 09d35051..eb80edf 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> @@ -111,8 +111,9 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev,
>  	return r;
>  }
>
> -static void amdgpu_ctx_fini(struct amdgpu_ctx *ctx)
> +static void amdgpu_ctx_fini(struct kref *ref)
>  {
> +	struct amdgpu_ctx *ctx = container_of(ref, struct amdgpu_ctx, refcount);
>  	struct amdgpu_device *adev = ctx->adev;
>  	unsigned i, j;
>
> @@ -125,13 +126,11 @@ static void amdgpu_ctx_fini(struct amdgpu_ctx *ctx)
>  	kfree(ctx->fences);
>  	ctx->fences = NULL;
>
> -	for (i = 0; i < adev->num_rings; i++)
> -		drm_sched_entity_fini(&adev->rings[i]->sched,
> -				      &ctx->rings[i].entity);
> -
>  	amdgpu_queue_mgr_fini(adev, &ctx->queue_mgr);
>
>  	mutex_destroy(&ctx->lock);
> +
> +	kfree(ctx);
>  }
>
>  static int amdgpu_ctx_alloc(struct amdgpu_device *adev,
> @@ -170,12 +169,15 @@ static int amdgpu_ctx_alloc(struct amdgpu_device *adev,
>  static void amdgpu_ctx_do_release(struct kref *ref)
>  {
>  	struct amdgpu_ctx *ctx;
> +	u32 i;
>
>  	ctx = container_of(ref, struct amdgpu_ctx, refcount);
>
> -	amdgpu_ctx_fini(ctx);
> +	for (i = 0; i < ctx->adev->num_rings; i++)
> +		drm_sched_entity_fini(&ctx->adev->rings[i]->sched,
> +				      &ctx->rings[i].entity);
>
> -	kfree(ctx);
> +	amdgpu_ctx_fini(ref);
>  }
>
>  static int amdgpu_ctx_free(struct amdgpu_fpriv *fpriv, uint32_t id)
> @@ -435,16 +437,62 @@ void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr)
>  	idr_init(&mgr->ctx_handles);
>  }
>
> +void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr)
> +{
> +	struct amdgpu_ctx *ctx;
> +	struct idr *idp;
> +	uint32_t id, i;
> +
> +	idp = &mgr->ctx_handles;
> +
> +	idr_for_each_entry(idp, ctx, id) {
> +
> +		if (!ctx->adev)
> +			return;
> +
> +		for (i = 0; i < ctx->adev->num_rings; i++)
> +			if (kref_read(&ctx->refcount) == 1)
> +				drm_sched_entity_do_release(&ctx->adev->rings[i]->sched,
> +						&ctx->rings[i].entity);
> +			else
> +				DRM_ERROR("ctx %p is still alive\n", ctx);
> +	}
> +}
> +
> +void amdgpu_ctx_mgr_entity_cleanup(struct amdgpu_ctx_mgr *mgr)
> +{
> +	struct amdgpu_ctx *ctx;
> +	struct idr *idp;
> +	uint32_t id, i;
> +
> +	idp = &mgr->ctx_handles;
> +
> +	idr_for_each_entry(idp, ctx, id) {
> +
> +		if (!ctx->adev)
> +			return;
> +
> +		for (i = 0; i < ctx->adev->num_rings; i++)
> +			if (kref_read(&ctx->refcount) == 1)
> +				drm_sched_entity_cleanup(&ctx->adev->rings[i]->sched,
> +						&ctx->rings[i].entity);
> +			else
> +				DRM_ERROR("ctx %p is still alive\n", ctx);
> +
> +	}
> +}
> +
>  void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr)
>  {
>  	struct amdgpu_ctx *ctx;
>  	struct idr *idp;
>  	uint32_t id;
>
> +	amdgpu_ctx_mgr_entity_cleanup(mgr);
> +
>  	idp = &mgr->ctx_handles;
>
>  	idr_for_each_entry(idp, ctx, id) {
> -		if (kref_put(&ctx->refcount, amdgpu_ctx_do_release) != 1)
> +		if (kref_put(&ctx->refcount, amdgpu_ctx_fini) != 1)
>  			DRM_ERROR("ctx %p is still alive\n", ctx);
>  	}
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> index 487d39e..6cbb427 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> @@ -913,8 +913,7 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev,
>  		return;
>
>  	pm_runtime_get_sync(dev->dev);
> -
> -	amdgpu_ctx_mgr_fini(&fpriv->ctx_mgr);
> +	amdgpu_ctx_mgr_entity_fini(&fpriv->ctx_mgr);
>
>  	if (adev->asic_type != CHIP_RAVEN) {
>  		amdgpu_uvd_free_handles(adev, file_priv);
> @@ -935,6 +934,8 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev,
>  	pd = amdgpu_bo_ref(fpriv->vm.root.base.bo);
>
>  	amdgpu_vm_fini(adev, &fpriv->vm);
> +	amdgpu_ctx_mgr_fini(&fpriv->ctx_mgr);
> +
>  	if (pasid)
>  		amdgpu_pasid_free_delayed(pd->tbo.resv, pasid);
>  	amdgpu_bo_unref(&pd);
> diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c b/drivers/gpu/drm/scheduler/gpu_scheduler.c
> index 310275e..44d2198 100644
> --- a/drivers/gpu/drm/scheduler/gpu_scheduler.c
> +++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c
> @@ -136,6 +136,8 @@ int drm_sched_entity_init(struct drm_gpu_scheduler *sched,
>  	entity->rq = rq;
>  	entity->sched = sched;
>  	entity->guilty = guilty;
> +	entity->fini_status = 0;
> +	entity->last_scheduled = NULL;
>
>  	spin_lock_init(&entity->rq_lock);
>  	spin_lock_init(&entity->queue_lock);
> @@ -197,19 +199,30 @@ static bool drm_sched_entity_is_ready(struct drm_sched_entity *entity)
>  	return true;
>  }
>
> +static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
> +					  struct dma_fence_cb *cb)
> +{
> +	struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
> +						 finish_cb);
> +	drm_sched_fence_finished(job->s_fence);
> +	WARN_ON(job->s_fence->parent);
> +	dma_fence_put(&job->s_fence->finished);
> +	job->sched->ops->free_job(job);
> +}
> +
> +
>  /**
>   * Destroy a context entity
>   *
>   * @sched       Pointer to scheduler instance
>   * @entity      The pointer to a valid scheduler entity
>   *
> - * Cleanup and free the allocated resources.
> + * Splitting drm_sched_entity_fini() into two functions: the first one does
> + * the waiting, removes the entity from the runqueue and returns an error
> + * when the process was killed.
>   */
> -void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
> +void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
>  			   struct drm_sched_entity *entity)
>  {
> -	int r;
> -
>  	if (!drm_sched_entity_is_initialized(sched, entity))
>  		return;
>  	/**
> @@ -217,13 +230,28 @@ void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
>  	 * queued IBs or discard them on SIGKILL
>  	 */
>  	if ((current->flags & PF_SIGNALED) && current->exit_code == SIGKILL)
> -		r = -ERESTARTSYS;
> +		entity->fini_status = -ERESTARTSYS;
>  	else
> -		r = wait_event_killable(sched->job_scheduled,
> +		entity->fini_status = wait_event_killable(sched->job_scheduled,
>  				    drm_sched_entity_is_idle(entity));
>  	drm_sched_entity_set_rq(entity, NULL);
> -	if (r) {
> +}
> +EXPORT_SYMBOL(drm_sched_entity_do_release);
> +
> +/**
> + * Destroy a context entity
> + *
> + * @sched       Pointer to scheduler instance
> + * @entity      The pointer to a valid scheduler entity
> + *
> + * The second one then goes over the entity and signals all jobs with an
> + * error code.
> + */
> +void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched,
> +			      struct drm_sched_entity *entity)
> +{
> +	if (entity->fini_status) {
>  		struct drm_sched_job *job;
> +		int r;
>
>  		/* Park the kernel for a moment to make sure it isn't processing
>  		 * our enity.
> @@ -241,13 +269,26 @@ void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
>  			struct drm_sched_fence *s_fence = job->s_fence;
>  			drm_sched_fence_scheduled(s_fence);
>  			dma_fence_set_error(&s_fence->finished, -ESRCH);
> -			drm_sched_fence_finished(s_fence);
> -			WARN_ON(s_fence->parent);
> -			dma_fence_put(&s_fence->finished);
> -			sched->ops->free_job(job);
> +			r = dma_fence_add_callback(entity->last_scheduled, &job->finish_cb,
> +						   drm_sched_entity_kill_jobs_cb);
> +			if (r == -ENOENT)
> +				drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
> +			else if (r)
> +				DRM_ERROR("fence add callback failed (%d)\n", r);
>  		}
> +
> +		dma_fence_put(entity->last_scheduled);
> +		entity->last_scheduled = NULL;
>  	}
>  }
> +EXPORT_SYMBOL(drm_sched_entity_cleanup);
> +
> +void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
> +			   struct drm_sched_entity *entity)
> +{
> +	drm_sched_entity_do_release(sched, entity);
> +	drm_sched_entity_cleanup(sched, entity);
> +}
>  EXPORT_SYMBOL(drm_sched_entity_fini);
>
>  static void drm_sched_entity_wakeup(struct dma_fence *f, struct dma_fence_cb *cb)
> @@ -530,6 +571,10 @@ void drm_sched_job_recovery(struct drm_gpu_scheduler *sched)
>  		spin_unlock(&sched->job_list_lock);
>  		fence = sched->ops->run_job(s_job);
>  		atomic_inc(&sched->hw_rq_count);
> +
> +		dma_fence_put(s_job->entity->last_scheduled);
> +		s_job->entity->last_scheduled = dma_fence_get(&s_fence->finished);
> +
>  		if (fence) {
>  			s_fence->parent = dma_fence_get(fence);
>  			r = dma_fence_add_callback(fence, &s_fence->cb,
> @@ -556,6 +601,7 @@ int drm_sched_job_init(struct drm_sched_job *job,
>  		       void *owner)
>  {
>  	job->sched = sched;
> +	job->entity = entity;
>  	job->s_priority = entity->rq - sched->sched_rq;
>  	job->s_fence = drm_sched_fence_create(entity, owner);
>  	if (!job->s_fence)
> @@ -669,6 +715,9 @@ static int drm_sched_main(void *param)
>  		fence = sched->ops->run_job(sched_job);
>  		drm_sched_fence_scheduled(s_fence);
>
> +		dma_fence_put(entity->last_scheduled);
> +		entity->last_scheduled = dma_fence_get(&s_fence->finished);
> +
>  		if (fence) {
>  			s_fence->parent = dma_fence_get(fence);
>  			r = dma_fence_add_callback(fence, &s_fence->cb,
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index c053a32..350a62c 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -65,6 +65,8 @@ struct drm_sched_entity {
>  	struct dma_fence		*dependency;
>  	struct dma_fence_cb		cb;
>  	atomic_t			*guilty; /* points to ctx's guilty */
> +	int				fini_status;
> +	struct dma_fence		*last_scheduled;
>  };
>
>  /**
> @@ -119,6 +121,7 @@ struct drm_sched_job {
>  	uint64_t			id;
>  	atomic_t			karma;
>  	enum drm_sched_priority		s_priority;
> +	struct drm_sched_entity		*entity;
>  };
>
>  static inline bool drm_sched_invalidate_job(struct drm_sched_job *s_job,
> @@ -186,6 +189,10 @@ int drm_sched_entity_init(struct drm_gpu_scheduler *sched,
>  			  struct drm_sched_entity *entity,
>  			  struct drm_sched_rq *rq,
>  			  uint32_t jobs, atomic_t *guilty);
> +void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
> +				 struct drm_sched_entity *entity);
> +void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched,
> +			      struct drm_sched_entity *entity);
>  void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
>  			   struct drm_sched_entity *entity);
>  void drm_sched_entity_push_job(struct drm_sched_job *sched_job,
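One usage note for other drivers picking up the split API: the two
halves must bracket the teardown of whatever the unscheduled jobs
still reference, exactly as amdgpu brackets amdgpu_vm_fini() in its
postclose path. A minimal sketch, with my_file and my_vm_fini as
hypothetical placeholder names:

	static void my_driver_postclose(struct my_file *file)
	{
		/* 1) Wait for the jobs already handed to the
		 *    hardware; removes the entity from its runqueue.
		 */
		drm_sched_entity_do_release(file->sched, &file->entity);

		/* 2) Only now is it safe to drop the resources (page
		 *    tables etc.) the never-to-run jobs would touch.
		 */
		my_vm_fini(file);

		/* 3) Flush the leftover jobs out with -ESRCH. */
		drm_sched_entity_cleanup(file->sched, &file->entity);
	}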