How APP/UMD aware that a context is guilty or triggered too much loops of hang ?? Why APP/UMD voluntarily call amdgpu_ctx_query() to check whether gpu reset occurred or not ? Please be aware that for another CSP customer this "loose mode" is 100% welcome and wanted by they, and more important This approach won't cross X server at all, only the guilty process/context is rejected upon its submitting I don't agree that we should rely on ctx_query(), no one is responsible to call it from time to time -----Original Message----- From: Christian König [mailto:ckoenig.leichtzumerken@xxxxxxxxx] Sent: 2017å¹´10æ??9æ?¥ 16:28 To: Liu, Monk <Monk.Liu at amd.com>; amd-gfx at lists.freedesktop.org Subject: Re: [PATCH 10/12] drm/amdgpu/sriov:implement guilty ctx for loose reset Am 30.09.2017 um 08:03 schrieb Monk Liu: > Change-Id: I7904f362aa0f578a5cbf5d40c7a242c2c6680a92 > Signed-off-by: Monk Liu <Monk.Liu at amd.com> NAK, if a context is guilty of a GPU reset should be determined in amdgpu_ctx_query() by looking at the fences in the ring buffer. Not when the GPU reset itself occurs. Regards, Christian. > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + > drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 16 +++++++++------- > drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 1 + > drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 22 ++++++++++++++++++++++ > drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 1 + > 5 files changed, 34 insertions(+), 7 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index b40d4ba..b63e602 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -737,6 +737,7 @@ struct amdgpu_ctx { > struct dma_fence **fences; > struct amdgpu_ctx_ring rings[AMDGPU_MAX_RINGS]; > bool preamble_presented; > + bool guilty; > }; > > struct amdgpu_ctx_mgr { > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > index 6a1515e..f92962e 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > @@ -79,16 +79,19 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data) > if (cs->in.num_chunks == 0) > return 0; > > + p->ctx = amdgpu_ctx_get(fpriv, cs->in.ctx_id); > + if (!p->ctx) > + return -EINVAL; > + > + if (amdgpu_sriov_vf(p->adev) && > + amdgpu_sriov_reset_level == 0 && > + p->ctx->guilty) > + return -ENODEV; > + > chunk_array = kmalloc_array(cs->in.num_chunks, sizeof(uint64_t), GFP_KERNEL); > if (!chunk_array) > return -ENOMEM; > > - p->ctx = amdgpu_ctx_get(fpriv, cs->in.ctx_id); > - if (!p->ctx) { > - ret = -EINVAL; > - goto free_chunk; > - } > - > /* get chunks */ > chunk_array_user = u64_to_user_ptr(cs->in.chunks); > if (copy_from_user(chunk_array, chunk_array_user, @@ -184,7 +187,6 > @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data) > p->nchunks = 0; > put_ctx: > amdgpu_ctx_put(p->ctx); > -free_chunk: > kfree(chunk_array); > > return ret; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c > index 75c933b..028e9f1 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c > @@ -60,6 +60,7 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev, struct amdgpu_ctx *ctx) > rq, amdgpu_sched_jobs); > if (r) > goto failed; > + ctx->rings[i].entity.guilty = &ctx->guilty; > } > > r = amdgpu_queue_mgr_init(adev, &ctx->queue_mgr); diff --git > a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c > b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c > index 12c3092..89b0573 100644 > --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c > +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c > @@ -493,10 +493,32 @@ void amd_sched_set_queue_hang(struct amd_gpu_scheduler *sched) > void amd_sched_job_kickout(struct amd_sched_job *s_job) > { > struct amd_gpu_scheduler *sched = s_job->sched; > + struct amd_sched_entity *entity, *tmp; > + struct amd_sched_rq *rq; > + int i; > + bool found; > > spin_lock(&sched->job_list_lock); > list_del_init(&s_job->node); > spin_unlock(&sched->job_list_lock); > + > + dma_fence_set_error(&s_job->s_fence->finished, -ETIME); > + > + for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_KERNEL; i++) { > + rq = &sched->sched_rq[i]; > + > + spin_lock(&rq->lock); > + list_for_each_entry_safe(entity, tmp, &rq->entities, list) { > + if (s_job->s_entity == entity && entity->guilty) { > + *entity->guilty = true; > + found = true; > + break; > + } > + } > + spin_unlock(&rq->lock); > + if (found) > + break; > + } > } > > void amd_sched_job_recovery(struct amd_gpu_scheduler *sched) diff > --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h > b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h > index f0242aa..16c2244 100644 > --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h > +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h > @@ -49,6 +49,7 @@ struct amd_sched_entity { > > struct dma_fence *dependency; > struct dma_fence_cb cb; > + bool *guilty; /* this points to ctx's guilty */ > }; > > /**