Am 30.10.2017 um 05:15 schrieb Monk Liu: > jobs are skipped under two cases > 1)when the entity behind this job marked guilty, the job > poped from this entity's queue will be dropped in sched_main loop. > > 2)in job_recovery(), skip the scheduling job if its karma detected > above limit, and also skipped as well for other jobs sharing the > same fence context. this approach is becuase job_recovery() cannot > access job->entity due to entity may already dead. > > v2: > some logic fix > > v3: > when entity detected guilty, don't drop the job in the poping > stage, instead set its fence error as -ECANCELED > > in run_job(), skip the scheduling either:1) fence->error < 0 > or 2) there was a VRAM LOST occurred on this job. > this way we can unify the job skipping logic. > > with this feature we can introduce new gpu recover feature. > > Change-Id: I268b1c752c94e6ecd4ea78c87eb226ea3f52908a > Signed-off-by: Monk Liu <Monk.Liu at amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 13 +++++---- > drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 39 ++++++++++++++++----------- > 2 files changed, 31 insertions(+), 21 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > index f60662e..0a90c76 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > @@ -180,7 +180,7 @@ static struct dma_fence *amdgpu_job_dependency(struct amd_sched_job *sched_job, > > static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job) > { > - struct dma_fence *fence = NULL; > + struct dma_fence *fence = NULL, *finished; > struct amdgpu_device *adev; > struct amdgpu_job *job; > int r; > @@ -190,15 +190,18 @@ static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job) > return NULL; > } > job = to_amdgpu_job(sched_job); > + finished = &job->base.s_fence->finished; > adev = job->adev; > > BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL)); > > trace_amdgpu_sched_run_job(job); > - /* skip ib schedule when vram is lost */ > - if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter)) { > - dma_fence_set_error(&job->base.s_fence->finished, -ECANCELED); > - DRM_ERROR("Skip scheduling IBs!\n"); > + > + if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter)) > + dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if VRAM lost */ > + > + if (finished->error < 0) { > + DRM_INFO("Skip scheduling IBs!\n"); > } else { > r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs, job, > &fence); > diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c > index 903ef8b..3d8c994 100644 > --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c > +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c > @@ -344,6 +344,10 @@ amd_sched_entity_pop_job(struct amd_sched_entity *entity) > if (amd_sched_entity_add_dependency_cb(entity)) > return NULL; > > + /* skip jobs from entity that marked guilty */ > + if (entity->guilty && atomic_read(entity->guilty)) > + dma_fence_set_error(&sched_job->s_fence->finished, -ECANCELED); > + > spsc_queue_pop(&entity->job_queue); > return sched_job; > } > @@ -440,14 +444,6 @@ static void amd_sched_job_timedout(struct work_struct *work) > job->sched->ops->timedout_job(job); > } > > -static void amd_sched_set_guilty(struct amd_sched_job *s_job, > - struct amd_sched_entity *s_entity) > -{ > - if (atomic_inc_return(&s_job->karma) > s_job->sched->hang_limit) > - if (s_entity->guilty) > - atomic_set(s_entity->guilty, 1); > -} > - > void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *bad) > { > struct amd_sched_job *s_job; > @@ -467,21 +463,24 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_jo > spin_unlock(&sched->job_list_lock); > > if (bad) { > - bool found = false; > - > - for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_MAX; i++ ) { > + /* don't increase @bad's karma if it's from KERNEL RQ, > + * becuase sometimes GPU hang would cause kernel jobs (like VM updating jobs) > + * corrupt but keep in mind that kernel jobs always considered good. > + */ > + for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_KERNEL; i++ ) { > struct amd_sched_rq *rq = &sched->sched_rq[i]; > > spin_lock(&rq->lock); > list_for_each_entry_safe(entity, tmp, &rq->entities, list) { > if (bad->s_fence->scheduled.context == entity->fence_context) { > - found = true; > - amd_sched_set_guilty(bad, entity); > + if (atomic_inc_return(&bad->karma) > bad->sched->hang_limit) > + if (entity->guilty) > + atomic_set(entity->guilty, 1); > break; > } > } > spin_unlock(&rq->lock); > - if (found) > + if (&entity->list == &rq->entities) That needs to be "&entity->list != &rq->entities", or otherwise we only check on the first round. With that fixed the patch is Reviewed-by: Christian König <christian.koenig at amd.com>. Nice work, Christian. > break; > } > } > @@ -499,6 +498,7 @@ void amd_sched_job_kickout(struct amd_sched_job *s_job) > void amd_sched_job_recovery(struct amd_gpu_scheduler *sched) > { > struct amd_sched_job *s_job, *tmp; > + bool found_guilty = false; > int r; > > spin_lock(&sched->job_list_lock); > @@ -510,6 +510,15 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched) > list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) { > struct amd_sched_fence *s_fence = s_job->s_fence; > struct dma_fence *fence; > + uint64_t guilty_context; > + > + if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) { > + found_guilty = true; > + guilty_context = s_job->s_fence->scheduled.context; > + } > + > + if (found_guilty && s_job->s_fence->scheduled.context == guilty_context) > + dma_fence_set_error(&s_fence->finished, -ECANCELED); > > spin_unlock(&sched->job_list_lock); > fence = sched->ops->run_job(s_job); > @@ -525,7 +534,6 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched) > r); > dma_fence_put(fence); > } else { > - DRM_ERROR("Failed to run job!\n"); > amd_sched_process_job(NULL, &s_fence->cb); > } > spin_lock(&sched->job_list_lock); > @@ -663,7 +671,6 @@ static int amd_sched_main(void *param) > r); > dma_fence_put(fence); > } else { > - DRM_ERROR("Failed to run job!\n"); > amd_sched_process_job(NULL, &s_fence->cb); > } >