jobs are skipped under two cases: 1) when the entity behind this job is marked guilty, the job popped from this entity's queue will be dropped in the sched_main loop. 2) in job_recovery(), skip scheduling the job if its karma is detected above the limit, and also skip other jobs sharing the same fence context. this approach is used because job_recovery() cannot access job->entity, since the entity may already be dead. with this feature we can introduce a new gpu recover feature. Change-Id: I268b1c752c94e6ecd4ea78c87eb226ea3f52908a Signed-off-by: Monk Liu <Monk.Liu at amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 9 +++--- drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 46 +++++++++++++++++++-------- drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 2 +- 3 files changed, 39 insertions(+), 18 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index a58e3c5..f08fde9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -177,7 +177,7 @@ static struct dma_fence *amdgpu_job_dependency(struct amd_sched_job *sched_job) return fence; } -static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job) +static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job, bool skip) { struct dma_fence *fence = NULL; struct amdgpu_device *adev; @@ -194,10 +194,11 @@ static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job) BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL)); trace_amdgpu_sched_run_job(job); - /* skip ib schedule when vram is lost */ - if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter)) { + + if (skip || job->vram_lost_counter != atomic_read(&adev->vram_lost_counter)) { + /* skip ib schedule if needed, and set error */ dma_fence_set_error(&job->base.s_fence->finished, -ECANCELED); - DRM_ERROR("Skip scheduling IBs!\n"); + DRM_INFO("Skip scheduling IBs!\n"); } else { r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs, 
job, &fence); diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c index 9cbeade..29841ba 100644 --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c @@ -330,6 +330,20 @@ static bool amd_sched_entity_add_dependency_cb(struct amd_sched_entity *entity) return false; } +static void amd_sched_job_fake_signal(struct amd_sched_job *job) +{ + struct amd_sched_fence *s_fence = job->s_fence; + + dma_fence_set_error(&job->s_fence->finished, -ECANCELED); + /* fake signaling the scheduled fence */ + amd_sched_fence_scheduled(s_fence); + /* fake signaling the finished fence */ + dma_fence_put(&job->s_fence->finished); + job->sched->ops->free_job(job); + /* call CB on the s_fence, e.g. wake up WAIT_CS */ + amd_sched_fence_finished(s_fence); +} + static struct amd_sched_job * amd_sched_entity_pop_job(struct amd_sched_entity *entity) { @@ -345,6 +359,12 @@ amd_sched_entity_pop_job(struct amd_sched_entity *entity) return NULL; sched_job->s_entity = NULL; + + if (entity->guilty && atomic_read(entity->guilty)) { + amd_sched_job_fake_signal(sched_job); + sched_job = NULL; + } + spsc_queue_pop(&entity->job_queue); return sched_job; } @@ -441,13 +461,6 @@ static void amd_sched_job_timedout(struct work_struct *work) job->sched->ops->timedout_job(job); } -static void amd_sched_set_guilty(struct amd_sched_job *s_job) -{ - if (atomic_inc_return(&s_job->karma) > s_job->sched->hang_limit) - if (s_job->s_entity->guilty) - atomic_set(s_job->s_entity->guilty, 1); -} - void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *bad) { struct amd_sched_job *s_job; @@ -466,7 +479,7 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_jo } spin_unlock(&sched->job_list_lock); - if (bad) { + if (bad && atomic_inc_return(&bad->karma) > bad->sched->hang_limit) { bool found = false; for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_MAX; i++ ) { 
@@ -476,7 +489,8 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_jo list_for_each_entry_safe(entity, tmp, &rq->entities, list) { if (bad->s_fence->scheduled.context == entity->fence_context) { found = true; - amd_sched_set_guilty(bad); + if (entity->guilty) + atomic_set(entity->guilty, 1); break; } } @@ -510,9 +524,17 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched) list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) { struct amd_sched_fence *s_fence = s_job->s_fence; struct dma_fence *fence; + bool found_guilty = false, skip; + uint64_t guilty_context; + + if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) { + found_guilty = true; + guilty_context = s_job->s_fence->scheduled.context; + } + skip = (found_guilty && s_job->s_fence->scheduled.context == guilty_context); spin_unlock(&sched->job_list_lock); - fence = sched->ops->run_job(s_job); + fence = sched->ops->run_job(s_job, skip); atomic_inc(&sched->hw_rq_count); if (fence) { s_fence->parent = dma_fence_get(fence); @@ -525,7 +547,6 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched) r); dma_fence_put(fence); } else { - DRM_ERROR("Failed to run job!\n"); amd_sched_process_job(NULL, &s_fence->cb); } spin_lock(&sched->job_list_lock); @@ -650,7 +671,7 @@ static int amd_sched_main(void *param) atomic_inc(&sched->hw_rq_count); amd_sched_job_begin(sched_job); - fence = sched->ops->run_job(sched_job); + fence = sched->ops->run_job(sched_job, false); amd_sched_fence_scheduled(s_fence); if (fence) { @@ -664,7 +685,6 @@ static int amd_sched_main(void *param) r); dma_fence_put(fence); } else { - DRM_ERROR("Failed to run job!\n"); amd_sched_process_job(NULL, &s_fence->cb); } diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h index f9e3a83..a8e5a7d 100644 --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h @@ -126,7 +126,7 @@ 
static inline bool amd_sched_invalidate_job(struct amd_sched_job *s_job, int thr */ struct amd_sched_backend_ops { struct dma_fence *(*dependency)(struct amd_sched_job *sched_job); - struct dma_fence *(*run_job)(struct amd_sched_job *sched_job); + struct dma_fence *(*run_job)(struct amd_sched_job *sched_job, bool skip); void (*timedout_job)(struct amd_sched_job *sched_job); void (*free_job)(struct amd_sched_job *sched_job); }; -- 2.7.4