> + dma_fence_set_error(&s_fence->finished, -ECANCELED); > + > + /* fake signaling the scheduled fence */ > + atomic_inc(&sched->hw_rq_count); > + amd_sched_fence_scheduled(s_fence); > + wake_up(&sched->job_scheduled); Unnecessary, we know that the scheduler is already awake because it is processing this function. [ml] WHAT ??? this wake_up operate on job_scheduled, like the one at the bottom of sched_main(), It is used to wake up the thread waiting in "sched_entity_fini" on this "sched->job_scheduled", I don't understand why not necessary > + guilty_context = s_job->s_fence->scheduled.context; > + } > > + skip = (found_guilty && s_job->s_fence->scheduled.context == > +guilty_context); > spin_unlock(&sched->job_list_lock); > - fence = sched->ops->run_job(s_job); > + fence = sched->ops->run_job(s_job, skip); As far as I can see you can use amd_sched_job_fake_signal() here as well, no need for the extra skip parameter to run_job(). [ML] no you still didn't get my point, in sched_main() we use fake_signal() because the job hadn't been linked in mirror list, so "job_finish()" Cannot be invoked for those jobs, But for job_recovery(), it go through mirror list and only focus on jobs from that list, thus all jobs must be handled by "job_finis()" callback, So we need let those jobs go through the run_job() again, only except that it didn't need to submit to ring really , that way the job_finish() callback Can handle that job safely and seemless BR Monk -----Original Message----- From: Christian König [mailto:ckoenig.leichtzumerken@xxxxxxxxx] Sent: 2017å¹´10æ??26æ?¥ 18:52 To: Liu, Monk <Monk.Liu at amd.com>; amd-gfx at lists.freedesktop.org Subject: Re: [PATCH 1/7] amd/scheduler:imple job skip feature(v2) Am 26.10.2017 um 12:30 schrieb Monk Liu: > jobs are skipped under two cases > 1)when the entity behind this job marked guilty, the job poped from > this entity's queue will be dropped in sched_main loop. > > 2)in job_recovery(), skip the scheduling job if its karma detected > above limit, and also skipped as well for other jobs sharing the same > fence context. this approach is becuase job_recovery() cannot access > job->entity due to entity may already dead. > > v2: > some logic fix > > with this feature we can introduce new gpu recover feature. > > Change-Id: I268b1c752c94e6ecd4ea78c87eb226ea3f52908a > Signed-off-by: Monk Liu <Monk.Liu at amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 9 +++-- > drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 54 ++++++++++++++++++++------- > drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 2 +- > 3 files changed, 47 insertions(+), 18 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > index a58e3c5..f08fde9 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > @@ -177,7 +177,7 @@ static struct dma_fence *amdgpu_job_dependency(struct amd_sched_job *sched_job) > return fence; > } > > -static struct dma_fence *amdgpu_job_run(struct amd_sched_job > *sched_job) > +static struct dma_fence *amdgpu_job_run(struct amd_sched_job > +*sched_job, bool skip) > { > struct dma_fence *fence = NULL; > struct amdgpu_device *adev; > @@ -194,10 +194,11 @@ static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job) > BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL)); > > trace_amdgpu_sched_run_job(job); > - /* skip ib schedule when vram is lost */ > - if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter)) { > + > + if (skip || job->vram_lost_counter != atomic_read(&adev->vram_lost_counter)) { > + /* skip ib schedule if looks needed, and set error */ > dma_fence_set_error(&job->base.s_fence->finished, -ECANCELED); > - DRM_ERROR("Skip scheduling IBs!\n"); > + DRM_INFO("Skip scheduling IBs!\n"); > } else { > r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs, job, > &fence); > diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c > b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c > index 9cbeade..995661e 100644 > --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c > +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c > @@ -330,6 +330,28 @@ static bool amd_sched_entity_add_dependency_cb(struct amd_sched_entity *entity) > return false; > } > > +static void amd_sched_job_fake_signal(struct amd_sched_job *job) { > + struct amd_sched_fence *s_fence = job->s_fence; > + struct amd_gpu_scheduler *sched = job->sched; > + struct amd_sched_entity *entity = job->s_entity; > + > + dma_fence_set_error(&s_fence->finished, -ECANCELED); > + > + /* fake signaling the scheduled fence */ > + atomic_inc(&sched->hw_rq_count); > + amd_sched_fence_scheduled(s_fence); > + wake_up(&sched->job_scheduled); Unnecessary, we know that the scheduler is already awake because it is processing this function. > + > + /* fake signaling the finished fence */ > + job->s_entity = NULL; > + spsc_queue_pop(&entity->job_queue); That code isn't up to date any more, Andrey removed job->s_entity yesterday. Please rebase on amd-staging-drm-next before resending. > + > + amd_sched_process_job(NULL, &s_fence->cb); > + dma_fence_put(&s_fence->finished); > + sched->ops->free_job(job); > +} > + > static struct amd_sched_job * > amd_sched_entity_pop_job(struct amd_sched_entity *entity) > { > @@ -344,6 +366,11 @@ amd_sched_entity_pop_job(struct amd_sched_entity *entity) > if (amd_sched_entity_add_dependency_cb(entity)) > return NULL; > > + if (entity->guilty && atomic_read(entity->guilty)) { > + amd_sched_job_fake_signal(sched_job); > + return NULL; > + } > + Better move this after spsc_queue_pop below and remove the extra spsc_queue_pop from amd_sched_job_fake_signal. > sched_job->s_entity = NULL; > spsc_queue_pop(&entity->job_queue); > return sched_job; > @@ -441,13 +468,6 @@ static void amd_sched_job_timedout(struct work_struct *work) > job->sched->ops->timedout_job(job); > } > > -static void amd_sched_set_guilty(struct amd_sched_job *s_job) -{ > - if (atomic_inc_return(&s_job->karma) > s_job->sched->hang_limit) > - if (s_job->s_entity->guilty) > - atomic_set(s_job->s_entity->guilty, 1); > -} > - > void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *bad) > { > struct amd_sched_job *s_job; > @@ -466,7 +486,7 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_jo > } > spin_unlock(&sched->job_list_lock); > > - if (bad) { > + if (bad && atomic_inc_return(&bad->karma) > bad->sched->hang_limit) > +{ > bool found = false; > > for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_MAX; i++ ) > { @@ -476,7 +496,8 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_jo > list_for_each_entry_safe(entity, tmp, &rq->entities, list) { > if (bad->s_fence->scheduled.context == entity->fence_context) { > found = true; > - amd_sched_set_guilty(bad); > + if (entity->guilty) > + atomic_set(entity->guilty, 1); > break; > } > } > @@ -499,6 +520,7 @@ void amd_sched_job_kickout(struct amd_sched_job *s_job) > void amd_sched_job_recovery(struct amd_gpu_scheduler *sched) > { > struct amd_sched_job *s_job, *tmp; > + bool found_guilty = false; > int r; > > spin_lock(&sched->job_list_lock); > @@ -510,9 +532,17 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched) > list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) { > struct amd_sched_fence *s_fence = s_job->s_fence; > struct dma_fence *fence; > + bool skip; > + uint64_t guilty_context; > + > + if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) { > + found_guilty = true; > + guilty_context = s_job->s_fence->scheduled.context; > + } > > + skip = (found_guilty && s_job->s_fence->scheduled.context == > +guilty_context); > spin_unlock(&sched->job_list_lock); > - fence = sched->ops->run_job(s_job); > + fence = sched->ops->run_job(s_job, skip); As far as I can see you can use amd_sched_job_fake_signal() here as well, no need for the extra skip parameter to run_job(). Regards, Christian. > atomic_inc(&sched->hw_rq_count); > if (fence) { > s_fence->parent = dma_fence_get(fence); @@ -525,7 +555,6 @@ void > amd_sched_job_recovery(struct amd_gpu_scheduler *sched) > r); > dma_fence_put(fence); > } else { > - DRM_ERROR("Failed to run job!\n"); > amd_sched_process_job(NULL, &s_fence->cb); > } > spin_lock(&sched->job_list_lock); > @@ -650,7 +679,7 @@ static int amd_sched_main(void *param) > atomic_inc(&sched->hw_rq_count); > amd_sched_job_begin(sched_job); > > - fence = sched->ops->run_job(sched_job); > + fence = sched->ops->run_job(sched_job, false); > amd_sched_fence_scheduled(s_fence); > > if (fence) { > @@ -664,7 +693,6 @@ static int amd_sched_main(void *param) > r); > dma_fence_put(fence); > } else { > - DRM_ERROR("Failed to run job!\n"); > amd_sched_process_job(NULL, &s_fence->cb); > } > > diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h > b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h > index f9e3a83..a8e5a7d 100644 > --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h > +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h > @@ -126,7 +126,7 @@ static inline bool amd_sched_invalidate_job(struct amd_sched_job *s_job, int thr > */ > struct amd_sched_backend_ops { > struct dma_fence *(*dependency)(struct amd_sched_job *sched_job); > - struct dma_fence *(*run_job)(struct amd_sched_job *sched_job); > + struct dma_fence *(*run_job)(struct amd_sched_job *sched_job, bool > +skip); > void (*timedout_job)(struct amd_sched_job *sched_job); > void (*free_job)(struct amd_sched_job *sched_job); > };