Christian, it looks like something is being misunderstood in this patch: > - if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter)) { > + > + if (skip || job->vram_lost_counter != atomic_read(&adev->vram_lost_counter)) { > + /* skip ib schedule if looks needed, and set error */ > dma_fence_set_error(&job->base.s_fence->finished, -ECANCELED); > - DRM_ERROR("Skip scheduling IBs!\n"); > + DRM_INFO("Skip scheduling IBs!\n"); Not really needed, we could just use amd_sched_job_fake_signal() here as well. [ml] the @skip parameter is needed, and we cannot use "fake signal", because "fake signal" may already be called in the entity_pop_job() stage. We skip jobs at two places: one is in entity_pop_job(), if the scheduler can detect that the job is guilty at that time. The other one is in run_job(), via this @skip, in job_recovery(). > @@ -510,9 +524,17 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched) > list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) { > struct amd_sched_fence *s_fence = s_job->s_fence; > struct dma_fence *fence; > + bool found_guilty = false, skip; > + uint64_t guilty_context; Well if you want the variables to have an effect you should probably move them outside the loop. Otherwise the code doesn't make much sense to me. [ML] yeah, that's an error, will fix it ! > +static void amd_sched_job_fake_signal(struct amd_sched_job *job) { > + struct amd_sched_fence *s_fence = job->s_fence; > + > + dma_fence_set_error(&job->s_fence->finished, -ECANCELED); > + /* fake signaling the scheduled fence */ > + amd_sched_fence_scheduled(s_fence); > + /* fake signaling the finished fence */ > + dma_fence_put(&job->s_fence->finished); > + job->sched->ops->free_job(job); Well signaling the finished fence will free the job as well, won't it? 
[ML] no, fake signal is invoked before begin_job(), so the job_finish_cb callback hasn't been hooked in the "finished" fence; that's why I manually call "free_job()". But keep in mind that WAIT_CS may already have been called, so we still rely on signaling the "finished" fence to wake up the WAIT_CS. > + if (entity->guilty && atomic_read(entity->guilty)) { > + amd_sched_job_fake_signal(sched_job); > + sched_job = NULL; > + } > + This must come after the spsc_queue_pop() below, otherwise you mess up the spsc queue. [ML] yes, I also noticed that error, will fix the sequence ! -----Original Message----- From: Christian König [mailto:ckoenig.leichtzumerken@xxxxxxxxx] Sent: 2017-10-26 15:06 To: Liu, Monk <Monk.Liu at amd.com>; amd-gfx at lists.freedesktop.org Subject: Re: [PATCH 2/9] amd/scheduler:imple job skip feature Am 25.10.2017 um 11:22 schrieb Monk Liu: > jobs are skipped under two cases > 1)when the entity behind this job marked guilty, the job poped from > this entity's queue will be dropped in sched_main loop. > > 2)in job_recovery(), skip the scheduling job if its karma detected > above limit, and also skipped as well for other jobs sharing the same > fence context. this approach is because job_recovery() cannot access > job->entity due to entity may already dead. > > with this feature we can introduce new gpu recover feature. Sorry for the delay, totally swamped once more at the moment. 
> > Change-Id: I268b1c752c94e6ecd4ea78c87eb226ea3f52908a > Signed-off-by: Monk Liu <Monk.Liu at amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 9 +++--- > drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 46 +++++++++++++++++++-------- > drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 2 +- > 3 files changed, 39 insertions(+), 18 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > index a58e3c5..f08fde9 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > @@ -177,7 +177,7 @@ static struct dma_fence *amdgpu_job_dependency(struct amd_sched_job *sched_job) > return fence; > } > > -static struct dma_fence *amdgpu_job_run(struct amd_sched_job > *sched_job) > +static struct dma_fence *amdgpu_job_run(struct amd_sched_job > +*sched_job, bool skip) > { > struct dma_fence *fence = NULL; > struct amdgpu_device *adev; > @@ -194,10 +194,11 @@ static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job) > BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL)); > > trace_amdgpu_sched_run_job(job); > - /* skip ib schedule when vram is lost */ > - if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter)) { > + > + if (skip || job->vram_lost_counter != atomic_read(&adev->vram_lost_counter)) { > + /* skip ib schedule if looks needed, and set error */ > dma_fence_set_error(&job->base.s_fence->finished, -ECANCELED); > - DRM_ERROR("Skip scheduling IBs!\n"); > + DRM_INFO("Skip scheduling IBs!\n"); Not really needed, we could just use amd_sched_job_fake_signal() here as well. 
> } else { > r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs, job, > &fence); > diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c > b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c > index 9cbeade..29841ba 100644 > --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c > +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c > @@ -330,6 +330,20 @@ static bool amd_sched_entity_add_dependency_cb(struct amd_sched_entity *entity) > return false; > } > > +static void amd_sched_job_fake_signal(struct amd_sched_job *job) { > + struct amd_sched_fence *s_fence = job->s_fence; > + > + dma_fence_set_error(&job->s_fence->finished, -ECANCELED); > + /* fake signaling the scheduled fence */ > + amd_sched_fence_scheduled(s_fence); > + /* fake signaling the finished fence */ > + dma_fence_put(&job->s_fence->finished); > + job->sched->ops->free_job(job); Well signaling the finished fence will free the job as well, won't it? > + /* call CB on the s_fence, e.g. wake up WAIT_CS */ > + amd_sched_fence_finished(s_fence); > +} > + > static struct amd_sched_job * > amd_sched_entity_pop_job(struct amd_sched_entity *entity) > { > @@ -345,6 +359,12 @@ amd_sched_entity_pop_job(struct amd_sched_entity *entity) > return NULL; > > sched_job->s_entity = NULL; > + > + if (entity->guilty && atomic_read(entity->guilty)) { > + amd_sched_job_fake_signal(sched_job); > + sched_job = NULL; > + } > + This must come after the spsc_queue_pop() below, otherwise you mess up the spsc queue. 
> spsc_queue_pop(&entity->job_queue); > return sched_job; > } > @@ -441,13 +461,6 @@ static void amd_sched_job_timedout(struct work_struct *work) > job->sched->ops->timedout_job(job); > } > > -static void amd_sched_set_guilty(struct amd_sched_job *s_job) -{ > - if (atomic_inc_return(&s_job->karma) > s_job->sched->hang_limit) > - if (s_job->s_entity->guilty) > - atomic_set(s_job->s_entity->guilty, 1); > -} > - > void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *bad) > { > struct amd_sched_job *s_job; > @@ -466,7 +479,7 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_jo > } > spin_unlock(&sched->job_list_lock); > > - if (bad) { > + if (bad && atomic_inc_return(&bad->karma) > bad->sched->hang_limit) > +{ > bool found = false; > > for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_MAX; i++ ) > { @@ -476,7 +489,8 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_jo > list_for_each_entry_safe(entity, tmp, &rq->entities, list) { > if (bad->s_fence->scheduled.context == entity->fence_context) { > found = true; > - amd_sched_set_guilty(bad); > + if (entity->guilty) > + atomic_set(entity->guilty, 1); > break; > } > } > @@ -510,9 +524,17 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched) > list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) { > struct amd_sched_fence *s_fence = s_job->s_fence; > struct dma_fence *fence; > + bool found_guilty = false, skip; > + uint64_t guilty_context; Well if you want the variables to have an effect you should probably move them outside the loop. Otherwise the code doesn't make much sense to me. Regards, Christian. 
> + > + if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) { > + found_guilty = true; > + guilty_context = s_job->s_fence->scheduled.context; > + } > > + skip = (found_guilty && s_job->s_fence->scheduled.context == > +guilty_context); > spin_unlock(&sched->job_list_lock); > - fence = sched->ops->run_job(s_job); > + fence = sched->ops->run_job(s_job, skip); > atomic_inc(&sched->hw_rq_count); > if (fence) { > s_fence->parent = dma_fence_get(fence); @@ -525,7 +547,6 @@ void > amd_sched_job_recovery(struct amd_gpu_scheduler *sched) > r); > dma_fence_put(fence); > } else { > - DRM_ERROR("Failed to run job!\n"); > amd_sched_process_job(NULL, &s_fence->cb); > } > spin_lock(&sched->job_list_lock); > @@ -650,7 +671,7 @@ static int amd_sched_main(void *param) > atomic_inc(&sched->hw_rq_count); > amd_sched_job_begin(sched_job); > > - fence = sched->ops->run_job(sched_job); > + fence = sched->ops->run_job(sched_job, false); > amd_sched_fence_scheduled(s_fence); > > if (fence) { > @@ -664,7 +685,6 @@ static int amd_sched_main(void *param) > r); > dma_fence_put(fence); > } else { > - DRM_ERROR("Failed to run job!\n"); > amd_sched_process_job(NULL, &s_fence->cb); > } > > diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h > b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h > index f9e3a83..a8e5a7d 100644 > --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h > +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h > @@ -126,7 +126,7 @@ static inline bool amd_sched_invalidate_job(struct amd_sched_job *s_job, int thr > */ > struct amd_sched_backend_ops { > struct dma_fence *(*dependency)(struct amd_sched_job *sched_job); > - struct dma_fence *(*run_job)(struct amd_sched_job *sched_job); > + struct dma_fence *(*run_job)(struct amd_sched_job *sched_job, bool > +skip); > void (*timedout_job)(struct amd_sched_job *sched_job); > void (*free_job)(struct amd_sched_job *sched_job); > };