On 2016年06月29日 17:03, Christian König wrote: > Am 29.06.2016 um 10:09 schrieb Chunming Zhou: >> Change-Id: I8e554d34c9e477ea255e0ed2a936397aa5f665e7 >> Signed-off-by: Chunming Zhou <David1.Zhou at amd.com> >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + >> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 5 +++-- >> drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 4 ++-- >> drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 23 >> ++++++++++++++--------- >> drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 22 ++++++++++++++++++++++ >> 5 files changed, 42 insertions(+), 13 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> index 163429c8..03b0fe7 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> @@ -763,6 +763,7 @@ void amdgpu_job_free(struct amdgpu_job *job); >> int amdgpu_job_submit(struct amdgpu_job *job, struct amdgpu_ring >> *ring, >> struct amd_sched_entity *entity, void *owner, >> struct fence **f); >> +void amdgpu_job_recovery(struct amd_gpu_scheduler *sched); >> struct amdgpu_ring { >> struct amdgpu_device *adev; >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> index 5c4691c..2c8e7f4 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> @@ -1994,13 +1994,14 @@ retry: >> } >> /* restore scratch */ >> amdgpu_atombios_scratch_regs_restore(adev); >> - if (0) { >> + if (!r) { >> for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { >> struct amdgpu_ring *ring = adev->rings[i]; >> if (!ring) >> continue; >> + amdgpu_job_recovery(&ring->sched); >> kthread_unpark(ring->sched.thread); >> - amdgpu_ring_restore(ring, ring_sizes[i], ring_data[i]); >> + kfree(ring_data[i]); >> ring_sizes[i] = 0; >> ring_data[i] = NULL; >> } >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c >> index 72bf9f8..8af9903 100644 >> --- 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c >> @@ -366,9 +366,9 @@ int amdgpu_fence_driver_init_ring(struct >> amdgpu_ring *ring, >> setup_timer(&ring->fence_drv.fallback_timer, >> amdgpu_fence_fallback, >> (unsigned long)ring); >> - ring->fence_drv.num_fences_mask = num_hw_submission * 2 - 1; >> + ring->fence_drv.num_fences_mask = num_hw_submission * 4 - 1; >> spin_lock_init(&ring->fence_drv.lock); >> - ring->fence_drv.fences = kcalloc(num_hw_submission * 2, >> sizeof(void *), >> + ring->fence_drv.fences = kcalloc(num_hw_submission * 4, >> sizeof(void *), >> GFP_KERNEL); >> if (!ring->fence_drv.fences) >> return -ENOMEM; >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c >> index 34e3542..702bd9b 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c >> @@ -193,14 +193,19 @@ int amdgpu_ib_schedule(struct amdgpu_ring >> *ring, unsigned num_ibs, >> if (ring->funcs->emit_hdp_invalidate) >> amdgpu_ring_emit_hdp_invalidate(ring); >> - r = amdgpu_fence_emit(ring, &hwf); >> - if (r) { >> - dev_err(adev->dev, "failed to emit fence (%d)\n", r); >> - if (job && job->vm_id) >> - amdgpu_vm_reset_id(adev, job->vm_id); >> - amdgpu_ring_undo(ring); >> - return r; >> - } >> + if (!job || !job->fence) { >> + r = amdgpu_fence_emit(ring, &hwf); >> + if (r) { >> + dev_err(adev->dev, "failed to emit fence (%d)\n", r); >> + if (job && job->vm_id) >> + amdgpu_vm_reset_id(adev, job->vm_id); >> + amdgpu_ring_undo(ring); >> + return r; >> + } >> + } else >> + /* re-submit fence when gpu reset */ >> + amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr, >> + job->fence->seqno, AMDGPU_FENCE_FLAG_INT); > > You shouldn't resubmit the old hardware fence here, but rather > allocate a new fence number. This would introduce a new problem and complicate things. E.g. 
take old hw fences f1, f2 in a hung engine; that means sched_f1, sched_f2 are the callbacks from f1, f2. If we allocate new fence numbers when re-submitting them, we will get new fences f3, f4. When f3 is signalled, that implies f1, f2, sched_f1 and sched_f2 are all signalled, so userspace thinks the CS of sched_f2 is completed — but in fact it is not, since it is now f4/sched_f4. I don't see why we shouldn't/couldn't reuse the old hw fence. > >> /* wrap the last IB with fence */ >> if (job && job->uf_bo) { >> @@ -212,7 +217,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, >> unsigned num_ibs, >> } >> if (f) >> - *f = fence_get(hwf); >> + *f = (job && job->fence) ? job->fence : fence_get(hwf); >> if (patch_offset != ~0 && ring->funcs->patch_cond_exec) >> amdgpu_ring_patch_cond_exec(ring, patch_offset); >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c >> index 83771c1..32fad1c 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c >> @@ -39,6 +39,28 @@ static void amdgpu_job_timedout(struct >> amd_sched_job *s_job) >> amdgpu_gpu_reset(job->adev); >> } >> +void amdgpu_job_recovery(struct amd_gpu_scheduler *sched) >> +{ >> + struct amd_sched_job *s_job, *tmp; >> + >> + spin_lock(&sched->job_list_lock); >> + list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, >> node) { >> + struct amdgpu_job *job = to_amdgpu_job(s_job); >> + if (job->vm) { >> + struct amdgpu_vm_id *id = >> &job->adev->vm_manager.ids[job->vm_id]; >> + job->vm_pd_addr = >> amdgpu_bo_gpu_offset(job->vm->page_directory); >> + id->pd_gpu_addr = job->vm_pd_addr; > > That is clearly not a good idea, the page directory could be moving > somewhere else when this is happening already. Hmm, how about using id->pd_gpu_addr directly? Since the PD BO must still be there, id->pd_gpu_addr already holds the saved address. 
> >> + } >> + sched->ops->run_job(s_job); >> + } >> + s_job = list_first_entry_or_null(&sched->ring_mirror_list, >> + struct amd_sched_job, node); >> + if (s_job) >> + schedule_delayed_work(&s_job->work_tdr, sched->timeout); >> + >> + spin_unlock(&sched->job_list_lock); >> +} > > The whole function belongs into the scheduler. Agreed — once the vm_id handling is taken care of, this could indeed become a scheduler function. Thanks, David Zhou > > Christian. > >> + >> int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs, >> struct amdgpu_job **job, struct amdgpu_vm *vm) >> { >