On 29.06.2016 at 10:09, Chunming Zhou wrote:
> Change-Id: I8e554d34c9e477ea255e0ed2a936397aa5f665e7
> Signed-off-by: Chunming Zhou <David1.Zhou@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  5 +++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  |  4 ++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c     | 23 ++++++++++++++---------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 22 ++++++++++++++++++++++
>  5 files changed, 42 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 163429c8..03b0fe7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -763,6 +763,7 @@ void amdgpu_job_free(struct amdgpu_job *job);
>  int amdgpu_job_submit(struct amdgpu_job *job, struct amdgpu_ring *ring,
>  		      struct amd_sched_entity *entity, void *owner,
>  		      struct fence **f);
> +void amdgpu_job_recovery(struct amd_gpu_scheduler *sched);
>
>  struct amdgpu_ring {
>  	struct amdgpu_device		*adev;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 5c4691c..2c8e7f4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -1994,13 +1994,14 @@ retry:
>  	}
>  	/* restore scratch */
>  	amdgpu_atombios_scratch_regs_restore(adev);
> -	if (0) {
> +	if (!r) {
>  		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>  			struct amdgpu_ring *ring = adev->rings[i];
>  			if (!ring)
>  				continue;
> +			amdgpu_job_recovery(&ring->sched);
>  			kthread_unpark(ring->sched.thread);
> -			amdgpu_ring_restore(ring, ring_sizes[i], ring_data[i]);
> +			kfree(ring_data[i]);
>  			ring_sizes[i] = 0;
>  			ring_data[i] = NULL;
>  		}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index 72bf9f8..8af9903 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -366,9 +366,9 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
>  	setup_timer(&ring->fence_drv.fallback_timer, amdgpu_fence_fallback,
>  		    (unsigned long)ring);
>
> -	ring->fence_drv.num_fences_mask = num_hw_submission * 2 - 1;
> +	ring->fence_drv.num_fences_mask = num_hw_submission * 4 - 1;
>  	spin_lock_init(&ring->fence_drv.lock);
> -	ring->fence_drv.fences = kcalloc(num_hw_submission * 2, sizeof(void *),
> +	ring->fence_drv.fences = kcalloc(num_hw_submission * 4, sizeof(void *),
>  				 GFP_KERNEL);
>  	if (!ring->fence_drv.fences)
>  		return -ENOMEM;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> index 34e3542..702bd9b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> @@ -193,14 +193,19 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>  	if (ring->funcs->emit_hdp_invalidate)
>  		amdgpu_ring_emit_hdp_invalidate(ring);
>
> -	r = amdgpu_fence_emit(ring, &hwf);
> -	if (r) {
> -		dev_err(adev->dev, "failed to emit fence (%d)\n", r);
> -		if (job && job->vm_id)
> -			amdgpu_vm_reset_id(adev, job->vm_id);
> -		amdgpu_ring_undo(ring);
> -		return r;
> -	}
> +	if (!job || !job->fence) {
> +		r = amdgpu_fence_emit(ring, &hwf);
> +		if (r) {
> +			dev_err(adev->dev, "failed to emit fence (%d)\n", r);
> +			if (job && job->vm_id)
> +				amdgpu_vm_reset_id(adev, job->vm_id);
> +			amdgpu_ring_undo(ring);
> +			return r;
> +		}
> +	} else
> +		/* re-submit fence when gpu reset */
> +		amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
> +				       job->fence->seqno,
> +				       AMDGPU_FENCE_FLAG_INT);

You shouldn't re-submit the old hardware fence here, but rather allocate a new fence number.
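Something like the following is what I have in mind. Completely untested sketch, just to illustrate the idea: always let amdgpu_fence_emit() allocate the next sequence number, even on the re-submit path, and swap the fresh hardware fence into the job so waiters see the re-submission signal:

	/*
	 * Untested sketch: unconditionally emit a new hardware fence
	 * instead of replaying the stale seqno from job->fence.
	 */
	r = amdgpu_fence_emit(ring, &hwf);
	if (r) {
		dev_err(adev->dev, "failed to emit fence (%d)\n", r);
		if (job && job->vm_id)
			amdgpu_vm_reset_id(adev, job->vm_id);
		amdgpu_ring_undo(ring);
		return r;
	}

	if (job && job->fence) {
		/* re-submission after reset: drop the stale fence */
		fence_put(job->fence);
		job->fence = fence_get(hwf);
	}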
>
>  	/* wrap the last IB with fence */
>  	if (job && job->uf_bo) {
> @@ -212,7 +217,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>  	}
>
>  	if (f)
> -		*f = fence_get(hwf);
> +		*f = (job && job->fence) ? job->fence : fence_get(hwf);
>
>  	if (patch_offset != ~0 && ring->funcs->patch_cond_exec)
>  		amdgpu_ring_patch_cond_exec(ring, patch_offset);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 83771c1..32fad1c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -39,6 +39,28 @@ static void amdgpu_job_timedout(struct amd_sched_job *s_job)
>  	amdgpu_gpu_reset(job->adev);
>  }
>
> +void amdgpu_job_recovery(struct amd_gpu_scheduler *sched)
> +{
> +	struct amd_sched_job *s_job, *tmp;
> +
> +	spin_lock(&sched->job_list_lock);
> +	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
> +		struct amdgpu_job *job = to_amdgpu_job(s_job);
> +		if (job->vm) {
> +			struct amdgpu_vm_id *id = &job->adev->vm_manager.ids[job->vm_id];
> +			job->vm_pd_addr = amdgpu_bo_gpu_offset(job->vm->page_directory);
> +			id->pd_gpu_addr = job->vm_pd_addr;

That is clearly not a good idea; the page directory could already be moving somewhere else while this is happening.

> +		}
> +		sched->ops->run_job(s_job);
> +	}
> +	s_job = list_first_entry_or_null(&sched->ring_mirror_list,
> +					 struct amd_sched_job, node);
> +	if (s_job)
> +		schedule_delayed_work(&s_job->work_tdr, sched->timeout);
> +
> +	spin_unlock(&sched->job_list_lock);
> +}

The whole function belongs in the scheduler.

Christian.

> +
>  int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
>  		     struct amdgpu_job **job, struct amdgpu_vm *vm)
>  {
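To illustrate the page directory point above: amdgpu_bo_gpu_offset() is only valid while the BO is reserved, and reserving can sleep, so it can't be done under job_list_lock anyway. A rough, untested sketch of the minimal fixup, and even this only gives you a snapshot; it doesn't stop the PD from moving again afterwards, so the re-submit really needs to go through the normal VM flush machinery:

	/* untested sketch, NOT valid under sched->job_list_lock */
	if (job->vm) {
		struct amdgpu_bo *pd = job->vm->page_directory;

		/* the GPU offset is only valid while the BO is reserved */
		r = amdgpu_bo_reserve(pd, false);
		if (r)
			continue; /* skip this job, or abort the recovery */
		job->vm_pd_addr = amdgpu_bo_gpu_offset(pd);
		amdgpu_bo_unreserve(pd);
	}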
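And for the last point, roughly this shape in gpu_scheduler.c; amd_sched_job_recovery is just a suggested name, and the amdgpu specific VM handling would then live in the driver's run_job callback instead of in this loop:

/* sketch for drivers/gpu/drm/amd/scheduler/gpu_scheduler.c, untested */
void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
{
	struct amd_sched_job *s_job, *tmp;

	spin_lock(&sched->job_list_lock);
	/* replay everything that was on the hardware when the reset hit */
	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node)
		sched->ops->run_job(s_job);

	/* re-arm the timeout handler for the oldest re-submitted job */
	s_job = list_first_entry_or_null(&sched->ring_mirror_list,
					 struct amd_sched_job, node);
	if (s_job)
		schedule_delayed_work(&s_job->work_tdr, sched->timeout);
	spin_unlock(&sched->job_list_lock);
}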