Patch #1-#6 are Reviewed-by: Christian König <christian.koenig at amd.com> A few comments on this one: Am 18.08.2016 um 07:18 schrieb Chunming Zhou: > Change-Id: I963598ba6eb44bc8620d70e026c0175d1a1de120 > Signed-off-by: Chunming Zhou <David1.Zhou at amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 60 +++++++++++++++++++++++++++++- > 1 file changed, 59 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index c9b15c0..c68d93b 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -2120,6 +2120,40 @@ bool amdgpu_need_backup(struct amdgpu_device *adev) > return amdgpu_lockup_timeout > 0 ? true : false; > } > > +static int amdgpu_recover_vram_from_shadow(struct amdgpu_device *adev, > + struct amdgpu_ring *ring, > + struct amdgpu_bo *bo, > + struct fence **fence) > +{ > + struct fence *f; > + uint32_t domain; > + int r; > + > + if (!bo->shadow) > + return 0; > + > + r = amdgpu_bo_reserve(bo, false); > + if (r) > + return r; > + domain = amdgpu_mem_type_to_domain(bo->tbo.mem.mem_type); > + /* if bo has been evicted, then no need to recover */ > + if (domain == AMDGPU_GEM_DOMAIN_VRAM) { > + r = amdgpu_bo_restore_from_shadow(adev, ring, bo, > + NULL, &f, true); > + if (r) { > + DRM_ERROR("recover page table failed!\n"); > + goto err; > + } > + if (fence) { > + fence_put(*fence); > + *fence = fence_get(f); > + } > + } > +err: > + amdgpu_bo_unreserve(bo); > + return r; > +} > + > /** > * amdgpu_gpu_reset - reset the asic > * > @@ -2202,13 +2236,37 @@ retry: > if (r) { > dev_err(adev->dev, "ib ring test failed (%d).\n", r); > r = amdgpu_suspend(adev); > + need_full_reset = true; > goto retry; > } > - > + /** > + * recovery vm page tables, since we cannot depend on VRAM is > + * consistent after gpu full reset. > + */ > + if (need_full_reset && amdgpu_need_backup(adev)) { > + struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring; > + struct amdgpu_bo *bo, *tmp; > + struct fence *fence = NULL; > + > + DRM_INFO("recover vram bo from shadow\n"); > + mutex_lock(&adev->shadow_list_lock); > + list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) { > + amdgpu_recover_vram_from_shadow(adev, ring, bo, &fence); I think we need to make sure we don't directly submit to many jobs at the same time here. E.g. do something like struct amdgpu_fence *fence = NULL, *next; amdgpu_recover_vram_from_shadow(adev, ring, bo, &next); if (fence) fence_wait(fence, false); fence_put(fence); fence = next; This way we always have a maximum of two IBs in flight. > + } > + mutex_unlock(&adev->shadow_list_lock); > + if (fence) { > + r = fence_wait(fence, true); The kernel thread executing this should never receive a signal, but just in case I suggest to not wait interruptible here. > + if (r) > + WARN(r, "recovery from shadow isn't comleted\n"); > + } > + fence_put(fence); > + } > for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { > struct amdgpu_ring *ring = adev->rings[i]; > if (!ring) > continue; > + > + DRM_INFO("ring:%d recover jobs\n", ring->idx); This change looks unrelated and should probably not be part of this patch. Regards, Christian. > amd_sched_job_recovery(&ring->sched); > kthread_unpark(ring->sched.thread); > }