[PATCH 7/7] drm/amdgpu: recover vram bo from shadow after gpu reset

deathsimple@xxxxxxxxxxx (Christian König) · Thu, 18 Aug 2016 11:46:08 +0200

Patch #1-#6 are Reviewed-by: Christian KÃ¶nig <christian.koenig at amd.com>

A few comments on this one:

Am 18.08.2016 um 07:18 schrieb Chunming Zhou:
> Change-Id: I963598ba6eb44bc8620d70e026c0175d1a1de120
> Signed-off-by: Chunming Zhou <David1.Zhou at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 60 +++++++++++++++++++++++++++++-
>   1 file changed, 59 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index c9b15c0..c68d93b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2120,6 +2120,40 @@ bool amdgpu_need_backup(struct amdgpu_device *adev)
>   	return amdgpu_lockup_timeout > 0 ? true : false;
>   }
>   
> +static int amdgpu_recover_vram_from_shadow(struct amdgpu_device *adev,
> +					   struct amdgpu_ring *ring,
> +					   struct amdgpu_bo *bo,
> +					   struct fence **fence)
> +{
> +	struct fence *f;
> +	uint32_t domain;
> +	int r;
> +
> +       if (!bo->shadow)
> +               return 0;
> +
> +       r = amdgpu_bo_reserve(bo, false);
> +       if (r)
> +               return r;
> +       domain = amdgpu_mem_type_to_domain(bo->tbo.mem.mem_type);
> +       /* if bo has been evicted, then no need to recover */
> +       if (domain == AMDGPU_GEM_DOMAIN_VRAM) {
> +               r = amdgpu_bo_restore_from_shadow(adev, ring, bo,
> +						 NULL, &f, true);
> +               if (r) {
> +                       DRM_ERROR("recover page table failed!\n");
> +                       goto err;
> +               }
> +               if (fence) {
> +                       fence_put(*fence);
> +                       *fence = fence_get(f);
> +               }
> +       }
> +err:
> +       amdgpu_bo_unreserve(bo);
> +       return r;
> +}
> +
>   /**
>    * amdgpu_gpu_reset - reset the asic
>    *
> @@ -2202,13 +2236,37 @@ retry:
>   		if (r) {
>   			dev_err(adev->dev, "ib ring test failed (%d).\n", r);
>   			r = amdgpu_suspend(adev);
> +			need_full_reset = true;
>   			goto retry;
>   		}
> -
> +		/**
> +		 * recovery vm page tables, since we cannot depend on VRAM is
> +		 * consistent after gpu full reset.
> +		 */
> +		if (need_full_reset && amdgpu_need_backup(adev)) {
> +			struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
> +			struct amdgpu_bo *bo, *tmp;
> +			struct fence *fence = NULL;
> +
> +			DRM_INFO("recover vram bo from shadow\n");
> +			mutex_lock(&adev->shadow_list_lock);
> +			list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
> +				amdgpu_recover_vram_from_shadow(adev, ring, bo, &fence);

I think we need to make sure we don't directly submit to many jobs at 
the same time here.

E.g. do something like

struct amdgpu_fence *fence = NULL, *next;

amdgpu_recover_vram_from_shadow(adev, ring, bo, &next);

if (fence)
     fence_wait(fence, false);
fence_put(fence);
fence = next;

This way we always have a maximum of two IBs in flight.

> +			}
> +			mutex_unlock(&adev->shadow_list_lock);
> +			if (fence) {
> +				r = fence_wait(fence, true);

The kernel thread executing this should never receive a signal, but just 
in case I suggest to not wait interruptible here.

> +				if (r)
> +					WARN(r, "recovery from shadow isn't comleted\n");
> +			}
> +			fence_put(fence);
> +		}
>   		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>   			struct amdgpu_ring *ring = adev->rings[i];
>   			if (!ring)
>   				continue;
> +
> +			DRM_INFO("ring:%d recover jobs\n", ring->idx);

This change looks unrelated and should probably not be part of this patch.

Regards,
Christian.

>   			amd_sched_job_recovery(&ring->sched);
>   			kthread_unpark(ring->sched.thread);
>   		}