Andrey please give this set a good testing as well. Am 28.02.2018 um 08:21 schrieb Monk Liu: > found recover_vram_from_shadow sometimes get executed > in parallel with SDMA scheduler, should stop all > schedulers before doing gpu reset/recover > > Change-Id: Ibaef3e3c015f3cf88f84b2eaf95cda95ae1a64e3 > Signed-off-by: Monk Liu <Monk.Liu at amd.com> For now this patch is Reviewed-by: Christian König <christian.koenig at amd.com>. Regards, Christian. > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 40 +++++++++++------------------- > 1 file changed, 15 insertions(+), 25 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 75d1733..e9d81a8 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -2649,22 +2649,23 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, > > /* block TTM */ > resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); > + > /* store modesetting */ > if (amdgpu_device_has_dc_support(adev)) > state = drm_atomic_helper_suspend(adev->ddev); > > - /* block scheduler */ > + /* block all schedulers and reset given job's ring */ > for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { > struct amdgpu_ring *ring = adev->rings[i]; > > if (!ring || !ring->sched.thread) > continue; > > - /* only focus on the ring hit timeout if &job not NULL */ > + kthread_park(ring->sched.thread); > + > if (job && job->ring->idx != i) > continue; > > - kthread_park(ring->sched.thread); > drm_sched_hw_job_reset(&ring->sched, &job->base); > > /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ > @@ -2707,33 +2708,22 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, > } > dma_fence_put(fence); > } > + } > > - for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { > - struct amdgpu_ring *ring = adev->rings[i]; > - > - if (!ring || !ring->sched.thread) > - continue; > + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { > + struct 
amdgpu_ring *ring = adev->rings[i]; > > - /* only focus on the ring hit timeout if &job not NULL */ > - if (job && job->ring->idx != i) > - continue; > + if (!ring || !ring->sched.thread) > + continue; > > + /* only need recovery sched of the given job's ring > + * or all rings (in the case @job is NULL) > + * after above amdgpu_reset accomplished > + */ > + if ((!job || job->ring->idx == i) && !r) > drm_sched_job_recovery(&ring->sched); > - kthread_unpark(ring->sched.thread); > - } > - } else { > - for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { > - struct amdgpu_ring *ring = adev->rings[i]; > > - if (!ring || !ring->sched.thread) > - continue; > - > - /* only focus on the ring hit timeout if &job not NULL */ > - if (job && job->ring->idx != i) > - continue; > - > - kthread_unpark(adev->rings[i]->sched.thread); > - } > + kthread_unpark(ring->sched.thread); > } > > if (amdgpu_device_has_dc_support(adev)) {