Hi Luben,

On Wednesday, 09.12.2020 at 21:14 -0500, Luben Tuikov wrote:
> This patch does not change current behaviour.
> 
> The driver's job timeout handler now returns
> status indicating back to the DRM layer whether
> the task (job) was successfully aborted or whether
> more time should be given to the task to complete.
> 
> Default behaviour as of this patch, is preserved,
> except in obvious-by-comment case in the Panfrost
> driver, as documented below.
> 
> All drivers which make use of the
> drm_sched_backend_ops' .timedout_job() callback
> have been accordingly renamed and return the
> would've-been default value of
> DRM_TASK_STATUS_ALIVE to restart the task's
> timeout timer--this is the old behaviour, and
> is preserved by this patch.
> 
> In the case of the Panfrost driver, its timedout
> callback correctly first checks if the job had
> completed in due time and if so, it now returns
> DRM_TASK_STATUS_COMPLETE to notify the DRM layer
> that the task can be moved to the done list, to be
> freed later. In the other two subsequent checks,
> the value of DRM_TASK_STATUS_ALIVE is returned, as
> per the default behaviour.
> 
> A more involved driver's solutions can be had
> in subequent patches.
> 
> v2: Use enum as the status of a driver's job
> timeout callback method.
> 
> Cc: Alexander Deucher <Alexander.Deucher@xxxxxxx>
> Cc: Andrey Grodzovsky <Andrey.Grodzovsky@xxxxxxx>
> Cc: Christian König <christian.koenig@xxxxxxx>
> Cc: Daniel Vetter <daniel.vetter@xxxxxxxx>
> Cc: Lucas Stach <l.stach@xxxxxxxxxxxxxx>
> Cc: Russell King <linux+etnaviv@xxxxxxxxxxxxxxx>
> Cc: Christian Gmeiner <christian.gmeiner@xxxxxxxxx>
> Cc: Qiang Yu <yuq825@xxxxxxxxx>
> Cc: Rob Herring <robh@xxxxxxxxxx>
> Cc: Tomeu Vizoso <tomeu.vizoso@xxxxxxxxxxxxx>
> Cc: Steven Price <steven.price@xxxxxxx>
> Cc: Alyssa Rosenzweig <alyssa.rosenzweig@xxxxxxxxxxxxx>
> Cc: Eric Anholt <eric@xxxxxxxxxx>
> Reported-by: kernel test robot <lkp@xxxxxxxxx>
> Signed-off-by: Luben Tuikov <luben.tuikov@xxxxxxx>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  6 +++--
>  drivers/gpu/drm/etnaviv/etnaviv_sched.c | 10 +++++++-
>  drivers/gpu/drm/lima/lima_sched.c       |  4 +++-
>  drivers/gpu/drm/panfrost/panfrost_job.c |  9 ++++---
>  drivers/gpu/drm/scheduler/sched_main.c  |  4 +---
>  drivers/gpu/drm/v3d/v3d_sched.c         | 32 +++++++++++++------------
>  include/drm/gpu_scheduler.h             | 20 +++++++++++++---
>  7 files changed, 57 insertions(+), 28 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index ff48101bab55..a111326cbdde 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -28,7 +28,7 @@
>  #include "amdgpu.h"
>  #include "amdgpu_trace.h"
> 
> -static void amdgpu_job_timedout(struct drm_sched_job *s_job)
> +static enum drm_task_status amdgpu_job_timedout(struct drm_sched_job *s_job)
>  {
>          struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
>          struct amdgpu_job *job = to_amdgpu_job(s_job);
> @@ -41,7 +41,7 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
>              amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
>                  DRM_ERROR("ring %s timeout, but soft recovered\n",
>                            s_job->sched->name);
> -                return;
> +                return DRM_TASK_STATUS_ALIVE;
>          }
> 
>          amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
> @@ -53,10 +53,12 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
> 
>          if (amdgpu_device_should_recover_gpu(ring->adev)) {
>                  amdgpu_device_gpu_recover(ring->adev, job);
> +                return DRM_TASK_STATUS_ALIVE;
>          } else {
>                  drm_sched_suspend_timeout(&ring->sched);
>                  if (amdgpu_sriov_vf(adev))
>                          adev->virt.tdr_debug = true;
> +                return DRM_TASK_STATUS_ALIVE;
>          }
>  }
> 
> diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
> index cd46c882269c..c49516942328 100644
> --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
> +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
> @@ -82,7 +82,8 @@ static struct dma_fence *etnaviv_sched_run_job(struct drm_sched_job *sched_job)
>          return fence;
>  }
> 
> -static void etnaviv_sched_timedout_job(struct drm_sched_job *sched_job)
> +static enum drm_task_status etnaviv_sched_timedout_job(struct drm_sched_job
> +                                                       *sched_job)
>  {
>          struct etnaviv_gem_submit *submit = to_etnaviv_submit(sched_job);
>          struct etnaviv_gpu *gpu = submit->gpu;
> @@ -120,9 +121,16 @@ static void etnaviv_sched_timedout_job(struct drm_sched_job *sched_job)
> 
>          drm_sched_resubmit_jobs(&gpu->sched);
> 
> +        /* Tell the DRM scheduler that this task needs
> +         * more time.
> +         */

This comment doesn't match the kernel coding style, but it's also moot, as
the whole added block isn't needed: the code just below is identical, so
letting execution fall through here instead of returning would be the right
thing to do. Or did you mean to return DRM_TASK_STATUS_COMPLETE? It's a bit
confusing that an aborted job and a successfully finished job are signaled
with the same return code.

> +        drm_sched_start(&gpu->sched, true);
> +        return DRM_TASK_STATUS_ALIVE;
> +
>  out_no_timeout:
>          /* restart scheduler after GPU is usable again */
>          drm_sched_start(&gpu->sched, true);
> +        return DRM_TASK_STATUS_ALIVE;
>  }

Regards,
Lucas
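P.S.: To make the fall-through suggestion a bit more concrete, here is a
rough, untested sketch of how the tail of etnaviv_sched_timedout_job() could
look with the duplicated block simply dropped. It only uses the identifiers
visible in the hunks quoted above; the elided middle stands in for etnaviv's
existing timeout checks, which bail out via "goto out_no_timeout" when the
job turns out to be fine.

static enum drm_task_status etnaviv_sched_timedout_job(struct drm_sched_job
                                                       *sched_job)
{
        struct etnaviv_gem_submit *submit = to_etnaviv_submit(sched_job);
        struct etnaviv_gpu *gpu = submit->gpu;

        /* ... existing checks on 'submit' and 'gpu' elided; each of them
         * jumps to out_no_timeout when no recovery is needed ...
         */

        drm_sched_resubmit_jobs(&gpu->sched);

        /* No duplicated drm_sched_start()/return pair here; execution just
         * falls through to the common exit path below.
         */
out_no_timeout:
        /* restart scheduler after GPU is usable again */
        drm_sched_start(&gpu->sched, true);

        /* Both paths restart the scheduler, so report the task as still
         * alive, matching the default behaviour described in the patch.
         */
        return DRM_TASK_STATUS_ALIVE;
}

That keeps a single restart/return point and sidesteps the question of which
status the duplicated block was supposed to report.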