[PATCH 1/5] drm/amdgpu:keep ctx alive till all job finished

Monk.Liu@xxxxxxx (Liu, Monk) · Wed, 3 May 2017 03:30:33 +0000

1, This is necessary otherwise how can I access entity pointer after a job timedout , and why it is dangerous ?
2, what's the status field in the fences you were referring to ? I need to judge if it could satisfy my requirement 

-----Original Message-----
From: Christian KÃ¶nig [mailto:deathsimple@xxxxxxxxxxx] 
Sent: Monday, May 01, 2017 10:48 PM
To: Liu, Monk <Monk.Liu at amd.com>; amd-gfx at lists.freedesktop.org
Subject: Re: [PATCH 1/5] drm/amdgpu:keep ctx alive till all job finished

Am 01.05.2017 um 09:22 schrieb Monk Liu:
> for TDR guilty context feature, we need access ctx/s_entity field 
> member through sched job pointer,so ctx must keep alive till all job 
> from it signaled.

NAK, that is unnecessary and quite dangerous.

Instead we have the designed status field in the fences which should be checked for that.

Regards,
Christian.

>
> Change-Id: Ib87e9502f7a5c8c054c7e56956d7f7ad75998e43
> Signed-off-by: Monk Liu <Monk.Liu at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h           | 6 +++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c        | 2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c       | 9 +++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       | 9 +++++++--
>   drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 6 ------
>   drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 1 +
>   6 files changed, 23 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index e330009..8e031d6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -760,10 +760,12 @@ struct amdgpu_ib {
>   	uint32_t			flags;
>   };
>   
> +struct amdgpu_ctx;
> +
>   extern const struct amd_sched_backend_ops amdgpu_sched_ops;
>   
>   int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
> -		     struct amdgpu_job **job, struct amdgpu_vm *vm);
> +		     struct amdgpu_job **job, struct amdgpu_vm *vm, struct 
> +amdgpu_ctx *ctx);
>   int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev, unsigned size,
>   			     struct amdgpu_job **job);
>   
> @@ -802,6 +804,7 @@ struct amdgpu_ctx_mgr {
>   
>   struct amdgpu_ctx *amdgpu_ctx_get(struct amdgpu_fpriv *fpriv, uint32_t id);
>   int amdgpu_ctx_put(struct amdgpu_ctx *ctx);
> +struct amdgpu_ctx *amdgpu_ctx_kref_get(struct amdgpu_ctx *ctx);
>   
>   uint64_t amdgpu_ctx_add_fence(struct amdgpu_ctx *ctx, struct amdgpu_ring *ring,
>   			      struct fence *fence);
> @@ -1129,6 +1132,7 @@ struct amdgpu_job {
>   	struct amdgpu_sync	sync;
>   	struct amdgpu_ib	*ibs;
>   	struct fence		*fence; /* the hw fence */
> +	struct amdgpu_ctx *ctx;
>   	uint32_t		preamble_status;
>   	uint32_t		num_ibs;
>   	void			*owner;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 699f5fe..267fb65 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -234,7 +234,7 @@ int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data)
>   		}
>   	}
>   
> -	ret = amdgpu_job_alloc(p->adev, num_ibs, &p->job, vm);
> +	ret = amdgpu_job_alloc(p->adev, num_ibs, &p->job, vm, p->ctx);
>   	if (ret)
>   		goto free_all_kdata;
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> index b4bbbb3..81438af 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> @@ -25,6 +25,13 @@
>   #include <drm/drmP.h>
>   #include "amdgpu.h"
>   
> +struct amdgpu_ctx *amdgpu_ctx_kref_get(struct amdgpu_ctx *ctx) {
> +	if (ctx)
> +		kref_get(&ctx->refcount);
> +	return ctx;
> +}
> +
>   static int amdgpu_ctx_init(struct amdgpu_device *adev, struct amdgpu_ctx *ctx)
>   {
>   	unsigned i, j;
> @@ -56,6 +63,8 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev, struct amdgpu_ctx *ctx)
>   					  rq, amdgpu_sched_jobs);
>   		if (r)
>   			goto failed;
> +
> +		ctx->rings[i].entity.ptr_guilty = &ctx->guilty; /* kernel entity 
> +doesn't have ptr_guilty */
>   	}
>   
>   	return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 690ef3d..208da11 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -40,7 +40,7 @@ static void amdgpu_job_timedout(struct amd_sched_job *s_job)
>   }
>   
>   int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
> -		     struct amdgpu_job **job, struct amdgpu_vm *vm)
> +		     struct amdgpu_job **job, struct amdgpu_vm *vm, struct 
> +amdgpu_ctx *ctx)
>   {
>   	size_t size = sizeof(struct amdgpu_job);
>   
> @@ -57,6 +57,7 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
>   	(*job)->vm = vm;
>   	(*job)->ibs = (void *)&(*job)[1];
>   	(*job)->num_ibs = num_ibs;
> +	(*job)->ctx = amdgpu_ctx_kref_get(ctx);
>   
>   	amdgpu_sync_create(&(*job)->sync);
>   
> @@ -68,7 +69,7 @@ int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev, unsigned size,
>   {
>   	int r;
>   
> -	r = amdgpu_job_alloc(adev, 1, job, NULL);
> +	r = amdgpu_job_alloc(adev, 1, job, NULL, NULL);
>   	if (r)
>   		return r;
>   
> @@ -94,6 +95,10 @@ void amdgpu_job_free_resources(struct amdgpu_job *job)
>   static void amdgpu_job_free_cb(struct amd_sched_job *s_job)
>   {
>   	struct amdgpu_job *job = container_of(s_job, struct amdgpu_job, 
> base);
> +	struct amdgpu_ctx *ctx = job->ctx;
> +
> +	if (ctx)
> +		amdgpu_ctx_put(ctx);
>   
>   	fence_put(job->fence);
>   	amdgpu_sync_free(&job->sync);
> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c 
> b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> index 6f4e31f..9100ca8 100644
> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> @@ -208,12 +208,6 @@ void amd_sched_entity_fini(struct amd_gpu_scheduler *sched,
>   	if (!amd_sched_entity_is_initialized(sched, entity))
>   		return;
>   
> -	/**
> -	 * The client will not queue more IBs during this fini, consume existing
> -	 * queued IBs
> -	*/
> -	wait_event(sched->job_scheduled, amd_sched_entity_is_idle(entity));
> -
>   	amd_sched_rq_remove_entity(rq, entity);
>   	kfifo_free(&entity->job_queue);
>   }
> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h 
> b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> index 8cb41d3..ccbbcb0 100644
> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> @@ -49,6 +49,7 @@ struct amd_sched_entity {
>   
>   	struct fence			*dependency;
>   	struct fence_cb			cb;
> +	bool *ptr_guilty;
>   };
>   
>   /**