I can assure you that this loose mode is 100% compatible with current Mesa. Can you point out which part breaks Mesa?

You can see that the whole logic is only interested in the guilty ctx, and only the guilty ctx receives the -ENODEV error. All innocent, regularly running Mesa clients like the X server and the compositor aren't even aware that a GPU reset happened at all; they just keep running.

BR Monk

-----Original Message-----
From: Koenig, Christian
Sent: October 9, 2017 17:04
To: Liu, Monk <Monk.Liu at amd.com>; amd-gfx at lists.freedesktop.org
Subject: Re: [PATCH 10/12] drm/amdgpu/sriov:implement guilty ctx for loose reset

Well, I'm not saying that the app needs to repeatedly call amdgpu_ctx_query, but rather that we need a complete concept.

See, the upstream kernel driver is made for Mesa and not for the closed source driver stack. I can't judge with 100% certainty whether this approach would work with Mesa, because we haven't implemented it there, but it strongly looks like it won't.

So I need a solution which works with Mesa and the open source components before we can push it upstream.

Regards,
Christian.

On 09.10.2017 10:39, Liu, Monk wrote:
> How is the APP/UMD supposed to become aware that a context is guilty
> or has triggered too many hang loops?
>
> Why would the APP/UMD voluntarily call amdgpu_ctx_query() to check
> whether a GPU reset occurred or not?
>
> Please be aware that for another CSP customer this "loose mode" is
> 100% welcome and wanted by them. And more importantly:
>
> This approach doesn't touch the X server at all; only the guilty
> process/context is rejected upon its submission.
>
>
> I don't agree that we should rely on ctx_query(); no one is
> responsible for calling it from time to time.
>
>
>
> -----Original Message-----
> From: Christian König [mailto:ckoenig.leichtzumerken at gmail.com]
> Sent: October 9, 2017 16:28
> To: Liu, Monk <Monk.Liu at amd.com>; amd-gfx at lists.freedesktop.org
> Subject: Re: [PATCH 10/12] drm/amdgpu/sriov:implement guilty ctx for loose reset
>
> On 30.09.2017 08:03, Monk Liu wrote:
>> Change-Id: I7904f362aa0f578a5cbf5d40c7a242c2c6680a92
>> Signed-off-by: Monk Liu <Monk.Liu at amd.com>
> NAK, whether a context is guilty of a GPU reset should be determined
> in amdgpu_ctx_query() by looking at the fences in the ring buffer.
>
> Not when the GPU reset itself occurs.
>
> Regards,
> Christian.
>
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu.h           |  1 +
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c        | 16 +++++++++-------
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c       |  1 +
>>  drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 22 ++++++++++++++++++++++
>>  drivers/gpu/drm/amd/scheduler/gpu_scheduler.h |  1 +
>>  5 files changed, 34 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index b40d4ba..b63e602 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -737,6 +737,7 @@ struct amdgpu_ctx {
>>          struct dma_fence **fences;
>>          struct amdgpu_ctx_ring rings[AMDGPU_MAX_RINGS];
>>          bool preamble_presented;
>> +        bool guilty;
>>  };
>>
>>  struct amdgpu_ctx_mgr {
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> index 6a1515e..f92962e 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> @@ -79,16 +79,19 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data)
>>          if (cs->in.num_chunks == 0)
>>                  return 0;
>>
>> +        p->ctx = amdgpu_ctx_get(fpriv, cs->in.ctx_id);
>> +        if (!p->ctx)
>> +                return -EINVAL;
>> +
>> +        if (amdgpu_sriov_vf(p->adev) &&
>> +            amdgpu_sriov_reset_level == 0 &&
>> +            p->ctx->guilty)
>> +                return -ENODEV;
>> +
>>          chunk_array = kmalloc_array(cs->in.num_chunks, sizeof(uint64_t), GFP_KERNEL);
>>          if (!chunk_array)
>>                  return -ENOMEM;
>>
>> -        p->ctx = amdgpu_ctx_get(fpriv, cs->in.ctx_id);
>> -        if (!p->ctx) {
>> -                ret = -EINVAL;
>> -                goto free_chunk;
>> -        }
>> -
>>          /* get chunks */
>>          chunk_array_user = u64_to_user_ptr(cs->in.chunks);
>>          if (copy_from_user(chunk_array, chunk_array_user,
>> @@ -184,7 +187,6 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data)
>>          p->nchunks = 0;
>>  put_ctx:
>>          amdgpu_ctx_put(p->ctx);
>> -free_chunk:
>>          kfree(chunk_array);
>>
>>          return ret;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> index 75c933b..028e9f1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> @@ -60,6 +60,7 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev, struct amdgpu_ctx *ctx)
>>                                            rq, amdgpu_sched_jobs);
>>                  if (r)
>>                          goto failed;
>> +                ctx->rings[i].entity.guilty = &ctx->guilty;
>>          }
>>
>>          r = amdgpu_queue_mgr_init(adev, &ctx->queue_mgr);
>> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
>> index 12c3092..89b0573 100644
>> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
>> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
>> @@ -493,10 +493,32 @@ void amd_sched_set_queue_hang(struct amd_gpu_scheduler *sched)
>>  void amd_sched_job_kickout(struct amd_sched_job *s_job)
>>  {
>>          struct amd_gpu_scheduler *sched = s_job->sched;
>> +        struct amd_sched_entity *entity, *tmp;
>> +        struct amd_sched_rq *rq;
>> +        int i;
>> +        bool found;
>>
>>          spin_lock(&sched->job_list_lock);
>>          list_del_init(&s_job->node);
>>          spin_unlock(&sched->job_list_lock);
>> +
>> +        dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
>> +
>> +        for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_KERNEL; i++) {
>> +                rq = &sched->sched_rq[i];
>> +
>> +                spin_lock(&rq->lock);
>> +                list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
>> +                        if (s_job->s_entity == entity && entity->guilty) {
>> +                                *entity->guilty = true;
>> +                                found = true;
>> +                                break;
>> +                        }
>> +                }
>> +                spin_unlock(&rq->lock);
>> +                if (found)
>> +                        break;
>> +        }
>>  }
>>
>>  void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
>> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
>> index f0242aa..16c2244 100644
>> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
>> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
>> @@ -49,6 +49,7 @@ struct amd_sched_entity {
>>
>>          struct dma_fence *dependency;
>>          struct dma_fence_cb cb;
>> +        bool *guilty; /* this points to ctx's guilty */
>>  };
>>
>>  /**
>
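
For what it's worth, here is roughly what the alternative suggested above could look like on the kernel side: instead of the scheduler flipping ctx->guilty at reset time, amdgpu_ctx_query() could derive guilt lazily from the fences the context still tracks, since amd_sched_job_kickout() in the patch above already tags the guilty job's finished fence with -ETIME via dma_fence_set_error(). This is only a sketch, not the actual implementation: the helper name is invented, and the locking around the per-ring fences array is omitted.

/*
 * Sketch only, not from the patch: could be called from amdgpu_ctx_query()
 * to decide whether this context owned a job that was kicked out.
 * Fence-array locking (ctx->ring_lock) is intentionally left out here.
 */
static bool amdgpu_ctx_fences_faulted(struct amdgpu_ctx *ctx)
{
        unsigned int ring;
        int i;

        for (ring = 0; ring < AMDGPU_MAX_RINGS; ring++) {
                struct amdgpu_ctx_ring *cring = &ctx->rings[ring];

                for (i = 0; i < amdgpu_sched_jobs; i++) {
                        struct dma_fence *fence = cring->fences[i];

                        /* dma_fence_get_status() returns a negative error
                         * (here the -ETIME set at kickout time) once the
                         * fence has signaled with an error. */
                        if (fence && dma_fence_get_status(fence) < 0)
                                return true;
                }
        }

        return false;
}

amdgpu_ctx_query() could then report the result through the reset status it already returns to userspace, rather than the CS ioctl failing with -ENODEV at submit time.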
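
On the Mesa/UMD side, "being aware" of the reset would then not require any new uapi: the state can be polled with the existing context-query ioctl. Below is a minimal userspace sketch of that flow, assuming an already-open render node fd and a previously allocated context id; query_reset_status() is a made-up helper name, not a Mesa or libdrm entry point.

/*
 * Sketch: poll the reset/guilt state of an amdgpu context through the
 * existing AMDGPU_CTX_OP_QUERY_STATE ioctl.  Returns a negative errno on
 * failure, otherwise one of the AMDGPU_CTX_*_RESET values.
 */
#include <errno.h>
#include <stdint.h>
#include <string.h>
#include <xf86drm.h>     /* drmIoctl(), from libdrm */
#include <amdgpu_drm.h>  /* union drm_amdgpu_ctx, AMDGPU_CTX_* */

static int query_reset_status(int fd, uint32_t ctx_id)
{
        union drm_amdgpu_ctx args;

        memset(&args, 0, sizeof(args));
        args.in.op = AMDGPU_CTX_OP_QUERY_STATE;
        args.in.ctx_id = ctx_id;

        if (drmIoctl(fd, DRM_IOCTL_AMDGPU_CTX, &args))
                return -errno;

        /*
         * AMDGPU_CTX_GUILTY_RESET: this context caused the hang and should
         * be thrown away and recreated by the UMD.
         * AMDGPU_CTX_INNOCENT_RESET / AMDGPU_CTX_UNKNOWN_RESET: a reset
         * happened, but this context was not (known to be) at fault.
         */
        return args.out.state.reset_status;
}

If I remember correctly, Mesa's amdgpu winsys already reaches this ioctl through libdrm's amdgpu_cs_query_reset_state(), so a guilty flag reported this way would be visible to the open source stack without interface changes.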