[PATCH 10/12] drm/amdgpu/sriov:implement guilty ctx for loose reset

ckoenig.leichtzumerken@xxxxxxxxx (Christian König) · Mon, 9 Oct 2017 11:24:12 +0200

> Can you illustrate which points breaks MESA ?
It doesn't break Mesa, but there is not handling in Mesa for -ENODEV.

So I will block adding any kernel functionality which returns -ENODEV 
before Mesa gets a proper handling for this.

We need to implement feature in Mesa first, it is our primary user space 
client. Without having handling there we can't submit anything upstream.

Regards,
Christian.

Am 09.10.2017 um 11:14 schrieb Liu, Monk:
> I can assure you this loose mode is 100% fit with current MESA,
>
> Can you illustrate which points breaks MESA ?
>
> You can see the whole logic only interested in the guilty ctx, and only the guilty ctx would receive the -ENODEV error
>
> All innocent/regular running MESA client like X server and compositor eve didn't aware of a gpu reset at all, they just keep running
>
>
> BR  Monk
>
> -----Original Message-----
> From: Koenig, Christian
> Sent: 2017å¹´10æ??9æ?¥ 17:04
> To: Liu, Monk <Monk.Liu at amd.com>; amd-gfx at lists.freedesktop.org
> Subject: Re: [PATCH 10/12] drm/amdgpu/sriov:implement guilty ctx for loose reset
>
> Well I'm not saying that the app needs to repeatedly call amdgpu_ctx_query, but rather that we need a complete concept.
>
> See, the upstream kernel driver is made for Mesa and not the closed source driver stack.
>
> I can't 100% judge if this approach wouldn't work with Mesa because we haven't implemented it there, but it strongly looks like that stuff won't work.
>
> So I need a solution which works with Mesa and the open source components before we can push it upstream.
>
> Regards,
> Christian.
>
> Am 09.10.2017 um 10:39 schrieb Liu, Monk:
>> How APP/UMD aware that a context is guilty or triggered too much loops of hang ??
>>
>> Why APP/UMD voluntarily call amdgpu_ctx_query() to check whether gpu reset occurred or not ?
>>
>> Please be aware that for another CSP customer this "loose mode" is
>> 100% welcome and wanted by they, and more important
>>
>> This approach won't cross X server at all, only the guilty
>> process/context is rejected upon its submitting
>>
>>
>> I don't agree that we should rely on ctx_query(), no one is
>> responsible to call it from time to time
>>
>>
>>
>> -----Original Message-----
>> From: Christian KÃ¶nig [mailto:ckoenig.leichtzumerken at gmail.com]
>> Sent: 2017å¹´10æ??9æ?¥ 16:28
>> To: Liu, Monk <Monk.Liu at amd.com>; amd-gfx at lists.freedesktop.org
>> Subject: Re: [PATCH 10/12] drm/amdgpu/sriov:implement guilty ctx for
>> loose reset
>>
>> Am 30.09.2017 um 08:03 schrieb Monk Liu:
>>> Change-Id: I7904f362aa0f578a5cbf5d40c7a242c2c6680a92
>>> Signed-off-by: Monk Liu <Monk.Liu at amd.com>
>> NAK, if a context is guilty of a GPU reset should be determined in
>> amdgpu_ctx_query() by looking at the fences in the ring buffer.
>>
>> Not when the GPU reset itself occurs.
>>
>> Regards,
>> Christian.
>>
>>> ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu.h           |  1 +
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c        | 16 +++++++++-------
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c       |  1 +
>>>     drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 22 ++++++++++++++++++++++
>>>     drivers/gpu/drm/amd/scheduler/gpu_scheduler.h |  1 +
>>>     5 files changed, 34 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index b40d4ba..b63e602 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -737,6 +737,7 @@ struct amdgpu_ctx {
>>>     	struct dma_fence	**fences;
>>>     	struct amdgpu_ctx_ring	rings[AMDGPU_MAX_RINGS];
>>>     	bool preamble_presented;
>>> +	bool guilty;
>>>     };
>>>     
>>>     struct amdgpu_ctx_mgr {
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> index 6a1515e..f92962e 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> @@ -79,16 +79,19 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data)
>>>     	if (cs->in.num_chunks == 0)
>>>     		return 0;
>>>     
>>> +	p->ctx = amdgpu_ctx_get(fpriv, cs->in.ctx_id);
>>> +	if (!p->ctx)
>>> +		return -EINVAL;
>>> +
>>> +	if (amdgpu_sriov_vf(p->adev) &&
>>> +		amdgpu_sriov_reset_level == 0 &&
>>> +		p->ctx->guilty)
>>> +		return -ENODEV;
>>> +
>>>     	chunk_array = kmalloc_array(cs->in.num_chunks, sizeof(uint64_t), GFP_KERNEL);
>>>     	if (!chunk_array)
>>>     		return -ENOMEM;
>>>     
>>> -	p->ctx = amdgpu_ctx_get(fpriv, cs->in.ctx_id);
>>> -	if (!p->ctx) {
>>> -		ret = -EINVAL;
>>> -		goto free_chunk;
>>> -	}
>>> -
>>>     	/* get chunks */
>>>     	chunk_array_user = u64_to_user_ptr(cs->in.chunks);
>>>     	if (copy_from_user(chunk_array, chunk_array_user, @@ -184,7
>>> +187,6 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data)
>>>     	p->nchunks = 0;
>>>     put_ctx:
>>>     	amdgpu_ctx_put(p->ctx);
>>> -free_chunk:
>>>     	kfree(chunk_array);
>>>     
>>>     	return ret;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>>> index 75c933b..028e9f1 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>>> @@ -60,6 +60,7 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev, struct amdgpu_ctx *ctx)
>>>     					  rq, amdgpu_sched_jobs);
>>>     		if (r)
>>>     			goto failed;
>>> +		ctx->rings[i].entity.guilty = &ctx->guilty;
>>>     	}
>>>     
>>>     	r = amdgpu_queue_mgr_init(adev, &ctx->queue_mgr); diff --git
>>> a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
>>> b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
>>> index 12c3092..89b0573 100644
>>> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
>>> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
>>> @@ -493,10 +493,32 @@ void amd_sched_set_queue_hang(struct amd_gpu_scheduler *sched)
>>>     void amd_sched_job_kickout(struct amd_sched_job *s_job)
>>>     {
>>>     	struct amd_gpu_scheduler *sched = s_job->sched;
>>> +	struct amd_sched_entity *entity, *tmp;
>>> +	struct amd_sched_rq *rq;
>>> +	int i;
>>> +	bool found;
>>>     
>>>     	spin_lock(&sched->job_list_lock);
>>>     	list_del_init(&s_job->node);
>>>     	spin_unlock(&sched->job_list_lock);
>>> +
>>> +	dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
>>> +
>>> +	for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_KERNEL; i++) {
>>> +		rq = &sched->sched_rq[i];
>>> +
>>> +		spin_lock(&rq->lock);
>>> +		list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
>>> +			if (s_job->s_entity == entity && entity->guilty) {
>>> +				*entity->guilty = true;
>>> +				found = true;
>>> +				break;
>>> +			}
>>> +		}
>>> +		spin_unlock(&rq->lock);
>>> +		if (found)
>>> +			break;
>>> +	}
>>>     }
>>>     
>>>     void amd_sched_job_recovery(struct amd_gpu_scheduler *sched) diff
>>> --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
>>> b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
>>> index f0242aa..16c2244 100644
>>> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
>>> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
>>> @@ -49,6 +49,7 @@ struct amd_sched_entity {
>>>     
>>>     	struct dma_fence		*dependency;
>>>     	struct dma_fence_cb		cb;
>>> +	bool *guilty; /* this points to ctx's guilty */
>>>     };
>>>     
>>>     /**
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx