On 2017-10-20 12:26 PM, Andres Rodriguez wrote:
>
> On 2017-10-20 12:19 PM, Andrey Grodzovsky wrote:
>>
>> On 2017-10-20 11:59 AM, Andres Rodriguez wrote:
>>>
>>> On 2017-10-20 09:32 AM, Andrey Grodzovsky wrote:
>>>> Bug: amdgpu_job_free_cb was accessing s_job->s_entity when the
>>>> allocated amdgpu_ctx (and the entity inside it) were already
>>>> deallocated from amdgpu_cs_parser_fini.
>>>>
>>>> Fix: Save the job's priority at its creation instead of accessing it
>>>> from s_entity later on.
>>>
>>> I'm not sure this is the correct approach for a fix.
>>>
>>> Keeping s_entity as a dangling pointer could result in similar
>>> bugs being reintroduced. For example, there would still be a race
>>> condition between amdgpu_cs_parser_fini() and amdgpu_job_dependency().
>>
>> The .dependency hook is called in only one place,
>> amd_sched_entity_pop_job, and amdgpu_cs_parser_fini will wait (from
>> amd_sched_entity_fini) for the wake_up(&sched->job_scheduled) from
>> amd_sched_main, so I don't see a race here.
>>
>>> Instead, it might be better for the job to grab a reference to the
>>> context during job_init(), and put that reference on job free.
>>
>> Originally that was my thinking too, but I consulted Christian and he
>> advised against it, quote: "it's not the best idea since
>> the problem is that when we terminate a process we need to make sure
>> that all resources are released, or at least not held for much longer.
>> When we keep the ctx alive with the job we need to also keep the VM
>> alive, and that means we need to keep all the VM page tables alive".
>
> That makes sense.
>
> Since s_entity is tied to the context reference held by the parser,
> can you set it to NULL when you drop the context reference there?

I am not sure I understand - you want to set s_job->s_entity to NULL in
amd_sched_entity_fini for each remaining job in the queue? But all the
jobs remaining in the queue are destroyed there anyway.
Thanks,
Andrey

> At least that way we can easily detect misuse of s_entity after it
> enters a "possibly deleted" state.
>
> Regards,
> Andres
>
>> Thanks,
>> Andrey
>>
>>> Regards,
>>> Andres
>>>
>>>> Signed-off-by: Andrey Grodzovsky <Andrey.Grodzovsky at amd.com>
>>>> ---
>>>>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c        |  3 +--
>>>>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       |  5 ++---
>>>>  drivers/gpu/drm/amd/scheduler/gpu_scheduler.c |  1 +
>>>>  drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 32 ++++++++++++---------------
>>>>  4 files changed, 18 insertions(+), 23 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>> index f7fceb6..a760b6e 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>> @@ -1192,8 +1192,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>>>>  	job->uf_sequence = seq;
>>>>  	amdgpu_job_free_resources(job);
>>>> -	amdgpu_ring_priority_get(job->ring,
>>>> -				 amd_sched_get_job_priority(&job->base));
>>>> +	amdgpu_ring_priority_get(job->ring, job->base.s_priority);
>>>>  	trace_amdgpu_cs_ioctl(job);
>>>>  	amd_sched_entity_push_job(&job->base);
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>> index 0cfc68d..a58e3c5 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>> @@ -104,7 +104,7 @@ static void amdgpu_job_free_cb(struct amd_sched_job *s_job)
>>>>  {
>>>>  	struct amdgpu_job *job = container_of(s_job, struct amdgpu_job, base);
>>>>
>>>> -	amdgpu_ring_priority_put(job->ring, amd_sched_get_job_priority(s_job));
>>>> +	amdgpu_ring_priority_put(job->ring, s_job->s_priority);
>>>>  	dma_fence_put(job->fence);
>>>>  	amdgpu_sync_free(&job->sync);
>>>>  	amdgpu_sync_free(&job->dep_sync);
>>>> @@ -141,8 +141,7 @@ int amdgpu_job_submit(struct amdgpu_job *job, struct amdgpu_ring *ring,
>>>>  	job->fence_ctx = entity->fence_context;
>>>>  	*f = dma_fence_get(&job->base.s_fence->finished);
>>>>  	amdgpu_job_free_resources(job);
>>>> -	amdgpu_ring_priority_get(job->ring,
>>>> -				 amd_sched_get_job_priority(&job->base));
>>>> +	amdgpu_ring_priority_get(job->ring, job->base.s_priority);
>>>>  	amd_sched_entity_push_job(&job->base);
>>>>
>>>>  	return 0;
>>>> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
>>>> index e4d3b4e..1bbbce2 100644
>>>> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
>>>> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
>>>> @@ -529,6 +529,7 @@ int amd_sched_job_init(struct amd_sched_job *job,
>>>>  {
>>>>  	job->sched = sched;
>>>>  	job->s_entity = entity;
>>>> +	job->s_priority = entity->rq - sched->sched_rq;
>>>>  	job->s_fence = amd_sched_fence_create(entity, owner);
>>>>  	if (!job->s_fence)
>>>>  		return -ENOMEM;
>>>> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
>>>> index 52c8e54..3f75b45 100644
>>>> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
>>>> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
>>>> @@ -30,6 +30,19 @@
>>>>  struct amd_gpu_scheduler;
>>>>  struct amd_sched_rq;
>>>>
>>>> +enum amd_sched_priority {
>>>> +	AMD_SCHED_PRIORITY_MIN,
>>>> +	AMD_SCHED_PRIORITY_LOW = AMD_SCHED_PRIORITY_MIN,
>>>> +	AMD_SCHED_PRIORITY_NORMAL,
>>>> +	AMD_SCHED_PRIORITY_HIGH_SW,
>>>> +	AMD_SCHED_PRIORITY_HIGH_HW,
>>>> +	AMD_SCHED_PRIORITY_KERNEL,
>>>> +	AMD_SCHED_PRIORITY_MAX,
>>>> +	AMD_SCHED_PRIORITY_INVALID = -1,
>>>> +	AMD_SCHED_PRIORITY_UNSET = -2
>>>> +};
>>>> +
>>>> +
>>>>  /**
>>>>   * A scheduler entity is a wrapper around a job queue or a group
>>>>   * of other entities. Entities take turns emitting jobs from their
>>>> @@ -83,6 +96,7 @@ struct amd_sched_job {
>>>>  	struct delayed_work	work_tdr;
>>>>  	uint64_t		id;
>>>>  	atomic_t		karma;
>>>> +	enum amd_sched_priority	s_priority;
>>>>  };
>>>>
>>>>  extern const struct dma_fence_ops amd_sched_fence_ops_scheduled;
>>>> @@ -114,18 +128,6 @@ struct amd_sched_backend_ops {
>>>>  	void (*free_job)(struct amd_sched_job *sched_job);
>>>>  };
>>>>
>>>> -enum amd_sched_priority {
>>>> -	AMD_SCHED_PRIORITY_MIN,
>>>> -	AMD_SCHED_PRIORITY_LOW = AMD_SCHED_PRIORITY_MIN,
>>>> -	AMD_SCHED_PRIORITY_NORMAL,
>>>> -	AMD_SCHED_PRIORITY_HIGH_SW,
>>>> -	AMD_SCHED_PRIORITY_HIGH_HW,
>>>> -	AMD_SCHED_PRIORITY_KERNEL,
>>>> -	AMD_SCHED_PRIORITY_MAX,
>>>> -	AMD_SCHED_PRIORITY_INVALID = -1,
>>>> -	AMD_SCHED_PRIORITY_UNSET = -2
>>>> -};
>>>> -
>>>>  /**
>>>>   * One scheduler is implemented for each hardware ring
>>>>   */
>>>> @@ -176,10 +178,4 @@ bool amd_sched_dependency_optimized(struct dma_fence* fence,
>>>>  					struct amd_sched_entity *entity);
>>>>  void amd_sched_job_kickout(struct amd_sched_job *s_job);
>>>>
>>>> -static inline enum amd_sched_priority
>>>> -amd_sched_get_job_priority(struct amd_sched_job *job)
>>>> -{
>>>> -	return (job->s_entity->rq - job->sched->sched_rq);
>>>> -}
>>>> -
>>>>  #endif