On 20.10.2017 at 18:51, Andrey Grodzovsky wrote:
>
>
> On 2017-10-20 12:26 PM, Andres Rodriguez wrote:
>>
>>
>> On 2017-10-20 12:19 PM, Andrey Grodzovsky wrote:
>>>
>>>
>>> On 2017-10-20 11:59 AM, Andres Rodriguez wrote:
>>>>
>>>>
>>>> On 2017-10-20 09:32 AM, Andrey Grodzovsky wrote:
>>>>> Bug: amdgpu_job_free_cb was accessing s_job->s_entity when the
>>>>> allocated amdgpu_ctx (and the entity inside it) had already been
>>>>> deallocated by amdgpu_cs_parser_fini.
>>>>>
>>>>> Fix: Save the job's priority at its creation instead of accessing it
>>>>> through s_entity later on.
>>>>
>>>> I'm not sure this is the correct approach for a fix.
>>>>
>>>> Keeping s_entity as a dangling pointer could result in similar
>>>> bugs being reintroduced. For example, there would still be a race
>>>> condition between amdgpu_cs_parser_fini() and amdgpu_job_dependency().
>>>
>>> The .dependency hook is called in only one place,
>>> amd_sched_entity_pop_job; amdgpu_cs_parser_fini will wait (from
>>> amd_sched_entity_fini) for wake_up(&sched->job_scheduled) from
>>> amd_sched_main, so I don't see a race here.
>>>
>>>>
>>>>
>>>> Instead, it might be better for the job to grab a reference to the
>>>> context during job_init(), and put that reference on job free.
>>>
>>> Originally that was my thinking too, but I consulted Christian and he
>>> advised against it, quote: "it's not the best idea since
>>> the problem is that when we terminate a process we need to make sure
>>> that all resources are released or at least not held for much
>>> longer. When we keep the ctx alive with the job we need to also keep
>>> the VM alive and that means we need to keep all the VM page tables
>>> alive".
>>>
>>
>> That makes sense.
>>
>> Since s_entity is tied to the context reference held by the parser,
>> can you set it to NULL when you drop the context reference there?
>
> I am not sure I understand - you want to set s_job->s_entity to NULL
> in amd_sched_entity_fini for each remaining job in the queue? But all
> the jobs remaining in the queue are destroyed there anyway.

I think what Andres means here is exactly what we planned to do anyway.

Set job->s_entity to NULL as soon as we know that the entity is not used
any more and might be released.

In the long term we should aim at making s_job->s_entity as well as
job->vm superfluous. This way we could even push the remaining jobs onto
a graveyard entity when we destroy one and on timeout.

Alternatively we could look into why wait_event_killable is sometimes
not killable as the name says :) Maybe we can get to a point where we
can finally reboot the system cleanly even when the GPU is stuck.

Regards,
Christian.

>
> Thanks,
> Andrey
>
>>
>> At least that way we can easily detect misuse of s_entity after it
>> enters a "possibly deleted" state.
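
For illustration, a minimal sketch of that idea, assuming the entity
teardown path can walk the jobs that still point at it. The helper names,
the kfifo drain loop and the WARN-based check below are assumptions made
for this sketch; they are not part of the patch under review.

#include <linux/kfifo.h>
#include "gpu_scheduler.h"

/*
 * Hypothetical teardown helper: detach every job still sitting in the
 * entity's queue before the entity can be released, then hand the job
 * to the driver's free_job callback.  Assumes nothing is submitted to
 * the entity concurrently at this point.
 */
static void amd_sched_entity_orphan_jobs(struct amd_gpu_scheduler *sched,
                                         struct amd_sched_entity *entity)
{
        struct amd_sched_job *s_job;

        while (kfifo_out(&entity->job_queue, &s_job, sizeof(s_job))) {
                s_job->s_entity = NULL; /* entity may be freed right after fini */
                sched->ops->free_job(s_job);
        }
}

/*
 * Hypothetical accessor: catch use of s_entity after it has entered the
 * "possibly deleted" state instead of dereferencing freed memory.
 */
static struct amd_sched_entity *
amd_sched_job_entity(struct amd_sched_job *s_job)
{
        if (WARN_ON_ONCE(!s_job->s_entity))
                return NULL;
        return s_job->s_entity;
}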
>>
>> Regards,
>> Andres
>>
>>> Thanks,
>>> Andrey
>>>
>>>>
>>>> Regards,
>>>> Andres
>>>>
>>>>>
>>>>> Signed-off-by: Andrey Grodzovsky <Andrey.Grodzovsky at amd.com>
>>>>> ---
>>>>>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c        | 3 +--
>>>>>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       | 5 ++---
>>>>>  drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 1 +
>>>>>  drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 32 ++++++++++++---------------
>>>>>  4 files changed, 18 insertions(+), 23 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>>> index f7fceb6..a760b6e 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>>> @@ -1192,8 +1192,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>>>>>      job->uf_sequence = seq;
>>>>>
>>>>>      amdgpu_job_free_resources(job);
>>>>> -   amdgpu_ring_priority_get(job->ring,
>>>>> -                amd_sched_get_job_priority(&job->base));
>>>>> +   amdgpu_ring_priority_get(job->ring, job->base.s_priority);
>>>>>
>>>>>      trace_amdgpu_cs_ioctl(job);
>>>>>      amd_sched_entity_push_job(&job->base);
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>>> index 0cfc68d..a58e3c5 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>>> @@ -104,7 +104,7 @@ static void amdgpu_job_free_cb(struct amd_sched_job *s_job)
>>>>>  {
>>>>>      struct amdgpu_job *job = container_of(s_job, struct amdgpu_job, base);
>>>>>
>>>>> -   amdgpu_ring_priority_put(job->ring, amd_sched_get_job_priority(s_job));
>>>>> +   amdgpu_ring_priority_put(job->ring, s_job->s_priority);
>>>>>      dma_fence_put(job->fence);
>>>>>      amdgpu_sync_free(&job->sync);
>>>>>      amdgpu_sync_free(&job->dep_sync);
>>>>> @@ -141,8 +141,7 @@ int amdgpu_job_submit(struct amdgpu_job *job, struct amdgpu_ring *ring,
>>>>>      job->fence_ctx = entity->fence_context;
>>>>>      *f = dma_fence_get(&job->base.s_fence->finished);
>>>>>      amdgpu_job_free_resources(job);
>>>>> -   amdgpu_ring_priority_get(job->ring,
>>>>> -                amd_sched_get_job_priority(&job->base));
>>>>> +   amdgpu_ring_priority_get(job->ring, job->base.s_priority);
>>>>>      amd_sched_entity_push_job(&job->base);
>>>>>
>>>>>      return 0;
>>>>> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
>>>>> index e4d3b4e..1bbbce2 100644
>>>>> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
>>>>> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
>>>>> @@ -529,6 +529,7 @@ int amd_sched_job_init(struct amd_sched_job *job,
>>>>>  {
>>>>>      job->sched = sched;
>>>>>      job->s_entity = entity;
>>>>> +   job->s_priority = entity->rq - sched->sched_rq;
>>>>>      job->s_fence = amd_sched_fence_create(entity, owner);
>>>>>      if (!job->s_fence)
>>>>>          return -ENOMEM;
>>>>> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
>>>>> index 52c8e54..3f75b45 100644
>>>>> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
>>>>> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
>>>>> @@ -30,6 +30,19 @@
>>>>>  struct amd_gpu_scheduler;
>>>>>  struct amd_sched_rq;
>>>>>
>>>>> +enum amd_sched_priority {
>>>>> +   AMD_SCHED_PRIORITY_MIN,
>>>>> +   AMD_SCHED_PRIORITY_LOW = AMD_SCHED_PRIORITY_MIN,
>>>>> +   AMD_SCHED_PRIORITY_NORMAL,
>>>>> +   AMD_SCHED_PRIORITY_HIGH_SW,
>>>>> +   AMD_SCHED_PRIORITY_HIGH_HW,
>>>>> +   AMD_SCHED_PRIORITY_KERNEL,
>>>>> +   AMD_SCHED_PRIORITY_MAX,
>>>>> +   AMD_SCHED_PRIORITY_INVALID = -1,
>>>>> +   AMD_SCHED_PRIORITY_UNSET = -2
>>>>> +};
>>>>> +
>>>>> +
>>>>>  /**
>>>>>   * A scheduler entity is a wrapper around a job queue or a group
>>>>>   * of other entities. Entities take turns emitting jobs from their
>>>>> @@ -83,6 +96,7 @@ struct amd_sched_job {
>>>>>      struct delayed_work       work_tdr;
>>>>>      uint64_t           id;
>>>>>      atomic_t           karma;
>>>>> +   enum amd_sched_priority s_priority;
>>>>>  };
>>>>>
>>>>>  extern const struct dma_fence_ops amd_sched_fence_ops_scheduled;
>>>>> @@ -114,18 +128,6 @@ struct amd_sched_backend_ops {
>>>>>      void (*free_job)(struct amd_sched_job *sched_job);
>>>>>  };
>>>>>
>>>>> -enum amd_sched_priority {
>>>>> -   AMD_SCHED_PRIORITY_MIN,
>>>>> -   AMD_SCHED_PRIORITY_LOW = AMD_SCHED_PRIORITY_MIN,
>>>>> -   AMD_SCHED_PRIORITY_NORMAL,
>>>>> -   AMD_SCHED_PRIORITY_HIGH_SW,
>>>>> -   AMD_SCHED_PRIORITY_HIGH_HW,
>>>>> -   AMD_SCHED_PRIORITY_KERNEL,
>>>>> -   AMD_SCHED_PRIORITY_MAX,
>>>>> -   AMD_SCHED_PRIORITY_INVALID = -1,
>>>>> -   AMD_SCHED_PRIORITY_UNSET = -2
>>>>> -};
>>>>> -
>>>>>  /**
>>>>>   * One scheduler is implemented for each hardware ring
>>>>>  */
>>>>> @@ -176,10 +178,4 @@ bool amd_sched_dependency_optimized(struct dma_fence* fence,
>>>>>                      struct amd_sched_entity *entity);
>>>>>  void amd_sched_job_kickout(struct amd_sched_job *s_job);
>>>>>
>>>>> -static inline enum amd_sched_priority
>>>>> -amd_sched_get_job_priority(struct amd_sched_job *job)
>>>>> -{
>>>>> -   return (job->s_entity->rq - job->sched->sched_rq);
>>>>> -}
>>>>> -
>>>>>  #endif
>>>>>
>>>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
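
As a side note on the amd_sched_job_init() hunk above: sched->sched_rq is
an array with one run queue per amd_sched_priority level and entity->rq
points at one of its elements, so the pointer difference is exactly the
priority index that amd_sched_get_job_priority() used to recompute. A
small illustrative helper, assuming the 4.14-era scheduler structures;
the helper itself and its range check are not part of the patch.

#include "gpu_scheduler.h"

/*
 * Illustrative only: recover an entity's priority level from its
 * run-queue pointer.  sched->sched_rq[] has AMD_SCHED_PRIORITY_MAX
 * entries, one per priority, so the index of entity->rq within that
 * array is the priority the entity was created with.
 */
static enum amd_sched_priority
amd_sched_entity_priority(struct amd_gpu_scheduler *sched,
                          struct amd_sched_entity *entity)
{
        ptrdiff_t idx = entity->rq - sched->sched_rq;

        /* The entity must belong to one of this scheduler's run queues. */
        WARN_ON(idx < 0 || idx >= AMD_SCHED_PRIORITY_MAX);
        return idx;
}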