+dri-devel. Please be sure to Cc dri-devel when you send out GPU scheduler patches. On Thu, Mar 11, 2021 at 10:57 PM Jack Zhang <Jack.Zhang1@xxxxxxx> wrote: > > re-insert Bailing jobs to avoid memory leak. > > Signed-off-by: Jack Zhang <Jack.Zhang1@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +++- > drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 8 ++++++-- > drivers/gpu/drm/scheduler/sched_main.c | 8 +++++++- > include/drm/gpu_scheduler.h | 1 + > 4 files changed, 17 insertions(+), 4 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 79b9cc73763f..86463b0f936e 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -4815,8 +4815,10 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, > job ? job->base.id : -1); > > /* even we skipped this reset, still need to set the job to guilty */ > - if (job) > + if (job) { > drm_sched_increase_karma(&job->base); > + r = DRM_GPU_SCHED_STAT_BAILING; > + } > goto skip_recovery; > } > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > index 759b34799221..41390bdacd9e 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > @@ -34,6 +34,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) > struct amdgpu_job *job = to_amdgpu_job(s_job); > struct amdgpu_task_info ti; > struct amdgpu_device *adev = ring->adev; > + int ret; > > memset(&ti, 0, sizeof(struct amdgpu_task_info)); > > @@ -52,8 +53,11 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) > ti.process_name, ti.tgid, ti.task_name, ti.pid); > > if (amdgpu_device_should_recover_gpu(ring->adev)) { > - amdgpu_device_gpu_recover(ring->adev, job); > - return DRM_GPU_SCHED_STAT_NOMINAL; > + ret = amdgpu_device_gpu_recover(ring->adev, job); > + if (ret == 
DRM_GPU_SCHED_STAT_BAILING) > + return DRM_GPU_SCHED_STAT_BAILING; > + else > + return DRM_GPU_SCHED_STAT_NOMINAL; > } else { > drm_sched_suspend_timeout(&ring->sched); > if (amdgpu_sriov_vf(adev)) > diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c > index 92d8de24d0a1..a44f621fb5c4 100644 > --- a/drivers/gpu/drm/scheduler/sched_main.c > +++ b/drivers/gpu/drm/scheduler/sched_main.c > @@ -314,6 +314,7 @@ static void drm_sched_job_timedout(struct work_struct *work) > { > struct drm_gpu_scheduler *sched; > struct drm_sched_job *job; > + int ret; > > sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work); > > @@ -331,8 +332,13 @@ static void drm_sched_job_timedout(struct work_struct *work) > list_del_init(&job->list); > spin_unlock(&sched->job_list_lock); > > - job->sched->ops->timedout_job(job); > + ret = job->sched->ops->timedout_job(job); > > + if (ret == DRM_GPU_SCHED_STAT_BAILING) { > + spin_lock(&sched->job_list_lock); > + list_add(&job->node, &sched->ring_mirror_list); > + spin_unlock(&sched->job_list_lock); > + } > /* > * Guilty job did complete and hence needs to be manually removed > * See drm_sched_stop doc. > diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h > index 4ea8606d91fe..8093ac2427ef 100644 > --- a/include/drm/gpu_scheduler.h > +++ b/include/drm/gpu_scheduler.h > @@ -210,6 +210,7 @@ enum drm_gpu_sched_stat { > DRM_GPU_SCHED_STAT_NONE, /* Reserve 0 */ > DRM_GPU_SCHED_STAT_NOMINAL, > DRM_GPU_SCHED_STAT_ENODEV, > + DRM_GPU_SCHED_STAT_BAILING, > }; > > /** > -- > 2.25.1 > > _______________________________________________ > amd-gfx mailing list > amd-gfx@xxxxxxxxxxxxxxxxxxxxx > https://lists.freedesktop.org/mailman/listinfo/amd-gfx _______________________________________________ dri-devel mailing list dri-devel@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/dri-devel