Hi, Andrey/Crhistian and Team,
I didn't receive the reviewer's message from maintainers on panfrost
driver for several days.
Due to this patch is urgent for my current working project.
Would you please help to give some review ideas?
Many Thanks,
Jack
-----Original Message-----
From: Zhang, Jack (Jian)
Sent: Tuesday, March 16, 2021 3:20 PM
To: dri-devel@xxxxxxxxxxxxxxxxxxxxx; amd-gfx@xxxxxxxxxxxxxxxxxxxxx;
Koenig, Christian <Christian.Koenig@xxxxxxx>; Grodzovsky, Andrey
<Andrey.Grodzovsky@xxxxxxx>; Liu, Monk <Monk.Liu@xxxxxxx>; Deng,
Emily <Emily.Deng@xxxxxxx>; Rob Herring <robh@xxxxxxxxxx>; Tomeu
Vizoso <tomeu.vizoso@xxxxxxxxxxxxx>; Steven Price <steven.price@xxxxxxx>
Subject: RE: [PATCH v3] drm/scheduler re-insert Bailing job to avoid
memleak
[AMD Public Use]
Ping
-----Original Message-----
From: Zhang, Jack (Jian)
Sent: Monday, March 15, 2021 1:24 PM
To: Jack Zhang <Jack.Zhang1@xxxxxxx>;
dri-devel@xxxxxxxxxxxxxxxxxxxxx; amd-gfx@xxxxxxxxxxxxxxxxxxxxx;
Koenig, Christian <Christian.Koenig@xxxxxxx>; Grodzovsky, Andrey
<Andrey.Grodzovsky@xxxxxxx>; Liu, Monk <Monk.Liu@xxxxxxx>; Deng,
Emily <Emily.Deng@xxxxxxx>; Rob Herring <robh@xxxxxxxxxx>; Tomeu
Vizoso <tomeu.vizoso@xxxxxxxxxxxxx>; Steven Price <steven.price@xxxxxxx>
Subject: RE: [PATCH v3] drm/scheduler re-insert Bailing job to avoid
memleak
[AMD Public Use]
Hi, Rob/Tomeu/Steven,
Would you please help to review this patch for panfrost driver?
Thanks,
Jack Zhang
-----Original Message-----
From: Jack Zhang <Jack.Zhang1@xxxxxxx>
Sent: Monday, March 15, 2021 1:21 PM
To: dri-devel@xxxxxxxxxxxxxxxxxxxxx; amd-gfx@xxxxxxxxxxxxxxxxxxxxx;
Koenig, Christian <Christian.Koenig@xxxxxxx>; Grodzovsky, Andrey
<Andrey.Grodzovsky@xxxxxxx>; Liu, Monk <Monk.Liu@xxxxxxx>; Deng,
Emily <Emily.Deng@xxxxxxx>
Cc: Zhang, Jack (Jian) <Jack.Zhang1@xxxxxxx>
Subject: [PATCH v3] drm/scheduler re-insert Bailing job to avoid memleak
re-insert Bailing jobs to avoid memory leak.
V2: move re-insert step to drm/scheduler logic
V3: add panfrost's return value for bailing jobs in case it hits the
memleak issue.
Signed-off-by: Jack Zhang <Jack.Zhang1@xxxxxxx>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +++-
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 8 ++++++--
drivers/gpu/drm/panfrost/panfrost_job.c | 4 ++--
drivers/gpu/drm/scheduler/sched_main.c | 8 +++++++-
include/drm/gpu_scheduler.h | 1 +
5 files changed, 19 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 79b9cc73763f..86463b0f936e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4815,8 +4815,10 @@ int amdgpu_device_gpu_recover(struct
amdgpu_device *adev,
job ? job->base.id : -1);
/* even we skipped this reset, still need to set the job
to guilty */
- if (job)
+ if (job) {
drm_sched_increase_karma(&job->base);
+ r = DRM_GPU_SCHED_STAT_BAILING;
+ }
goto skip_recovery;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 759b34799221..41390bdacd9e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -34,6 +34,7 @@ static enum drm_gpu_sched_stat
amdgpu_job_timedout(struct drm_sched_job *s_job)
struct amdgpu_job *job = to_amdgpu_job(s_job);
struct amdgpu_task_info ti;
struct amdgpu_device *adev = ring->adev;
+ int ret;
memset(&ti, 0, sizeof(struct amdgpu_task_info));
@@ -52,8 +53,11 @@ static enum drm_gpu_sched_stat
amdgpu_job_timedout(struct drm_sched_job *s_job)
ti.process_name, ti.tgid, ti.task_name, ti.pid);
if (amdgpu_device_should_recover_gpu(ring->adev)) {
- amdgpu_device_gpu_recover(ring->adev, job);
- return DRM_GPU_SCHED_STAT_NOMINAL;
+ ret = amdgpu_device_gpu_recover(ring->adev, job);
+ if (ret == DRM_GPU_SCHED_STAT_BAILING)
+ return DRM_GPU_SCHED_STAT_BAILING;
+ else
+ return DRM_GPU_SCHED_STAT_NOMINAL;
} else {
drm_sched_suspend_timeout(&ring->sched);
if (amdgpu_sriov_vf(adev))
diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c
b/drivers/gpu/drm/panfrost/panfrost_job.c
index 6003cfeb1322..e2cb4f32dae1 100644
--- a/drivers/gpu/drm/panfrost/panfrost_job.c
+++ b/drivers/gpu/drm/panfrost/panfrost_job.c
@@ -444,7 +444,7 @@ static enum drm_gpu_sched_stat
panfrost_job_timedout(struct drm_sched_job
* spurious. Bail out.
*/
if (dma_fence_is_signaled(job->done_fence))
- return DRM_GPU_SCHED_STAT_NOMINAL;
+ return DRM_GPU_SCHED_STAT_BAILING;
dev_err(pfdev->dev, "gpu sched timeout, js=%d, config=0x%x,
status=0x%x, head=0x%x, tail=0x%x, sched_job=%p",
js,
@@ -456,7 +456,7 @@ static enum drm_gpu_sched_stat
panfrost_job_timedout(struct drm_sched_job
/* Scheduler is already stopped, nothing to do. */
if (!panfrost_scheduler_stop(&pfdev->js->queue[js], sched_job))
- return DRM_GPU_SCHED_STAT_NOMINAL;
+ return DRM_GPU_SCHED_STAT_BAILING;
/* Schedule a reset if there's no reset in progress. */
if (!atomic_xchg(&pfdev->reset.pending, 1)) diff --git
a/drivers/gpu/drm/scheduler/sched_main.c
b/drivers/gpu/drm/scheduler/sched_main.c
index 92d8de24d0a1..a44f621fb5c4 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -314,6 +314,7 @@ static void drm_sched_job_timedout(struct
work_struct *work) {
struct drm_gpu_scheduler *sched;
struct drm_sched_job *job;
+ int ret;
sched = container_of(work, struct drm_gpu_scheduler,
work_tdr.work);
@@ -331,8 +332,13 @@ static void drm_sched_job_timedout(struct
work_struct *work)
list_del_init(&job->list);
spin_unlock(&sched->job_list_lock);
- job->sched->ops->timedout_job(job);
+ ret = job->sched->ops->timedout_job(job);
+ if (ret == DRM_GPU_SCHED_STAT_BAILING) {
+ spin_lock(&sched->job_list_lock);
+ list_add(&job->node, &sched->ring_mirror_list);
+ spin_unlock(&sched->job_list_lock);
+ }