---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++---
drivers/gpu/drm/etnaviv/etnaviv_sched.c | 8 +--
drivers/gpu/drm/scheduler/sched_main.c | 86 +++++++++++++++++++-----------
drivers/gpu/drm/v3d/v3d_sched.c | 11 ++--
include/drm/gpu_scheduler.h | 10 ++--
5 files changed, 83 insertions(+), 49 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index ef36cc5..42111d5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3292,17 +3292,16 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
/* block all schedulers and reset given job's ring */
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = adev->rings[i];
+ bool park_only = job && job->base.sched != &ring->sched;
if (!ring || !ring->sched.thread)
continue;
- kthread_park(ring->sched.thread);
+ drm_sched_stop(&ring->sched, job ? &job->base : NULL, park_only);
- if (job && job->base.sched != &ring->sched)
+ if (park_only)
continue;
- drm_sched_hw_job_reset(&ring->sched, job ? &job->base : NULL);
-
/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
amdgpu_fence_driver_force_completion(ring);
}
@@ -3445,6 +3444,7 @@ static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev,
struct amdgpu_job *job)
{
int i;
+ bool unpark_only;
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = adev->rings[i];
@@ -3456,10 +3456,13 @@ static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev,
* or all rings (in the case @job is NULL)
* after above amdgpu_reset accomplished
*/
- if ((!job || job->base.sched == &ring->sched) && !adev->asic_reset_res)
- drm_sched_job_recovery(&ring->sched);
+ unpark_only = (job && job->base.sched != &ring->sched) ||
+ adev->asic_reset_res;
+
+ if (!unpark_only)
+ drm_sched_resubmit_jobs(&ring->sched);
- kthread_unpark(ring->sched.thread);
+ drm_sched_start(&ring->sched, unpark_only);
}
if (!amdgpu_device_has_dc_support(adev)) {
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
index 49a6763..fab3b51 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
@@ -109,16 +109,16 @@ static void etnaviv_sched_timedout_job(struct drm_sched_job *sched_job)
}
/* block scheduler */
- kthread_park(gpu->sched.thread);
- drm_sched_hw_job_reset(&gpu->sched, sched_job);
+ drm_sched_stop(&gpu->sched, sched_job, false);
/* get the GPU back into the init state */
etnaviv_core_dump(gpu);
etnaviv_gpu_recover_hang(gpu);
+ drm_sched_resubmit_jobs(&gpu->sched);
+
/* restart scheduler after GPU is usable again */
- drm_sched_job_recovery(&gpu->sched);
- kthread_unpark(gpu->sched.thread);
+ drm_sched_start(&gpu->sched);
}
static void etnaviv_sched_free_job(struct drm_sched_job *sched_job)
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index dbb6906..8fb7f86 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -60,8 +60,6 @@
static void drm_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb);
-static void drm_sched_expel_job_unlocked(struct drm_sched_job *s_job);
-
/**
* drm_sched_rq_init - initialize a given run queue struct
*
@@ -342,13 +340,21 @@ static void drm_sched_job_timedout(struct work_struct *work)
* @bad: bad scheduler job
*
*/
-void drm_sched_hw_job_reset(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
+void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad,
+ bool park_only)
{
struct drm_sched_job *s_job;
struct drm_sched_entity *entity, *tmp;
unsigned long flags;
+ struct list_head wait_list;
int i;
+ kthread_park(sched->thread);
+ if (park_only)
+ return;
+
+ INIT_LIST_HEAD(&wait_list);
+
spin_lock_irqsave(&sched->job_list_lock, flags);
list_for_each_entry_reverse(s_job, &sched->ring_mirror_list, node) {
if (s_job->s_fence->parent &&
@@ -358,9 +364,24 @@ void drm_sched_hw_job_reset(struct drm_gpu_scheduler *sched, struct drm_sched_jo
s_job->s_fence->parent = NULL;
atomic_dec(&sched->hw_rq_count);
}
+ else {
+ /* TODO Is it get/put neccessey here ? */
+ dma_fence_get(&s_job->s_fence->finished);
+ list_add(&s_job->finish_node, &wait_list);
+ }
}
spin_unlock_irqrestore(&sched->job_list_lock, flags);
+ /*
+ * Verify all the signaled jobs in mirror list are removed from the ring
+ * We rely on the fact that any finish_work in progress will wait for this
+ * handler to complete before releasing all of the jobs we iterate.
+ */
+ list_for_each_entry(s_job, &wait_list, finish_node) {
+ dma_fence_wait(&s_job->s_fence->finished, false);
+ dma_fence_put(&s_job->s_fence->finished);
+ }
+
if (bad && bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) {
atomic_inc(&bad->karma);
/* don't increase @bad's karma if it's from KERNEL RQ,
@@ -385,7 +406,7 @@ void drm_sched_hw_job_reset(struct drm_gpu_scheduler *sched, struct drm_sched_jo
}
}
}
-EXPORT_SYMBOL(drm_sched_hw_job_reset);
+EXPORT_SYMBOL(drm_sched_stop);
/**
* drm_sched_job_recovery - recover jobs after a reset
@@ -393,14 +414,17 @@ EXPORT_SYMBOL(drm_sched_hw_job_reset);
* @sched: scheduler instance
*
*/
-void drm_sched_job_recovery(struct drm_gpu_scheduler *sched)
+void drm_sched_start(struct drm_gpu_scheduler *sched, bool unpark_only)
{
struct drm_sched_job *s_job, *tmp;
bool found_guilty = false;
unsigned long flags;
int r;
- spin_lock_irqsave(&sched->job_list_lock, flags);
+ if (unpark_only)
+ goto unpark;
+
+ spin_lock_irqsave(&sched->job_list_lock, flags);
list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
struct drm_sched_fence *s_fence = s_job->s_fence;
struct dma_fence *fence;
@@ -414,12 +438,9 @@ void drm_sched_job_recovery(struct drm_gpu_scheduler *sched)
if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
dma_fence_set_error(&s_fence->finished, -ECANCELED);
- spin_unlock_irqrestore(&sched->job_list_lock, flags);
- fence = sched->ops->run_job(s_job);
- atomic_inc(&sched->hw_rq_count);
+ fence = s_job->s_fence->parent;
if (fence) {
- s_fence->parent = dma_fence_get(fence);
r = dma_fence_add_callback(fence, &s_fence->cb,
drm_sched_process_job);
if (r == -ENOENT)
@@ -427,18 +448,35 @@ void drm_sched_job_recovery(struct drm_gpu_scheduler *sched)
else if (r)
DRM_ERROR("fence add callback failed (%d)\n",
r);
- dma_fence_put(fence);
- } else {
- if (s_fence->finished.error < 0)
- drm_sched_expel_job_unlocked(s_job);
+ } else
drm_sched_process_job(NULL, &s_fence->cb);
- }
- spin_lock_irqsave(&sched->job_list_lock, flags);
}
+
drm_sched_start_timeout(sched);
spin_unlock_irqrestore(&sched->job_list_lock, flags);
+
+unpark:
+ kthread_unpark(sched->thread);
}
-EXPORT_SYMBOL(drm_sched_job_recovery);
+EXPORT_SYMBOL(drm_sched_start);
+
+/**
+ * drm_sched_resubmit_jobs - helper to relunch job from mirror ring list
+ *
+ * @sched: scheduler instance
+ *
+ */
+void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched)
+{
+ struct drm_sched_job *s_job, *tmp;
+
+ /*TODO DO we need spinlock here ? */
+ list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
+ s_job->s_fence->parent = sched->ops->run_job(s_job);
+ atomic_inc(&sched->hw_rq_count);
+ }
+}
+EXPORT_SYMBOL(drm_sched_resubmit_jobs);
/**
* drm_sched_job_init - init a scheduler job
@@ -634,26 +672,14 @@ static int drm_sched_main(void *param)
DRM_ERROR("fence add callback failed (%d)\n",
r);
dma_fence_put(fence);
- } else {
- if (s_fence->finished.error < 0)
- drm_sched_expel_job_unlocked(sched_job);
+ } else
drm_sched_process_job(NULL, &s_fence->cb);
- }
wake_up(&sched->job_scheduled);
}
return 0;
}
-static void drm_sched_expel_job_unlocked(struct drm_sched_job *s_job)
-{
- struct drm_gpu_scheduler *sched = s_job->sched;
-
- spin_lock(&sched->job_list_lock);
- list_del_init(&s_job->node);
- spin_unlock(&sched->job_list_lock);
-}
-
/**
* drm_sched_init - Init a gpu scheduler instance
*
diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
index 445b2ef..f99346a 100644
--- a/drivers/gpu/drm/v3d/v3d_sched.c
+++ b/drivers/gpu/drm/v3d/v3d_sched.c
@@ -178,18 +178,19 @@ v3d_job_timedout(struct drm_sched_job *sched_job)
for (q = 0; q < V3D_MAX_QUEUES; q++) {
struct drm_gpu_scheduler *sched = &v3d->queue[q].sched;
- kthread_park(sched->thread);
- drm_sched_hw_job_reset(sched, (sched_job->sched == sched ?
- sched_job : NULL));
+ drm_sched_stop(sched, (sched_job->sched == sched ?
+ sched_job : NULL), false);
}
/* get the GPU back into the init state */
v3d_reset(v3d);
+ for (q = 0; q < V3D_MAX_QUEUES; q++)
+ drm_sched_resubmit_jobs(sched_job->sched);
+
/* Unblock schedulers and restart their jobs. */
for (q = 0; q < V3D_MAX_QUEUES; q++) {
- drm_sched_job_recovery(&v3d->queue[q].sched);
- kthread_unpark(v3d->queue[q].sched.thread);
+ drm_sched_start(&v3d->queue[q].sched, false);
}
mutex_unlock(&v3d->reset_lock);
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 47e1979..c94b592 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -175,6 +175,7 @@ struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);
* finished to remove the job from the
* @drm_gpu_scheduler.ring_mirror_list.
* @node: used to append this struct to the @drm_gpu_scheduler.ring_mirror_list.
+ * @finish_node: used in a list to wait on before resetting the scheduler
* @id: a unique id assigned to each job scheduled on the scheduler.
* @karma: increment on every hang caused by this job. If this exceeds the hang
* limit of the scheduler then the job is marked guilty and will not
@@ -193,6 +194,7 @@ struct drm_sched_job {
struct dma_fence_cb finish_cb;
struct work_struct finish_work;
struct list_head node;
+ struct list_head finish_node;
uint64_t id;
atomic_t karma;
enum drm_sched_priority s_priority;
@@ -298,9 +300,11 @@ int drm_sched_job_init(struct drm_sched_job *job,
void *owner);
void drm_sched_job_cleanup(struct drm_sched_job *job);
void drm_sched_wakeup(struct drm_gpu_scheduler *sched);
-void drm_sched_hw_job_reset(struct drm_gpu_scheduler *sched,
- struct drm_sched_job *job);
-void drm_sched_job_recovery(struct drm_gpu_scheduler *sched);
+void drm_sched_stop(struct drm_gpu_scheduler *sched,
+ struct drm_sched_job *job,
+ bool park_only);
+void drm_sched_start(struct drm_gpu_scheduler *sched, bool unpark_only);
+void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched);
bool drm_sched_dependency_optimized(struct dma_fence* fence,
struct drm_sched_entity *entity);
void drm_sched_fault(struct drm_gpu_scheduler *sched);