On Tue, 29 Jun 2021 09:35:03 +0200 Boris Brezillon <boris.brezillon@xxxxxxxxxxxxx> wrote: > @@ -379,57 +370,72 @@ void panfrost_job_enable_interrupts(struct panfrost_device *pfdev) > job_write(pfdev, JOB_INT_MASK, irq_mask); > } > > -static bool panfrost_scheduler_stop(struct panfrost_queue_state *queue, > - struct drm_sched_job *bad) > +static void panfrost_reset(struct panfrost_device *pfdev, > + struct drm_sched_job *bad) > { > - enum panfrost_queue_status old_status; > - bool stopped = false; > + unsigned int i; > + bool cookie; > > - mutex_lock(&queue->lock); > - old_status = atomic_xchg(&queue->status, > - PANFROST_QUEUE_STATUS_STOPPED); > - if (old_status == PANFROST_QUEUE_STATUS_STOPPED) > - goto out; > + if (!atomic_read(&pfdev->reset.pending)) > + return; > + > + /* Stop the schedulers. > + * > + * FIXME: We temporarily get out of the dma_fence_signalling section > + * because the cleanup path generate lockdep splats when taking locks > + * to release job resources. We should rework the code to follow this > + * pattern: > + * > + * try_lock > + * if (locked) > + * release > + * else > + * schedule_work_to_release_later > + */ > + for (i = 0; i < NUM_JOB_SLOTS; i++) > + drm_sched_stop(&pfdev->js->queue[i].sched, bad); > + > + cookie = dma_fence_begin_signalling(); > > - WARN_ON(old_status != PANFROST_QUEUE_STATUS_ACTIVE); > - drm_sched_stop(&queue->sched, bad); > if (bad) > drm_sched_increase_karma(bad); > > - stopped = true; > + spin_lock(&pfdev->js->job_lock); > + for (i = 0; i < NUM_JOB_SLOTS; i++) { > + if (pfdev->jobs[i]) { > + pm_runtime_put_noidle(pfdev->dev); > + panfrost_devfreq_record_idle(&pfdev->pfdevfreq); > + pfdev->jobs[i] = NULL; > + } > + } > + spin_unlock(&pfdev->js->job_lock); > > - /* > - * Set the timeout to max so the timer doesn't get started > - * when we return from the timeout handler (restored in > - * panfrost_scheduler_start()). 
> + panfrost_device_reset(pfdev); > + > + /* GPU has been reset, we can cancel timeout/fault work that may have > + * been queued in the meantime and clear the reset pending bit. > */ > - queue->sched.timeout = MAX_SCHEDULE_TIMEOUT; > + atomic_set(&pfdev->reset.pending, 0); > + for (i = 0; i < NUM_JOB_SLOTS; i++) > + cancel_delayed_work(&pfdev->js->queue[i].sched.work_tdr); > Those cancel_delayed_work() calls are useless: drm_sched_stop() has already canceled those works. I'll get rid of them in v6.