On Fri, 25 Jun 2021 15:33:21 +0200
Boris Brezillon <boris.brezillon@xxxxxxxxxxxxx> wrote:

> @@ -379,57 +370,73 @@ void panfrost_job_enable_interrupts(struct panfrost_device *pfdev)
>  	job_write(pfdev, JOB_INT_MASK, irq_mask);
>  }
>  
> -static bool panfrost_scheduler_stop(struct panfrost_queue_state *queue,
> -				    struct drm_sched_job *bad)
> +static void panfrost_reset(struct panfrost_device *pfdev,
> +			   struct drm_sched_job *bad)
>  {
> -	enum panfrost_queue_status old_status;
> -	bool stopped = false;
> +	unsigned int i;
> +	bool cookie;
>  
> -	mutex_lock(&queue->lock);
> -	old_status = atomic_xchg(&queue->status,
> -				 PANFROST_QUEUE_STATUS_STOPPED);
> -	if (old_status == PANFROST_QUEUE_STATUS_STOPPED)
> -		goto out;
> +	if (WARN_ON(!atomic_read(&pfdev->reset.pending)))
> +		return;
> +
> +	/* Stop the schedulers.
> +	 *
> +	 * FIXME: We temporarily get out of the dma_fence_signalling section
> +	 * because the cleanup path generates lockdep splats when taking locks
> +	 * to release job resources. We should rework the code to follow this
> +	 * pattern:
> +	 *
> +	 *	try_lock
> +	 *	if (locked)
> +	 *		release
> +	 *	else
> +	 *		schedule_work_to_release_later
> +	 */
> +	for (i = 0; i < NUM_JOB_SLOTS; i++)
> +		drm_sched_stop(&pfdev->js->queue[i].sched, bad);
> +
> +	cookie = dma_fence_begin_signalling();
>  
> -	WARN_ON(old_status != PANFROST_QUEUE_STATUS_ACTIVE);
> -	drm_sched_stop(&queue->sched, bad);
>  	if (bad)
>  		drm_sched_increase_karma(bad);
>  
> -	stopped = true;
> +	spin_lock(&pfdev->js->job_lock);
> +	for (i = 0; i < NUM_JOB_SLOTS; i++) {
> +		if (pfdev->jobs[i]) {
> +			pm_runtime_put_noidle(pfdev->dev);
> +			panfrost_devfreq_record_idle(&pfdev->pfdevfreq);
> +			pfdev->jobs[i] = NULL;
> +		}
> +	}
> +	spin_unlock(&pfdev->js->job_lock);
>  
> -	/*
> -	 * Set the timeout to max so the timer doesn't get started
> -	 * when we return from the timeout handler (restored in
> -	 * panfrost_scheduler_start()).
> +	panfrost_device_reset(pfdev);
> +
> +	/* GPU has been reset, we can cancel timeout/fault work that may have
> +	 * been queued in the meantime and clear the reset pending bit.
>  	 */
> -	queue->sched.timeout = MAX_SCHEDULE_TIMEOUT;
> +	atomic_set(&pfdev->reset.pending, 0);
> +	cancel_work_sync(&pfdev->reset.work);

This introduces a deadlock: panfrost_reset() might be called from the
reset handler itself, and cancel_work_sync() waits for that handler to
return. Unfortunately there's no cancel_work() variant, so I'll just
remove the WARN_ON(!atomic_read(&pfdev->reset.pending)) and return
early when the pending bit is already cleared.

> +	for (i = 0; i < NUM_JOB_SLOTS; i++)
> +		cancel_delayed_work(&pfdev->js->queue[i].sched.work_tdr);
>  
> -out:
> -	mutex_unlock(&queue->lock);
>  
> -	return stopped;
> -}
> +	/* Now resubmit jobs that were previously queued but didn't have a
> +	 * chance to finish.
> +	 * FIXME: We temporarily get out of the DMA fence signalling section
> +	 * while resubmitting jobs because the job submission logic will
> +	 * allocate memory with the GFP_KERNEL flag which can trigger memory
> +	 * reclaim and exposes a lock ordering issue.
> +	 */
> +	dma_fence_end_signalling(cookie);
> +	for (i = 0; i < NUM_JOB_SLOTS; i++)
> +		drm_sched_resubmit_jobs(&pfdev->js->queue[i].sched);
> +	cookie = dma_fence_begin_signalling();
>  
> -static void panfrost_scheduler_start(struct panfrost_queue_state *queue)
> -{
> -	enum panfrost_queue_status old_status;
> +	for (i = 0; i < NUM_JOB_SLOTS; i++)
> +		drm_sched_start(&pfdev->js->queue[i].sched, true);
>  
> -	mutex_lock(&queue->lock);
> -	old_status = atomic_xchg(&queue->status,
> -				 PANFROST_QUEUE_STATUS_STARTING);
> -	WARN_ON(old_status != PANFROST_QUEUE_STATUS_STOPPED);
> -
> -	/* Restore the original timeout before starting the scheduler. */
> -	queue->sched.timeout = msecs_to_jiffies(JOB_TIMEOUT_MS);
> -	drm_sched_resubmit_jobs(&queue->sched);
> -	drm_sched_start(&queue->sched, true);
> -	old_status = atomic_xchg(&queue->status,
> -				 PANFROST_QUEUE_STATUS_ACTIVE);
> -	if (old_status == PANFROST_QUEUE_STATUS_FAULT_PENDING)
> -		drm_sched_fault(&queue->sched);
> -
> -	mutex_unlock(&queue->lock);
> +	dma_fence_end_signalling(cookie);
>  }
> 
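Roughly, what I have in mind (just a sketch of the shape of the change,
not the final patch; everything elided stays exactly as in the hunk
above):

static void panfrost_reset(struct panfrost_device *pfdev,
                           struct drm_sched_job *bad)
{
        unsigned int i;
        bool cookie;

        /* Return early instead of WARN_ON(): a concurrent caller may
         * have already handled (and cleared) the pending reset.
         */
        if (!atomic_read(&pfdev->reset.pending))
                return;

        /* ... stop the schedulers, enter the dma_fence signalling
         * section (cookie), clean up in-flight jobs and reset the GPU,
         * as in the hunk above ...
         */

        /* GPU has been reset; clear the pending bit and cancel the
         * scheduler timeout work. No cancel_work_sync(&pfdev->reset.work)
         * here: panfrost_reset() may itself be running from that work
         * handler, and waiting for the handler to finish would deadlock.
         */
        atomic_set(&pfdev->reset.pending, 0);
        for (i = 0; i < NUM_JOB_SLOTS; i++)
                cancel_delayed_work(&pfdev->js->queue[i].sched.work_tdr);

        /* ... resubmit the queued jobs and restart the schedulers as in
         * the hunk above ...
         */
}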