On Fri, 25 Jun 2021 15:33:21 +0200
Boris Brezillon <boris.brezillon@xxxxxxxxxxxxx> wrote:

> @@ -379,57 +370,73 @@ void panfrost_job_enable_interrupts(struct panfrost_device *pfdev)
>  	job_write(pfdev, JOB_INT_MASK, irq_mask);
>  }
>  
> -static bool panfrost_scheduler_stop(struct panfrost_queue_state *queue,
> -				    struct drm_sched_job *bad)
> +static void panfrost_reset(struct panfrost_device *pfdev,
> +			   struct drm_sched_job *bad)
>  {
> -	enum panfrost_queue_status old_status;
> -	bool stopped = false;
> +	unsigned int i;
> +	bool cookie;
>  
> -	mutex_lock(&queue->lock);
> -	old_status = atomic_xchg(&queue->status,
> -				 PANFROST_QUEUE_STATUS_STOPPED);
> -	if (old_status == PANFROST_QUEUE_STATUS_STOPPED)
> -		goto out;
> +	if (WARN_ON(!atomic_read(&pfdev->reset.pending)))
> +		return;
> +
> +	/* Stop the schedulers.
> +	 *
> +	 * FIXME: We temporarily get out of the dma_fence_signalling section
> +	 * because the cleanup path generates lockdep splats when taking locks
> +	 * to release job resources. We should rework the code to follow this
> +	 * pattern:
> +	 *
> +	 *	try_lock
> +	 *	if (locked)
> +	 *		release
> +	 *	else
> +	 *		schedule_work_to_release_later
> +	 */
> +	for (i = 0; i < NUM_JOB_SLOTS; i++)
> +		drm_sched_stop(&pfdev->js->queue[i].sched, bad);
> +
> +	cookie = dma_fence_begin_signalling();
>  
> -	WARN_ON(old_status != PANFROST_QUEUE_STATUS_ACTIVE);
> -	drm_sched_stop(&queue->sched, bad);
>  	if (bad)
>  		drm_sched_increase_karma(bad);
>  
> -	stopped = true;
> +	spin_lock(&pfdev->js->job_lock);
> +	for (i = 0; i < NUM_JOB_SLOTS; i++) {
> +		if (pfdev->jobs[i]) {
> +			pm_runtime_put_noidle(pfdev->dev);
> +			panfrost_devfreq_record_idle(&pfdev->pfdevfreq);
> +			pfdev->jobs[i] = NULL;
> +		}
> +	}
> +	spin_unlock(&pfdev->js->job_lock);
>  
> -	/*
> -	 * Set the timeout to max so the timer doesn't get started
> -	 * when we return from the timeout handler (restored in
> -	 * panfrost_scheduler_start()).
> +	panfrost_device_reset(pfdev);
> +
> +	/* GPU has been reset, we can cancel timeout/fault work that may have
> +	 * been queued in the meantime and clear the reset pending bit.
>  	 */
> -	queue->sched.timeout = MAX_SCHEDULE_TIMEOUT;
> +	atomic_set(&pfdev->reset.pending, 0);
> +	cancel_work_sync(&pfdev->reset.work);

This introduces a deadlock: panfrost_reset() might be called from the
reset handler itself, and cancel_work_sync() waits for that handler to
return. Unfortunately there's no cancel_work() variant, so I'll just
remove the WARN_ON(!atomic_read(&pfdev->reset.pending)) and return
early when the pending bit is already cleared.

> +	for (i = 0; i < NUM_JOB_SLOTS; i++)
> +		cancel_delayed_work(&pfdev->js->queue[i].sched.work_tdr);
>  
> -out:
> -	mutex_unlock(&queue->lock);
>  
> -	return stopped;
> -}
> +	/* Now resubmit jobs that were previously queued but didn't have a
> +	 * chance to finish.
> +	 * FIXME: We temporarily get out of the DMA fence signalling section
> +	 * while resubmitting jobs because the job submission logic will
> +	 * allocate memory with the GFP_KERNEL flag which can trigger memory
> +	 * reclaim and exposes a lock ordering issue.
> +	 */
> +	dma_fence_end_signalling(cookie);
> +	for (i = 0; i < NUM_JOB_SLOTS; i++)
> +		drm_sched_resubmit_jobs(&pfdev->js->queue[i].sched);
> +	cookie = dma_fence_begin_signalling();
>  
> -static void panfrost_scheduler_start(struct panfrost_queue_state *queue)
> -{
> -	enum panfrost_queue_status old_status;
> +	for (i = 0; i < NUM_JOB_SLOTS; i++)
> +		drm_sched_start(&pfdev->js->queue[i].sched, true);
>  
> -	mutex_lock(&queue->lock);
> -	old_status = atomic_xchg(&queue->status,
> -				 PANFROST_QUEUE_STATUS_STARTING);
> -	WARN_ON(old_status != PANFROST_QUEUE_STATUS_STOPPED);
> -
> -	/* Restore the original timeout before starting the scheduler. */
> -	queue->sched.timeout = msecs_to_jiffies(JOB_TIMEOUT_MS);
> -	drm_sched_resubmit_jobs(&queue->sched);
> -	drm_sched_start(&queue->sched, true);
> -	old_status = atomic_xchg(&queue->status,
> -				 PANFROST_QUEUE_STATUS_ACTIVE);
> -	if (old_status == PANFROST_QUEUE_STATUS_FAULT_PENDING)
> -		drm_sched_fault(&queue->sched);
> -
> -	mutex_unlock(&queue->lock);
> +	dma_fence_end_signalling(cookie);
>  }
> 
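Roughly, what I have in mind (just a sketch of the shape of the change,
not the final patch; everything elided stays exactly as in the hunk
above):

static void panfrost_reset(struct panfrost_device *pfdev,
                           struct drm_sched_job *bad)
{
        unsigned int i;
        bool cookie;

        /* Return early instead of WARN_ON(): a concurrent caller may
         * have already handled (and cleared) the pending reset.
         */
        if (!atomic_read(&pfdev->reset.pending))
                return;

        /* ... stop the schedulers, enter the dma_fence signalling
         * section (cookie), clean up in-flight jobs and reset the GPU,
         * as in the hunk above ...
         */

        /* GPU has been reset; clear the pending bit and cancel the
         * scheduler timeout work. No cancel_work_sync(&pfdev->reset.work)
         * here: panfrost_reset() may itself be running from that work
         * handler, and waiting for the handler to finish would deadlock.
         */
        atomic_set(&pfdev->reset.pending, 0);
        for (i = 0; i < NUM_JOB_SLOTS; i++)
                cancel_delayed_work(&pfdev->js->queue[i].sched.work_tdr);

        /* ... resubmit the queued jobs and restart the schedulers as in
         * the hunk above ...
         */
}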