[PATCH v6 13/16] drm/panfrost: Don't reset the GPU on job faults unless we really have to

Boris Brezillon <boris.brezillon@xxxxxxxxxxxxx> · Wed, 30 Jun 2021 08:27:48 +0200

If we can recover from a fault without a reset, there's no reason to
issue one.

v3:
* Drop the mention of Valhall requiring a reset on JOB_BUS_FAULT
* Set the fence error to -EINVAL instead of having per-exception
  error codes

Signed-off-by: Boris Brezillon <boris.brezillon@xxxxxxxxxxxxx>
Reviewed-by: Steven Price <steven.price@xxxxxxx>
---
 drivers/gpu/drm/panfrost/panfrost_device.c |  9 +++++++++
 drivers/gpu/drm/panfrost/panfrost_device.h |  2 ++
 drivers/gpu/drm/panfrost/panfrost_job.c    | 16 ++++++++++++++--
 3 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/panfrost/panfrost_device.c b/drivers/gpu/drm/panfrost/panfrost_device.c
index cd76d2ff5034..bd9b7be63b0f 100644
--- a/drivers/gpu/drm/panfrost/panfrost_device.c
+++ b/drivers/gpu/drm/panfrost/panfrost_device.c
@@ -379,6 +379,15 @@ const char *panfrost_exception_name(u32 exception_code)
 	return panfrost_exception_infos[exception_code].name;
 }
 
+bool panfrost_exception_needs_reset(const struct panfrost_device *pfdev,
+				    u32 exception_code)
+{
+	/* Right now, none of the GPUs we support need a reset after a
+	 * fault (so both arguments are unused), but this might change.
+	 */
+	return false;
+}
+
 void panfrost_device_reset(struct panfrost_device *pfdev)
 {
 	panfrost_gpu_soft_reset(pfdev);
diff --git a/drivers/gpu/drm/panfrost/panfrost_device.h b/drivers/gpu/drm/panfrost/panfrost_device.h
index cb5aadf7ae90..68e93b7e5b61 100644
--- a/drivers/gpu/drm/panfrost/panfrost_device.h
+++ b/drivers/gpu/drm/panfrost/panfrost_device.h
@@ -245,6 +245,8 @@ enum drm_panfrost_exception_type {
 };
 
 const char *panfrost_exception_name(u32 exception_code);
+bool panfrost_exception_needs_reset(const struct panfrost_device *pfdev,
+				    u32 exception_code);
 
 static inline void
 panfrost_device_schedule_reset(struct panfrost_device *pfdev)
diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
index 11ff33841caf..cf5f9e8b2a27 100644
--- a/drivers/gpu/drm/panfrost/panfrost_job.c
+++ b/drivers/gpu/drm/panfrost/panfrost_job.c
@@ -482,14 +482,26 @@ static void panfrost_job_handle_irq(struct panfrost_device *pfdev, u32 status)
 		job_write(pfdev, JOB_INT_CLEAR, mask);
 
 		if (status & JOB_INT_MASK_ERR(j)) {
+			u32 js_status = job_read(pfdev, JS_STATUS(j));
+
 			job_write(pfdev, JS_COMMAND_NEXT(j), JS_COMMAND_NOP);
 
 			dev_err(pfdev->dev, "js fault, js=%d, status=%s, head=0x%x, tail=0x%x",
 				j,
-				panfrost_exception_name(job_read(pfdev, JS_STATUS(j))),
+				panfrost_exception_name(js_status),
 				job_read(pfdev, JS_HEAD_LO(j)),
 				job_read(pfdev, JS_TAIL_LO(j)));
-			drm_sched_fault(&pfdev->js->queue[j].sched);
+
+			/* If we need a reset, signal it to the timeout
+			 * handler, otherwise, update the fence error field and
+			 * signal the job fence.
+			 */
+			if (panfrost_exception_needs_reset(pfdev, js_status)) {
+				drm_sched_fault(&pfdev->js->queue[j].sched);
+			} else {
+				dma_fence_set_error(pfdev->jobs[j]->done_fence, -EINVAL);
+				status |= JOB_INT_MASK_DONE(j);
+			}
 		}
 
 		if (status & JOB_INT_MASK_DONE(j)) {
-- 
2.31.1