On 21/06/2021 14:39, Boris Brezillon wrote: > If we can recover from a fault without a reset there's no reason to > issue one. > > Signed-off-by: Boris Brezillon <boris.brezillon@xxxxxxxxxxxxx> > --- > drivers/gpu/drm/panfrost/panfrost_device.c | 9 ++++++ > drivers/gpu/drm/panfrost/panfrost_device.h | 2 ++ > drivers/gpu/drm/panfrost/panfrost_job.c | 35 ++++++++++++++-------- > 3 files changed, 34 insertions(+), 12 deletions(-) > > diff --git a/drivers/gpu/drm/panfrost/panfrost_device.c b/drivers/gpu/drm/panfrost/panfrost_device.c > index 2de011cee258..ac76e8646e97 100644 > --- a/drivers/gpu/drm/panfrost/panfrost_device.c > +++ b/drivers/gpu/drm/panfrost/panfrost_device.c > @@ -383,6 +383,15 @@ int panfrost_exception_to_error(u32 exception_code) > return panfrost_exception_infos[exception_code].error; > } > > +bool panfrost_exception_needs_reset(const struct panfrost_device *pfdev, > + u32 exception_code) > +{ > + /* Right now, none of the GPU we support need a reset, but this > + * might change (e.g. Valhall GPUs require a when a BUS_FAULT occurs). NITs: ^ some ^ reset Or just drop the example for now. > + */ > + return false; > +} > + > void panfrost_device_reset(struct panfrost_device *pfdev) > { > panfrost_gpu_soft_reset(pfdev); > diff --git a/drivers/gpu/drm/panfrost/panfrost_device.h b/drivers/gpu/drm/panfrost/panfrost_device.h > index 498c7b5dccd0..95e6044008d2 100644 > --- a/drivers/gpu/drm/panfrost/panfrost_device.h > +++ b/drivers/gpu/drm/panfrost/panfrost_device.h > @@ -175,6 +175,8 @@ int panfrost_device_suspend(struct device *dev); > > const char *panfrost_exception_name(u32 exception_code); > int panfrost_exception_to_error(u32 exception_code); > +bool panfrost_exception_needs_reset(const struct panfrost_device *pfdev, > + u32 exception_code); > > static inline void > panfrost_device_schedule_reset(struct panfrost_device *pfdev) > diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c > index be5d3e4a1d0a..aedc604d331c 100644 > --- a/drivers/gpu/drm/panfrost/panfrost_job.c > +++ b/drivers/gpu/drm/panfrost/panfrost_job.c > @@ -493,27 +493,38 @@ static irqreturn_t panfrost_job_irq_handler(int irq, void *data) > > if (status & JOB_INT_MASK_ERR(j)) { > enum panfrost_queue_status old_status; > + u32 js_status = job_read(pfdev, JS_STATUS(j)); > > job_write(pfdev, JS_COMMAND_NEXT(j), JS_COMMAND_NOP); > > dev_err(pfdev->dev, "js fault, js=%d, status=%s, head=0x%x, tail=0x%x", > j, > - panfrost_exception_name(job_read(pfdev, JS_STATUS(j))), > + panfrost_exception_name(js_status), > job_read(pfdev, JS_HEAD_LO(j)), > job_read(pfdev, JS_TAIL_LO(j))); > > - /* > - * When the queue is being restarted we don't report > - * faults directly to avoid races between the timeout > - * and reset handlers. panfrost_scheduler_start() will > - * call drm_sched_fault() after the queue has been > - * started if status == FAULT_PENDING. > + /* If we need a reset, signal it to the reset handler, > + * otherwise, update the fence error field and signal > + * the job fence. > */ > - old_status = atomic_cmpxchg(&pfdev->js->queue[j].status, > - PANFROST_QUEUE_STATUS_STARTING, > - PANFROST_QUEUE_STATUS_FAULT_PENDING); > - if (old_status == PANFROST_QUEUE_STATUS_ACTIVE) > - drm_sched_fault(&pfdev->js->queue[j].sched); > + if (panfrost_exception_needs_reset(pfdev, js_status)) { > + /* > + * When the queue is being restarted we don't report > + * faults directly to avoid races between the timeout > + * and reset handlers. panfrost_scheduler_start() will > + * call drm_sched_fault() after the queue has been > + * started if status == FAULT_PENDING. > + */ > + old_status = atomic_cmpxchg(&pfdev->js->queue[j].status, > + PANFROST_QUEUE_STATUS_STARTING, > + PANFROST_QUEUE_STATUS_FAULT_PENDING); > + if (old_status == PANFROST_QUEUE_STATUS_ACTIVE) > + drm_sched_fault(&pfdev->js->queue[j].sched); > + } else { > + dma_fence_set_error(pfdev->jobs[j]->done_fence, > + panfrost_exception_to_error(js_status)); As in the previous patch - at the moment a status of STOPPED or TERMINATED shouldn't actually happen. But the next patch is about to change that! TERMINATED should definitely cause an error on the fence. Steve > + status |= JOB_INT_MASK_DONE(j); > + } > } > > if (status & JOB_INT_MASK_DONE(j)) { >