On Fri, Dec 11, 2015 at 11:33:05AM +0000, Chris Wilson wrote:
> Reporting -EIO from i915_wait_request() has proven very troublesome
> over the years, with numerous hard-to-reproduce bugs cropping up in
> the corner case where a reset occurs and the code wasn't expecting
> such an error.
> 
> If we reset the GPU, or have detected a hang and wish to reset the
> GPU, the request is forcibly completed and the wait broken. Currently,
> we report either -EAGAIN or -EIO in order for the caller to retreat
> and restart the wait (if appropriate) after dropping and then
> reacquiring the struct_mutex (essential to allow the GPU reset to
> proceed). However, if we take the view that the request is complete
> (no further work will be done on it by the GPU because it is dead and
> soon to be reset), then we can proceed with the task at hand and then
> drop the struct_mutex, allowing the reset to occur. This transfers the
> burden of checking whether it is safe to proceed to the caller; in all
> but one instance it is safe to do so, completely eliminating the
> source of all spurious -EIO.
> 
> Of note, we only have two API entry points where we expect that
> userspace can observe an EIO. The first is when submitting an execbuf:
> if the GPU is terminally wedged, then the operation cannot succeed and
> an -EIO is reported. Secondly, existing userspace uses the throttle
> ioctl to detect an already wedged GPU before starting to use HW
> acceleration (or to confirm that the GPU is wedged after an error
> condition). So if the GPU is wedged when the user calls throttle, also
> report -EIO.
> 
> v2: Split more carefully the change to i915_wait_request() and
> assorted ABI from the reset handling.
> 
> Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
> Cc: Daniel Vetter <daniel.vetter@xxxxxxxx>
> ---
>  drivers/gpu/drm/i915/i915_drv.h         |  2 --
>  drivers/gpu/drm/i915/i915_gem.c         | 44 ++++++++++++++++-----------------
>  drivers/gpu/drm/i915/i915_gem_userptr.c |  6 ++---
>  drivers/gpu/drm/i915/intel_display.c    |  8 +-----
>  drivers/gpu/drm/i915/intel_lrc.c        |  2 +-
>  drivers/gpu/drm/i915/intel_ringbuffer.c |  2 +-
>  6 files changed, 27 insertions(+), 37 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index f30c305a6889..7acbc072973a 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2987,8 +2987,6 @@ i915_gem_find_active_request(struct intel_engine_cs *ring);
>  
>  bool i915_gem_retire_requests(struct drm_device *dev);
>  void i915_gem_retire_requests_ring(struct intel_engine_cs *ring);
> -int __must_check i915_gem_check_wedge(struct i915_gpu_error *error,
> -				      bool interruptible);
>  
>  static inline u32 i915_reset_counter(struct i915_gpu_error *error)
>  {
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index b17cc0e42a4f..f5760869a17c 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -208,11 +208,10 @@ i915_gem_object_put_pages_phys(struct drm_i915_gem_object *obj)
>  	BUG_ON(obj->madv == __I915_MADV_PURGED);
>  
>  	ret = i915_gem_object_set_to_cpu_domain(obj, true);
> -	if (ret) {
> +	if (WARN_ON(ret)) {
>  		/* In the event of a disaster, abandon all caches and
>  		 * hope for the best.
>  		 */
> -		WARN_ON(ret != -EIO);
>  		obj->base.read_domains = obj->base.write_domain = I915_GEM_DOMAIN_CPU;
>  	}
>  
> @@ -1106,15 +1105,13 @@ put_rpm:
>  	return ret;
>  }
>  
> -int
> -i915_gem_check_wedge(struct i915_gpu_error *error,
> -		     bool interruptible)
> +static int
> +i915_gem_check_wedge(unsigned reset_counter, bool interruptible)
>  {
> -	if (i915_reset_in_progress_or_wedged(error)) {
> -		/* Recovery complete, but the reset failed ... */
> -		if (i915_terminally_wedged(error))
> -			return -EIO;
> +	if (__i915_terminally_wedged(reset_counter))
> +		return -EIO;
>  
> +	if (__i915_reset_in_progress(reset_counter)) {
>  		/* Non-interruptible callers can't handle -EAGAIN, hence return
>  		 * -EIO unconditionally for these. */
>  		if (!interruptible)
> @@ -1285,13 +1282,14 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
>  		prepare_to_wait(&ring->irq_queue, &wait, state);
>  
>  		/* We need to check whether any gpu reset happened in between
> -		 * the caller grabbing the seqno and now ... */
> +		 * the request being submitted and now. If a reset has occurred,
> +		 * the request is effectively complete (we either are in the
> +		 * process of or have discarded the rendering and completely
> +		 * reset the GPU. The results of the request are lost and we
> +		 * are free to continue on with the original operation.
> +		 */
>  		if (req->reset_counter != i915_reset_counter(&dev_priv->gpu_error)) {
> -			/* ... but upgrade the -EAGAIN to an -EIO if the gpu
> -			 * is truely gone. */
> -			ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
> -			if (ret == 0)
> -				ret = -EAGAIN;
> +			ret = 0;
>  			break;
>  		}
>  
> @@ -2154,11 +2152,10 @@ i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj)
>  	BUG_ON(obj->madv == __I915_MADV_PURGED);
>  
>  	ret = i915_gem_object_set_to_cpu_domain(obj, true);
> -	if (ret) {
> +	if (WARN_ON(ret)) {
>  		/* In the event of a disaster, abandon all caches and
>  		 * hope for the best.
>  		 */
> -		WARN_ON(ret != -EIO);
>  		i915_gem_clflush_object(obj, true);
>  		obj->base.read_domains = obj->base.write_domain = I915_GEM_DOMAIN_CPU;
>  	}
> @@ -2678,8 +2675,11 @@ int i915_gem_request_alloc(struct intel_engine_cs *ring,
>  
>  	*req_out = NULL;
>  
> -	ret = i915_gem_check_wedge(&dev_priv->gpu_error,
> -				   dev_priv->mm.interruptible);
> +	/* ABI: Before userspace accesses the GPU (e.g. execbuffer), report
> +	 * EIO if the GPU is already wedged, or EAGAIN to drop the struct_mutex
> +	 * and restart.
> +	 */
> +	ret = i915_gem_check_wedge(reset_counter, dev_priv->mm.interruptible);
>  	if (ret)
>  		return ret;
>  
> @@ -4093,9 +4093,9 @@ i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
>  	if (ret)
>  		return ret;
>  
> -	ret = i915_gem_check_wedge(&dev_priv->gpu_error, false);
> -	if (ret)
> -		return ret;
> +	/* ABI: return -EIO if already wedged */
> +	if (i915_terminally_wedged(&dev_priv->gpu_error))
> +		return -EIO;
>  
>  	spin_lock(&file_priv->mm.lock);
>  	list_for_each_entry(request, &file_priv->mm.request_list, client_list) {
> diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
> index 19fb0bddc1cd..1a5f89dba4af 100644
> --- a/drivers/gpu/drm/i915/i915_gem_userptr.c
> +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
> @@ -81,10 +81,8 @@ static void __cancel_userptr__worker(struct work_struct *work)
>  		was_interruptible = dev_priv->mm.interruptible;
>  		dev_priv->mm.interruptible = false;
>  
> -		list_for_each_entry_safe(vma, tmp, &obj->vma_list, vma_link) {
> -			int ret = i915_vma_unbind(vma);
> -			WARN_ON(ret && ret != -EIO);
> -		}
> +		list_for_each_entry_safe(vma, tmp, &obj->vma_list, vma_link)
> +			WARN_ON(i915_vma_unbind(vma));
>  		WARN_ON(i915_gem_object_put_pages(obj));
>  
>  		dev_priv->mm.interruptible = was_interruptible;
> diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
> index d7bbd015de35..875bdf814d73 100644
> --- a/drivers/gpu/drm/i915/intel_display.c
> +++ b/drivers/gpu/drm/i915/intel_display.c
> @@ -13405,10 +13405,6 @@ static int intel_atomic_prepare_commit(struct drm_device *dev,
>  
>  			ret = __i915_wait_request(intel_plane_state->wait_req,
>  						  true, NULL, NULL);
> -
> -			/* Swallow -EIO errors to allow updates during hw lockup. */
> -			if (ret == -EIO)

I'd like to keep a WARN_ON here since it's ABI relevant, and consistent
with the other places we've had a WARN_ON(-EIO).

> -				ret = 0;
>  			if (ret) {
>  				mutex_lock(&dev->struct_mutex);
>  				drm_atomic_helper_cleanup_planes(dev, state);
> @@ -13744,9 +13740,7 @@ intel_prepare_plane_fb(struct drm_plane *plane,
>  	 */
>  	if (needs_modeset(crtc_state))
>  		ret = i915_gem_object_wait_rendering(old_obj, true);
> -
> -	/* Swallow -EIO errors to allow updates during hw lockup. */
> -	if (ret && ret != -EIO)
> +	if (ret)

Same here really. We already have a WARN_ON in the mmio_flip, so we're
covered there already.
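A minimal sketch of what keeping that warning could look like in the
intel_atomic_prepare_commit hunk above, assuming the -EIO is still
swallowed rather than propagated and the surrounding error handling
stays as quoted:

	ret = __i915_wait_request(intel_plane_state->wait_req,
				  true, NULL, NULL);

	/* Sketch only: after this patch -EIO should not normally reach
	 * this point, so make it loud if it does, but still swallow it
	 * so a wedged GPU cannot fail the atomic commit.
	 */
	if (WARN_ON(ret == -EIO))
		ret = 0;

The same pattern would fit the i915_gem_object_wait_rendering error
handling in intel_prepare_plane_fb.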
Otherwise looks good, and with the above addressed and under the
assumption that the last caller of check_wedge is in request_alloc
(which I was too lazy to check perfectly):

Reviewed-by: Daniel Vetter <daniel.vetter@xxxxxxxx>

>  		return ret;
>  }
>  
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 69b298eed69d..7030ff526941 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -976,7 +976,7 @@ void intel_logical_ring_stop(struct intel_engine_cs *ring)
>  		return;
>  
>  	ret = intel_ring_idle(ring);
> -	if (ret && !i915_reset_in_progress_or_wedged(&to_i915(ring->dev)->gpu_error))
> +	if (ret)
>  		DRM_ERROR("failed to quiesce %s whilst cleaning up: %d\n",
>  			  ring->name, ret);
>  
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 942f86b316c2..6cecc15ec01b 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -3062,7 +3062,7 @@ intel_stop_ring_buffer(struct intel_engine_cs *ring)
>  		return;
>  
>  	ret = intel_ring_idle(ring);
> -	if (ret && !i915_reset_in_progress_or_wedged(&to_i915(ring->dev)->gpu_error))
> +	if (ret)
>  		DRM_ERROR("failed to quiesce %s whilst cleaning up: %d\n",
>  			  ring->name, ret);
>  
> --
> 2.6.3
> 

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
http://lists.freedesktop.org/mailman/listinfo/intel-gfx