Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
Cc: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>
---
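Notes: idling the GPU for suspend is now best effort. Rather than
propagating an error that no caller can meaningfully act upon, a
failure to switch to the kernel context and quiesce the engines wedges
the device, and we recover by resetting on resume. This lets
i915_gem_suspend() become void and folds the three open-coded
switch-then-wait sequences (idle worker, suspend, recording engine
defaults) into a single switch_to_kernel_context_sync() helper.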
drivers/gpu/drm/i915/i915_drv.c | 14 +-
drivers/gpu/drm/i915/i915_drv.h | 2 +-
drivers/gpu/drm/i915/i915_gem.c | 154 +++++++++-------------
drivers/gpu/drm/i915/i915_gem_context.c | 4 +
drivers/gpu/drm/i915/selftests/i915_gem.c | 9 +-
5 files changed, 73 insertions(+), 110 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 777a9a19414d..0d743907e7bc 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -714,8 +714,7 @@ static int i915_load_modeset_init(struct drm_device *dev)
return 0;
cleanup_gem:
- if (i915_gem_suspend(dev_priv))
- DRM_ERROR("failed to idle hardware; continuing to unload!\n");
+ i915_gem_suspend(dev_priv);
i915_gem_fini(dev_priv);
cleanup_modeset:
intel_modeset_cleanup(dev);
@@ -1900,8 +1899,7 @@ void i915_driver_unload(struct drm_device *dev)
/* Flush any external code that still may be under the RCU lock */
synchronize_rcu();
- if (i915_gem_suspend(dev_priv))
- DRM_ERROR("failed to idle hardware; continuing to unload!\n");
+ i915_gem_suspend(dev_priv);
drm_atomic_helper_shutdown(dev);
@@ -2009,7 +2007,6 @@ static bool suspend_to_idle(struct drm_i915_private *dev_priv)
static int i915_drm_prepare(struct drm_device *dev)
{
struct drm_i915_private *i915 = to_i915(dev);
- int err;
/*
* NB intel_display_suspend() may issue new requests after we've
@@ -2017,12 +2014,9 @@ static int i915_drm_prepare(struct drm_device *dev)
* split out that work and pull it forward so that after that point,
* the GPU is not woken again.
*/
- err = i915_gem_suspend(i915);
- if (err)
- dev_err(&i915->drm.pdev->dev,
- "GEM idle failed, suspend/resume might fail\n");
+ i915_gem_suspend(i915);
- return err;
+ return 0;
}
static int i915_drm_suspend(struct drm_device *dev)
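The i915_drm_prepare() change is the one with system-wide consequences:
a non-zero return from a ->prepare() callback makes the PM core abort
the entire suspend transition. A sketch of the wiring this relies on
(hedged: the .prepare/i915_pm_prepare names are assumed from this tree
and are not shown in the diff):

    static const struct dev_pm_ops i915_pm_ops = {
            /*
             * i915_pm_prepare() forwards to i915_drm_prepare() above;
             * returning 0 unconditionally means a GEM idling failure
             * can no longer veto suspend - the device is wedged and
             * reset on resume instead.
             */
            .prepare = i915_pm_prepare,
            /* remaining callbacks unchanged by this patch */
    };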
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index a5b314a0c415..b8a5281d8adf 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3032,7 +3032,7 @@ void i915_gem_fini(struct drm_i915_private *dev_priv);
void i915_gem_cleanup_engines(struct drm_i915_private *dev_priv);
int i915_gem_wait_for_idle(struct drm_i915_private *dev_priv,
unsigned int flags, long timeout);
-int __must_check i915_gem_suspend(struct drm_i915_private *dev_priv);
+void i915_gem_suspend(struct drm_i915_private *dev_priv);
void i915_gem_suspend_late(struct drm_i915_private *dev_priv);
void i915_gem_resume(struct drm_i915_private *dev_priv);
vm_fault_t i915_gem_fault(struct vm_fault *vmf);
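With the __must_check int return gone from the declaration, a caller
that still wants to know whether idling succeeded can consult the
wedged state after the fact. A minimal sketch of that pattern (hedged:
the diagnostic is illustrative; i915_terminally_wedged(i915) is taken
from the i915_gem_context.c hunk below):

    i915_gem_suspend(i915);
    if (i915_terminally_wedged(i915))
            /* idling failed; the GPU stays wedged until resume/reset */
            DRM_DEBUG_DRIVER("GPU wedged on suspend\n");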
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index df2f4f65c2a4..f22de3b5a1f3 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2828,13 +2828,6 @@ i915_gem_retire_work_handler(struct work_struct *work)
round_jiffies_up_relative(HZ));
}
-static inline bool
-new_requests_since_last_retire(const struct drm_i915_private *i915)
-{
- return (READ_ONCE(i915->gt.active_requests) ||
- work_pending(&i915->gt.idle_work.work));
-}
-
static void assert_kernel_context_is_current(struct drm_i915_private *i915)
{
struct intel_engine_cs *engine;
@@ -2843,7 +2836,8 @@ static void assert_kernel_context_is_current(struct drm_i915_private *i915)
if (i915_reset_failed(i915))
return;
- GEM_BUG_ON(i915->gt.active_requests);
+ i915_retire_requests(i915);
+
for_each_engine(engine, i915, id) {
GEM_BUG_ON(__i915_active_request_peek(&engine->timeline.last_request));
GEM_BUG_ON(engine->last_retired_context !=
@@ -2851,77 +2845,86 @@ static void assert_kernel_context_is_current(struct drm_i915_private *i915)
}
}
+static bool switch_to_kernel_context_sync(struct drm_i915_private *i915)
+{
+ bool result = true;
+
+ /*
+ * Even if we fail to switch, give whatever is running a small chance
+ * to save itself before we report the failure. Yes, this may be a
+ * false positive due to e.g. ENOMEM, caveat emptor!
+ */
+ if (i915_gem_switch_to_kernel_context(i915))
+ result = false;
+
+ if (i915_gem_wait_for_idle(i915,
+ I915_WAIT_LOCKED |
+ I915_WAIT_FOR_IDLE_BOOST,
+ I915_GEM_IDLE_TIMEOUT))
+ result = false;
+
+ if (result) {
+ assert_kernel_context_is_current(i915);
+ } else {
+ /* Forcibly cancel outstanding work and leave the gpu quiet. */
+ dev_err(i915->drm.dev,
+ "Failed to idle engines, declaring wedged!\n");
+ GEM_TRACE_DUMP();
+ i915_gem_set_wedged(i915);
+ }
+
+ i915_retire_requests(i915); /* ensure we flush after wedging */
+ return result;
+}
+
static void
i915_gem_idle_work_handler(struct work_struct *work)
{
- struct drm_i915_private *dev_priv =
- container_of(work, typeof(*dev_priv), gt.idle_work.work);
+ struct drm_i915_private *i915 =
+ container_of(work, typeof(*i915), gt.idle_work.work);
bool rearm_hangcheck;
- if (!READ_ONCE(dev_priv->gt.awake))
+ if (!READ_ONCE(i915->gt.awake))
return;
- if (READ_ONCE(dev_priv->gt.active_requests))
+ if (READ_ONCE(i915->gt.active_requests))
return;
- /*
- * Flush out the last user context, leaving only the pinned
- * kernel context resident. When we are idling on the kernel_context,
- * no more new requests (with a context switch) are emitted and we
- * can finally rest. A consequence is that the idle work handler is
- * always called at least twice before idling (and if the system is
- * idle that implies a round trip through the retire worker).
- */
- mutex_lock(&dev_priv->drm.struct_mutex);
- i915_gem_switch_to_kernel_context(dev_priv);
- mutex_unlock(&dev_priv->drm.struct_mutex);
-
- GEM_TRACE("active_requests=%d (after switch-to-kernel-context)\n",
- READ_ONCE(dev_priv->gt.active_requests));
-
- /*
- * Wait for last execlists context complete, but bail out in case a
- * new request is submitted. As we don't trust the hardware, we
- * continue on if the wait times out. This is necessary to allow
- * the machine to suspend even if the hardware dies, and we will
- * try to recover in resume (after depriving the hardware of power,
- * it may be in a better mood).
- */
- __wait_for(if (new_requests_since_last_retire(dev_priv)) return,
- intel_engines_are_idle(dev_priv),
- I915_IDLE_ENGINES_TIMEOUT * 1000,
- 10, 500);
-
rearm_hangcheck =
- cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
+ cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);
- if (!mutex_trylock(&dev_priv->drm.struct_mutex)) {
+ if (!mutex_trylock(&i915->drm.struct_mutex)) {
/* Currently busy, come back later */
- mod_delayed_work(dev_priv->wq,
- &dev_priv->gt.idle_work,
+ mod_delayed_work(i915->wq,
+ &i915->gt.idle_work,
msecs_to_jiffies(50));
goto out_rearm;
}
/*
- * New request retired after this work handler started, extend active
- * period until next instance of the work.
+ * Flush out the last user context, leaving only the pinned
+ * kernel context resident. Should anything unfortunate happen
+ * while we are idle (such as the GPU being power cycled), no users
+ * will be harmed.
*/
- if (new_requests_since_last_retire(dev_priv))
- goto out_unlock;
+ if (!work_pending(&i915->gt.idle_work.work) &&
+ !i915->gt.active_requests) {
+ ++i915->gt.active_requests; /* don't requeue idle */
- __i915_gem_park(dev_priv);
+ switch_to_kernel_context_sync(i915);
- assert_kernel_context_is_current(dev_priv);
+ if (!--i915->gt.active_requests) {
+ __i915_gem_park(i915);
+ rearm_hangcheck = false;
+ }
+ }
- rearm_hangcheck = false;
-out_unlock:
- mutex_unlock(&dev_priv->drm.struct_mutex);
+ mutex_unlock(&i915->drm.struct_mutex);
out_rearm:
if (rearm_hangcheck) {
- GEM_BUG_ON(!dev_priv->gt.awake);
- i915_queue_hangcheck(dev_priv);
+ GEM_BUG_ON(!i915->gt.awake);
+ i915_queue_hangcheck(i915);
}
}
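The subtle step in the rewritten idle worker above is the temporary
bump of gt.active_requests: the kernel-context switch emitted by
switch_to_kernel_context_sync() is itself a request, and without the
guard its retirement would simply re-queue the idle work. It is also
why the GEM_BUG_ON(i915->gt.active_requests) is dropped from
i915_gem_wait_for_idle() in the next hunk, as the count is now
deliberately non-zero while the idle worker waits. Extracted for
emphasis (same code as the hunk above, comments added):

    if (!work_pending(&i915->gt.idle_work.work) &&
        !i915->gt.active_requests) {
            ++i915->gt.active_requests; /* don't requeue idle */

            /* emits, waits upon and retires the final request */
            switch_to_kernel_context_sync(i915);

            if (!--i915->gt.active_requests) {
                    /* still idle: park the GPU, stop hangcheck */
                    __i915_gem_park(i915);
                    rearm_hangcheck = false;
            }
    }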
@@ -3128,7 +3131,6 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915,
return err;
i915_retire_requests(i915);
- GEM_BUG_ON(i915->gt.active_requests);
}
return 0;
@@ -4340,10 +4342,9 @@ void i915_gem_sanitize(struct drm_i915_private *i915)
mutex_unlock(&i915->drm.struct_mutex);
}
-int i915_gem_suspend(struct drm_i915_private *i915)
+void i915_gem_suspend(struct drm_i915_private *i915)
{
intel_wakeref_t wakeref;
- int ret;
GEM_TRACE("\n");
@@ -4363,23 +4364,7 @@ int i915_gem_suspend(struct drm_i915_private *i915)
* state. Fortunately, the kernel_context is disposable and we do
* not rely on its state.
*/
- if (!i915_reset_failed(i915)) {
- ret = i915_gem_switch_to_kernel_context(i915);
- if (ret)
- goto err_unlock;
-
- ret = i915_gem_wait_for_idle(i915,
- I915_WAIT_INTERRUPTIBLE |
- I915_WAIT_LOCKED |
- I915_WAIT_FOR_IDLE_BOOST,
- I915_GEM_IDLE_TIMEOUT);
- if (ret == -EINTR)
- goto err_unlock;
-
- /* Forcibly cancel outstanding work and leave the gpu quiet. */
- i915_gem_set_wedged(i915);
- }
- i915_retire_requests(i915); /* ensure we flush after wedging */
+ switch_to_kernel_context_sync(i915);
mutex_unlock(&i915->drm.struct_mutex);
i915_reset_flush(i915);
@@ -4399,12 +4384,6 @@ int i915_gem_suspend(struct drm_i915_private *i915)
GEM_BUG_ON(i915->gt.awake);
intel_runtime_pm_put(i915, wakeref);
- return 0;
-
-err_unlock:
- mutex_unlock(&i915->drm.struct_mutex);
- intel_runtime_pm_put(i915, wakeref);
- return ret;
}
void i915_gem_suspend_late(struct drm_i915_private *i915)
@@ -4670,20 +4649,11 @@ static int __intel_engines_record_defaults(struct drm_i915_private *i915)
goto err_active;
}
- err = i915_gem_switch_to_kernel_context(i915);
- if (err)
- goto err_active;
-
- if (i915_gem_wait_for_idle(i915,
- I915_WAIT_LOCKED,
- I915_GEM_IDLE_TIMEOUT)) {
- i915_gem_set_wedged(i915);
+ if (!switch_to_kernel_context_sync(i915)) {
err = -EIO; /* Caller will declare us wedged */
goto err_active;
}
- assert_kernel_context_is_current(i915);
-
/*
* Immediately park the GPU so that we enable powersaving and
* treat it as idle. The next time we issue a request, we will
@@ -4927,7 +4897,7 @@ int i915_gem_init(struct drm_i915_private *dev_priv)
err_init_hw:
mutex_unlock(&dev_priv->drm.struct_mutex);
- WARN_ON(i915_gem_suspend(dev_priv));
+ i915_gem_suspend(dev_priv);
i915_gem_suspend_late(dev_priv);
i915_gem_drain_workqueue(dev_priv);
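Net effect in i915_gem.c: the idle worker, i915_gem_suspend() and
__intel_engines_record_defaults() now share
switch_to_kernel_context_sync(), which wedges the device on failure.
The only caller that still reports an error is
__intel_engines_record_defaults(), which converts a failed switch into
-EIO so that i915_gem_init() can declare the driver wedged.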
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index b9f321947982..9a3eb4f66d85 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -767,6 +767,10 @@ int i915_gem_switch_to_kernel_context(struct drm_i915_private *i915)
lockdep_assert_held(&i915->drm.struct_mutex);
GEM_BUG_ON(!i915->kernel_context);
+ /* Inoperable, so presume the GPU is safely pointing into the void! */
+ if (i915_terminally_wedged(i915))
+ return 0;
+
i915_retire_requests(i915);
for_each_engine(engine, i915, id) {
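The early return above keeps switch_to_kernel_context_sync() well
defined during teardown: on a terminally wedged device there is
nothing to switch (all outstanding requests are force-completed), so
the switch reports success, i915_gem_wait_for_idle() completes
immediately, and assert_kernel_context_is_current() already bails out
early via i915_reset_failed().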
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem.c b/drivers/gpu/drm/i915/selftests/i915_gem.c
index e77b7ed449ae..50bb7bbd26d3 100644
--- a/drivers/gpu/drm/i915/selftests/i915_gem.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem.c
@@ -84,14 +84,9 @@ static void simulate_hibernate(struct drm_i915_private *i915)
static int pm_prepare(struct drm_i915_private *i915)
{
- int err = 0;
-
- if (i915_gem_suspend(i915)) {
- pr_err("i915_gem_suspend failed\n");
- err = -EINVAL;
- }
+ i915_gem_suspend(i915);
- return err;
+ return 0;
}
static void pm_suspend(struct drm_i915_private *i915)