When the system idles, we switch to the kernel context as a defensive measure (no users are harmed if the kernel context is lost). Currently, we issue a switch to kernel context and then come back later to see if the kernel context is still current and the system is idle. However, if we are no longer privy to the runqueue ordering, then we have to relax our assumptions about the logical state of the GPU and the only way to ensure that the kernel context is currently loaded is by issuing a request to run after all others, and wait for it to complete all while preventing anyone else from issuing their own requests. Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> --- drivers/gpu/drm/i915/i915_drv.c | 14 +-- drivers/gpu/drm/i915/i915_drv.h | 2 +- drivers/gpu/drm/i915/i915_gem.c | 143 ++++++++-------------- drivers/gpu/drm/i915/i915_gem_context.c | 4 + drivers/gpu/drm/i915/selftests/i915_gem.c | 9 +- 5 files changed, 65 insertions(+), 107 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index f5a3558e00fd..36da8ab1e7ce 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -712,8 +712,7 @@ static int i915_load_modeset_init(struct drm_device *dev) return 0; cleanup_gem: - if (i915_gem_suspend(dev_priv)) - DRM_ERROR("failed to idle hardware; continuing to unload!\n"); + i915_gem_suspend(dev_priv); i915_gem_fini(dev_priv); cleanup_modeset: intel_modeset_cleanup(dev); @@ -1784,8 +1783,7 @@ void i915_driver_unload(struct drm_device *dev) /* Flush any external code that still may be under the RCU lock */ synchronize_rcu(); - if (i915_gem_suspend(dev_priv)) - DRM_ERROR("failed to idle hardware; continuing to unload!\n"); + i915_gem_suspend(dev_priv); drm_atomic_helper_shutdown(dev); @@ -1893,7 +1891,6 @@ static bool suspend_to_idle(struct drm_i915_private *dev_priv) static int i915_drm_prepare(struct drm_device *dev) { struct drm_i915_private *i915 = to_i915(dev); - int err; /* * NB intel_display_suspend() may issue new requests after we've @@ -1901,12 +1898,9 @@ static int i915_drm_prepare(struct drm_device *dev) * split out that work and pull it forward so that after point, * the GPU is not woken again. */ - err = i915_gem_suspend(i915); - if (err) - dev_err(&i915->drm.pdev->dev, - "GEM idle failed, suspend/resume might fail\n"); + i915_gem_suspend(i915); - return err; + return 0; } static int i915_drm_suspend(struct drm_device *dev) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 4d697b1002af..8a72dad9471f 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -3031,7 +3031,7 @@ void i915_gem_fini(struct drm_i915_private *dev_priv); void i915_gem_cleanup_engines(struct drm_i915_private *dev_priv); int i915_gem_wait_for_idle(struct drm_i915_private *dev_priv, unsigned int flags, long timeout); -int __must_check i915_gem_suspend(struct drm_i915_private *dev_priv); +void i915_gem_suspend(struct drm_i915_private *dev_priv); void i915_gem_suspend_late(struct drm_i915_private *dev_priv); void i915_gem_resume(struct drm_i915_private *dev_priv); vm_fault_t i915_gem_fault(struct vm_fault *vmf); diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 89b2d3ac26ce..43bc26d5807a 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2930,13 +2930,6 @@ static void __sleep_rcu(struct rcu_head *rcu) } } -static inline bool -new_requests_since_last_retire(const struct drm_i915_private *i915) -{ - return (READ_ONCE(i915->gt.active_requests) || - work_pending(&i915->gt.idle_work.work)); -} - static void assert_kernel_context_is_current(struct drm_i915_private *i915) { struct intel_engine_cs *engine; @@ -2945,7 +2938,8 @@ static void assert_kernel_context_is_current(struct drm_i915_private *i915) if (i915_terminally_wedged(&i915->gpu_error)) return; - GEM_BUG_ON(i915->gt.active_requests); + i915_retire_requests(i915); + for_each_engine(engine, i915, id) { GEM_BUG_ON(__i915_active_request_peek(&engine->timeline.last_request)); GEM_BUG_ON(engine->last_retired_context != @@ -2953,78 +2947,76 @@ static void assert_kernel_context_is_current(struct drm_i915_private *i915) } } +static bool switch_to_kernel_context_sync(struct drm_i915_private *i915) +{ + if (i915_gem_switch_to_kernel_context(i915)) + return false; + + if (i915_gem_wait_for_idle(i915, + I915_WAIT_LOCKED | + I915_WAIT_FOR_IDLE_BOOST, + HZ / 10)) + return false; + + assert_kernel_context_is_current(i915); + return true; +} + static void i915_gem_idle_work_handler(struct work_struct *work) { - struct drm_i915_private *dev_priv = - container_of(work, typeof(*dev_priv), gt.idle_work.work); + struct drm_i915_private *i915 = + container_of(work, typeof(*i915), gt.idle_work.work); + typeof(i915->gt) *gt = &i915->gt; unsigned int epoch = I915_EPOCH_INVALID; bool rearm_hangcheck; - if (!READ_ONCE(dev_priv->gt.awake)) + if (!READ_ONCE(gt->awake)) return; - if (READ_ONCE(dev_priv->gt.active_requests)) + if (READ_ONCE(gt->active_requests)) return; - /* - * Flush out the last user context, leaving only the pinned - * kernel context resident. When we are idling on the kernel_context, - * no more new requests (with a context switch) are emitted and we - * can finally rest. A consequence is that the idle work handler is - * always called at least twice before idling (and if the system is - * idle that implies a round trip through the retire worker). - */ - mutex_lock(&dev_priv->drm.struct_mutex); - i915_gem_switch_to_kernel_context(dev_priv); - mutex_unlock(&dev_priv->drm.struct_mutex); - - GEM_TRACE("active_requests=%d (after switch-to-kernel-context)\n", - READ_ONCE(dev_priv->gt.active_requests)); - - /* - * Wait for last execlists context complete, but bail out in case a - * new request is submitted. As we don't trust the hardware, we - * continue on if the wait times out. This is necessary to allow - * the machine to suspend even if the hardware dies, and we will - * try to recover in resume (after depriving the hardware of power, - * it may be in a better mmod). - */ - __wait_for(if (new_requests_since_last_retire(dev_priv)) return, - intel_engines_are_idle(dev_priv), - I915_IDLE_ENGINES_TIMEOUT * 1000, - 10, 500); - rearm_hangcheck = - cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work); + cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work); - if (!mutex_trylock(&dev_priv->drm.struct_mutex)) { + if (!mutex_trylock(&i915->drm.struct_mutex)) { /* Currently busy, come back later */ - mod_delayed_work(dev_priv->wq, - &dev_priv->gt.idle_work, + mod_delayed_work(i915->wq, + >->idle_work, msecs_to_jiffies(50)); goto out_rearm; } /* - * New request retired after this work handler started, extend active - * period until next instance of the work. + * Flush out the last user context, leaving only the pinned + * kernel context resident. Should anything unfortunate happen + * while we are idle (such as the GPU being power cycled), no users + * will be harmed. */ - if (new_requests_since_last_retire(dev_priv)) - goto out_unlock; - - epoch = __i915_gem_park(dev_priv); + if (!gt->active_requests && !work_pending(>->idle_work.work)) { + ++gt->active_requests; /* don't requeue idle */ + + if (!switch_to_kernel_context_sync(i915)) { + dev_err(i915->drm.dev, + "Failed to idle engines, declaring wedged!\n"); + GEM_TRACE_DUMP(); + i915_gem_set_wedged(i915); + } + i915_retire_requests(i915); - assert_kernel_context_is_current(dev_priv); + if (!--gt->active_requests) { + epoch = __i915_gem_park(i915); + rearm_hangcheck = false; + } + } - rearm_hangcheck = false; -out_unlock: - mutex_unlock(&dev_priv->drm.struct_mutex); + mutex_unlock(&i915->drm.struct_mutex); out_rearm: if (rearm_hangcheck) { - GEM_BUG_ON(!dev_priv->gt.awake); - i915_queue_hangcheck(dev_priv); + GEM_BUG_ON(!gt->awake); + i915_queue_hangcheck(i915); } /* @@ -3035,11 +3027,11 @@ i915_gem_idle_work_handler(struct work_struct *work) * period, and then queue a task (that will run last on the wq) to * shrink and re-optimize the caches. */ - if (same_epoch(dev_priv, epoch)) { + if (same_epoch(i915, epoch)) { struct sleep_rcu_work *s = kmalloc(sizeof(*s), GFP_KERNEL); if (s) { init_rcu_head(&s->rcu); - s->i915 = dev_priv; + s->i915 = i915; s->epoch = epoch; call_rcu(&s->rcu, __sleep_rcu); } @@ -3249,7 +3241,6 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915, return err; i915_retire_requests(i915); - GEM_BUG_ON(i915->gt.active_requests); } return 0; @@ -4458,10 +4449,9 @@ void i915_gem_sanitize(struct drm_i915_private *i915) mutex_unlock(&i915->drm.struct_mutex); } -int i915_gem_suspend(struct drm_i915_private *i915) +void i915_gem_suspend(struct drm_i915_private *i915) { intel_wakeref_t wakeref; - int ret; GEM_TRACE("\n"); @@ -4481,19 +4471,7 @@ int i915_gem_suspend(struct drm_i915_private *i915) * state. Fortunately, the kernel_context is disposable and we do * not rely on its state. */ - if (!i915_terminally_wedged(&i915->gpu_error)) { - ret = i915_gem_switch_to_kernel_context(i915); - if (ret) - goto err_unlock; - - ret = i915_gem_wait_for_idle(i915, - I915_WAIT_INTERRUPTIBLE | - I915_WAIT_LOCKED | - I915_WAIT_FOR_IDLE_BOOST, - HZ / 5); - if (ret == -EINTR) - goto err_unlock; - + if (!switch_to_kernel_context_sync(i915)) { /* Forcibly cancel outstanding work and leave the gpu quiet. */ i915_gem_set_wedged(i915); } @@ -4517,12 +4495,6 @@ int i915_gem_suspend(struct drm_i915_private *i915) GEM_BUG_ON(i915->gt.awake); intel_runtime_pm_put(i915, wakeref); - return 0; - -err_unlock: - mutex_unlock(&i915->drm.struct_mutex); - intel_runtime_pm_put(i915, wakeref); - return ret; } void i915_gem_suspend_late(struct drm_i915_private *i915) @@ -4788,18 +4760,11 @@ static int __intel_engines_record_defaults(struct drm_i915_private *i915) goto err_active; } - err = i915_gem_switch_to_kernel_context(i915); - if (err) - goto err_active; - - if (i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED, HZ / 5)) { - i915_gem_set_wedged(i915); + if (!switch_to_kernel_context_sync(i915)) { err = -EIO; /* Caller will declare us wedged */ goto err_active; } - assert_kernel_context_is_current(i915); - /* * Immediately park the GPU so that we enable powersaving and * treat it as idle. The next time we issue a request, we will @@ -5043,7 +5008,7 @@ int i915_gem_init(struct drm_i915_private *dev_priv) err_init_hw: mutex_unlock(&dev_priv->drm.struct_mutex); - WARN_ON(i915_gem_suspend(dev_priv)); + i915_gem_suspend(dev_priv); i915_gem_suspend_late(dev_priv); i915_gem_drain_workqueue(dev_priv); diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index 280813a4bf82..1bdb067845f2 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -745,6 +745,10 @@ int i915_gem_switch_to_kernel_context(struct drm_i915_private *i915) lockdep_assert_held(&i915->drm.struct_mutex); GEM_BUG_ON(!i915->kernel_context); + /* Inoperable, so presume the GPU is safely pointing into the void! */ + if (i915_terminally_wedged(&i915->gpu_error)) + return 0; + i915_retire_requests(i915); for_each_engine(engine, i915, id) { diff --git a/drivers/gpu/drm/i915/selftests/i915_gem.c b/drivers/gpu/drm/i915/selftests/i915_gem.c index e77b7ed449ae..50bb7bbd26d3 100644 --- a/drivers/gpu/drm/i915/selftests/i915_gem.c +++ b/drivers/gpu/drm/i915/selftests/i915_gem.c @@ -84,14 +84,9 @@ static void simulate_hibernate(struct drm_i915_private *i915) static int pm_prepare(struct drm_i915_private *i915) { - int err = 0; - - if (i915_gem_suspend(i915)) { - pr_err("i915_gem_suspend failed\n"); - err = -EINVAL; - } + i915_gem_suspend(i915); - return err; + return 0; } static void pm_suspend(struct drm_i915_private *i915) -- 2.20.1 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx