Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> writes: > If the preempted context takes too long to relinquish control, e.g. it > is stuck inside a shader with arbitration disabled, evict that context > with an engine reset. This ensures that preemptions are reasonably > responsive, providing a tighter QoS for the more important context at > the cost of flagging unresponsive contexts more frequently (i.e. instead > of using an ~10s hangcheck, we now evict at ~10ms). The challenge of > lies in picking a timeout that can be reasonably serviced by HW for > typical workloads, balancing the existing clients against the needs for > responsiveness. > > Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> > Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxxxxxxxx> > Cc: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> > --- > drivers/gpu/drm/i915/Kconfig.profile | 12 ++++++ > drivers/gpu/drm/i915/gt/intel_lrc.c | 56 ++++++++++++++++++++++++++-- > 2 files changed, 65 insertions(+), 3 deletions(-) > > diff --git a/drivers/gpu/drm/i915/Kconfig.profile b/drivers/gpu/drm/i915/Kconfig.profile > index 48df8889a88a..8273d3baafe4 100644 > --- a/drivers/gpu/drm/i915/Kconfig.profile > +++ b/drivers/gpu/drm/i915/Kconfig.profile > @@ -25,3 +25,15 @@ config DRM_I915_SPIN_REQUEST > May be 0 to disable the initial spin. In practice, we estimate > the cost of enabling the interrupt (if currently disabled) to be > a few microseconds. > + > +config DRM_I915_PREEMPT_TIMEOUT > + int "Preempt timeout (ms)" > + default 10 # milliseconds > + help > + How long to wait (in milliseconds) for a preemption event to occur > + when submitting a new context via execlists. If the current context > + does not hit an arbitration point and yield to HW before the timer > + expires, the HW will be reset to allow the more important context > + to execute. > + > + May be 0 to disable the timeout. > diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c > index fca79adb4aa3..e8d7deba3e49 100644 > --- a/drivers/gpu/drm/i915/gt/intel_lrc.c > +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c > @@ -889,6 +889,15 @@ enable_timeslice(struct intel_engine_cs *engine) > return last && need_timeslice(engine, last); > } > > +static unsigned long preempt_expires(void) > +{ > + unsigned long timeout = could be const > + msecs_to_jiffies_timeout(CONFIG_DRM_I915_PREEMPT_TIMEOUT); > + > + barrier(); This needs a comment. I fail to connect the dots as jiffies is volatile by nature. > + return jiffies + timeout; > +} > + > static void execlists_dequeue(struct intel_engine_cs *engine) > { > struct intel_engine_execlists * const execlists = &engine->execlists; > @@ -1220,6 +1229,9 @@ static void execlists_dequeue(struct intel_engine_cs *engine) > *port = execlists_schedule_in(last, port - execlists->pending); > memset(port + 1, 0, (last_port - port) * sizeof(*port)); > execlists_submit_ports(engine); > + > + if (CONFIG_DRM_I915_PREEMPT_TIMEOUT) > + mod_timer(&execlists->timer, preempt_expires()); > } > } > > @@ -1376,13 +1388,48 @@ static void process_csb(struct intel_engine_cs *engine) > invalidate_csb_entries(&buf[0], &buf[num_entries - 1]); > } > > -static void __execlists_submission_tasklet(struct intel_engine_cs *const engine) > +static bool __execlists_submission_tasklet(struct intel_engine_cs *const engine) > { > lockdep_assert_held(&engine->active.lock); > > process_csb(engine); > - if (!engine->execlists.pending[0]) > + if (!engine->execlists.pending[0]) { > execlists_dequeue(engine); > + return true; > + } > + > + return false; > +} > + > +static void preempt_reset(struct intel_engine_cs *engine) > +{ > + const unsigned int bit = I915_RESET_ENGINE + engine->id; > + unsigned long *lock = &engine->i915->gpu_error.flags; > + > + if (test_and_set_bit(bit, lock)) > + return; > + > + tasklet_disable_nosync(&engine->execlists.tasklet); > + spin_unlock(&engine->active.lock); > + Why do we need to drop the lock? -Mika > + i915_reset_engine(engine, "preemption time out"); > + > + spin_lock(&engine->active.lock); > + tasklet_enable(&engine->execlists.tasklet); > + > + clear_bit(bit, lock); > + wake_up_bit(lock, bit); > +} > + > +static bool preempt_timeout(struct intel_engine_cs *const engine) > +{ > + if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT) > + return false; > + > + if (!intel_engine_has_preemption(engine)) > + return false; > + > + return !timer_pending(&engine->execlists.timer); > } > > /* > @@ -1395,7 +1442,10 @@ static void execlists_submission_tasklet(unsigned long data) > unsigned long flags; > > spin_lock_irqsave(&engine->active.lock, flags); > - __execlists_submission_tasklet(engine); > + > + if (!__execlists_submission_tasklet(engine) && preempt_timeout(engine)) > + preempt_reset(engine); > + > spin_unlock_irqrestore(&engine->active.lock, flags); > } > > -- > 2.20.1 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx