Quoting jeff.mcgee@xxxxxxxxx (2018-03-16 18:30:57) > From: Jeff McGee <jeff.mcgee@xxxxxxxxx> > > Force preemption uses engine reset to enforce a limit on the time > that a request targeted for preemption can block. This feature is > a requirement in automotive systems where the GPU may be shared by > clients of critically high priority and clients of low priority that > may not have been curated to be preemption friendly. There may be > more general applications of this feature. I'm sharing as an RFC to > stimulate that discussion and also to get any technical feedback > that I can before submitting to the product kernel that needs this. > I have developed the patches for ease of rebase, given that this is > for the moment considered a non-upstreamable feature. It would be > possible to refactor hangcheck to fully incorporate force preemption > as another tier of patience (or impatience) with the running request. Last night I spent 15mins and wrote a similar RFC as Joonas keep muttering and I thought he was looking for code not that was some... The real only difference is the approach to handling the pending timer. I do not agree with your reset changes as they appear to percolate even more uncertainity through the code. If the preempt-context is in flight when the reset is triggered, the last active request is still defined by incomplete request; the new request has not been submitted. This is serialised by checking for EXECLISTS_ACTIVE_PREEMPT. Up until the point that is cleared and the next request submitted; we should reset the engine and resubmit. The race in tasklet handling is immaterial at that point, you have already paid the cost in using a timer, the workqueue and serialising for the reset. diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 76ea1da923bd..d9da68efbc86 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -3969,8 +3969,8 @@ i915_wedged_set(void *data, u64 val) engine->hangcheck.stalled = true; } - i915_handle_error(i915, val, "Manually set wedged engine mask = %llx", - val); + i915_handle_error(i915, val, I915_ERROR_CAPTURE, + "Manually set wedged engine mask = %llx", val); wait_on_bit(&i915->gpu_error.flags, I915_RESET_HANDOFF, diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 26c1dd4b542e..d817b5e55c96 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2755,10 +2755,12 @@ static inline void i915_queue_hangcheck(struct drm_i915_private *dev_priv) &dev_priv->gpu_error.hangcheck_work, delay); } -__printf(3, 4) +__printf(4, 5) void i915_handle_error(struct drm_i915_private *dev_priv, u32 engine_mask, + unsigned long flags, const char *fmt, ...); +#define I915_ERROR_CAPTURE BIT(0) extern void intel_irq_init(struct drm_i915_private *dev_priv); extern void intel_irq_fini(struct drm_i915_private *dev_priv); diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index 5aaf356fd27f..0a255ffe4c51 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -2593,6 +2593,7 @@ static void i915_clear_error_registers(struct drm_i915_private *dev_priv) * i915_handle_error - handle a gpu error * @dev_priv: i915 device private * @engine_mask: mask representing engines that are hung + * @flags: control flags * @fmt: Error message format string * * Do some basic checking of register state at error time and @@ -2603,16 +2604,11 @@ static void i915_clear_error_registers(struct drm_i915_private *dev_priv) */ void i915_handle_error(struct drm_i915_private *dev_priv, u32 engine_mask, + unsigned long flags, const char *fmt, ...) { struct intel_engine_cs *engine; unsigned int tmp; - va_list args; - char error_msg[80]; - - va_start(args, fmt); - vscnprintf(error_msg, sizeof(error_msg), fmt, args); - va_end(args); /* * In most cases it's guaranteed that we get here with an RPM @@ -2624,8 +2620,18 @@ void i915_handle_error(struct drm_i915_private *dev_priv, intel_runtime_pm_get(dev_priv); engine_mask &= DEVICE_INFO(dev_priv)->ring_mask; - i915_capture_error_state(dev_priv, engine_mask, error_msg); - i915_clear_error_registers(dev_priv); + + if (flags & I915_ERROR_CAPTURE) { + char error_msg[80]; + va_list args; + + va_start(args, fmt); + vscnprintf(error_msg, sizeof(error_msg), fmt, args); + va_end(args); + + i915_capture_error_state(dev_priv, engine_mask, error_msg); + i915_clear_error_registers(dev_priv); + } /* * Try engine reset when available. We fall back to full reset if diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c index 42e45ae87393..13d1a269c771 100644 --- a/drivers/gpu/drm/i915/intel_hangcheck.c +++ b/drivers/gpu/drm/i915/intel_hangcheck.c @@ -246,7 +246,7 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd) */ tmp = I915_READ_CTL(engine); if (tmp & RING_WAIT) { - i915_handle_error(dev_priv, 0, + i915_handle_error(dev_priv, 0, 0, "Kicking stuck wait on %s", engine->name); I915_WRITE_CTL(engine, tmp); @@ -258,7 +258,7 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd) default: return ENGINE_DEAD; case 1: - i915_handle_error(dev_priv, 0, + i915_handle_error(dev_priv, 0, 0, "Kicking stuck semaphore on %s", engine->name); I915_WRITE_CTL(engine, tmp); @@ -392,7 +392,7 @@ static void hangcheck_declare_hang(struct drm_i915_private *i915, "%s, ", engine->name); msg[len-2] = '\0'; - return i915_handle_error(i915, hung, "%s", msg); + return i915_handle_error(i915, hung, I915_ERROR_CAPTURE, "%s", msg); } /* diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 396abef55dd3..55f25241a199 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -529,8 +529,35 @@ static void inject_preempt_context(struct intel_engine_cs *engine) if (execlists->ctrl_reg) writel(EL_CTRL_LOAD, execlists->ctrl_reg); - execlists_clear_active(&engine->execlists, EXECLISTS_ACTIVE_HWACK); - execlists_set_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT); + execlists_clear_active(execlists, EXECLISTS_ACTIVE_HWACK); + execlists_set_active(execlists, EXECLISTS_ACTIVE_PREEMPT); + + /* Set a timer to force preemption vs hostile userspace */ + if (execlists->preempt_timeout_ns) + hrtimer_start(&execlists->preempt_timer, + ktime_set(0, execlists->preempt_timeout_ns), + HRTIMER_MODE_REL); +} + +static enum hrtimer_restart preempt_timeout(struct hrtimer *hrtimer) +{ + struct intel_engine_cs *engine = + container_of(hrtimer, typeof(*engine), execlists.preempt_timer); + + if (execlists_is_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT)) + queue_work(system_highpri_wq, &engine->execlists.preempt_reset); + + return HRTIMER_NORESTART; +} + +static void preempt_reset(struct work_struct *work) +{ + struct intel_engine_cs *engine = + container_of(work, typeof(*engine), execlists.preempt_reset); + + if (execlists_is_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT)) + i915_handle_error(engine->i915, BIT(engine->id), 0, + "preemption timed out on %s", engine->name); } static void update_rps(struct intel_engine_cs *engine) @@ -946,6 +973,7 @@ static void execlists_submission_tasklet(unsigned long data) EXECLISTS_ACTIVE_PREEMPT)); execlists_clear_active(execlists, EXECLISTS_ACTIVE_PREEMPT); + hrtimer_try_to_cancel(&execlists->preempt_timer); continue; } @@ -2180,6 +2208,11 @@ logical_ring_setup(struct intel_engine_cs *engine) tasklet_init(&engine->execlists.tasklet, execlists_submission_tasklet, (unsigned long)engine); + INIT_WORK(&engine->execlists.preempt_reset, preempt_reset); + hrtimer_init(&engine->execlists.preempt_timer, + CLOCK_MONOTONIC, HRTIMER_MODE_REL); + engine->execlists.preempt_timer.function = preempt_timeout; + logical_ring_default_vfuncs(engine); logical_ring_default_irqs(engine); } diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index 495b21fc33db..74fcff8a2a6e 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -307,6 +307,10 @@ struct intel_engine_execlists { * @preempt_complete_status: expected CSB upon completing preemption */ u32 preempt_complete_status; + + struct hrtimer preempt_timer; + struct work_struct preempt_reset; + unsigned long preempt_timeout_ns; }; #define INTEL_ENGINE_CS_MAX_NAME 8 diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c index c61a51e46eb2..e99705068190 100644 --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c @@ -1091,7 +1091,7 @@ static int igt_handle_error(void *arg) engine->hangcheck.stalled = true; engine->hangcheck.seqno = intel_engine_get_seqno(engine); - i915_handle_error(i915, intel_engine_flag(engine), "%s", __func__); + i915_handle_error(i915, intel_engine_flag(engine), 0, "%s", __func__); xchg(&i915->gpu_error.first_error, error); _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx