[PATCH v5 4/5] drm/i915: Watchdog timeout: DRM kernel interface to set the timeout

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Michel Thierry <michel.thierry@xxxxxxxxx>

Final enablement patch for GPU hang detection using watchdog timeout.
Using the gem_context_setparam ioctl, users can specify the desired
timeout value in microseconds, and the driver will do the conversion to
'timestamps'.

The recommended default watchdog threshold for video engines is 60000 us,
since this has been _empirically determined_ to be a good compromise for
low-latency requirements and low rate of false positives. The default
register value is ~106000us and the theoretical max value (all 1s) is
353 seconds.

[1] http://patchwork.freedesktop.org/patch/msgid/20170329135831.30254-2-chris@xxxxxxxxxxxxxxxxxx

v2: Fixed get api to return values in microseconds. Threshold updated to
be per context engine. Check for u32 overflow. Capture ctx threshold
value in error state.

v3: Add a way to get array size, short-cut to disable all thresholds,
return EFAULT / EINVAL as needed. Move the capture of the threshold
value in the error state into a new patch. BXT has a different
timestamp base (because why not?).

v4: Checking if watchdog is available should be the first thing to
do, instead of giving false hopes to abi users; remove unnecessary & in
set_watchdog; ignore args->size in getparam.

v5: GEN9-LP platforms have a different crystal clock frequency, use the
right timestamp base for them (magic 8-ball predicts this will change
again later on, so future-proof it). (Daniele)

v6: Rebase, no more mutex BKL in getparam_ioctl.

v7: use to_intel_context instead of ctx->engine.

v8: Rebase, remove extra mutex from i915_gem_context_set_watchdog (Tvrtko),
Update UAPI to use engine class while keeping thresholds per
engine class (Michel).

v9: Rebase,
    Remove outdated comment from the commit message (Tvrtko)
    Use the engine->flag to verify for gpu watchdog support (Tvrtko)
    Use the standard copy_to_user() instead (Tvrtko)
    Use the correct type when declaring engine class iterator (Tvrtko)
    Remove yet another unnecessary mutex_lock (Tvrtko)

v10: Rebase,
    Document uAPI struct drm_i915_watchdog_timeout and use it (Tvrtko)
    Let the compiler take care of inlines (Tvrtko)
    Make watchdog_to_clock_counts more robust (Tvrtko)

Cc: Antonio Argenziano <antonio.argenziano@xxxxxxxxx>
Cc: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxxxxxxxx>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@xxxxxxxxx>
Signed-off-by: Michel Thierry <michel.thierry@xxxxxxxxx>
Signed-off-by: Carlos Santa <carlos.santa@xxxxxxxxx>
---
 drivers/gpu/drm/i915/i915_drv.h         |   3 +
 drivers/gpu/drm/i915/i915_gem_context.c | 150 ++++++++++++++++++++++++
 include/uapi/drm/i915_drm.h             |  17 +++
 3 files changed, 170 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index c65c2e6649df..5324397c3801 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1598,6 +1598,9 @@ struct drm_i915_private {
 	struct drm_i915_fence_reg fence_regs[I915_MAX_NUM_FENCES]; /* assume 965 */
 	int num_fence_regs; /* 8 on pre-965, 16 otherwise */
 
+	/* Command stream timestamp base - helps define watchdog threshold */
+	u32 cs_timestamp_base;
+
 	unsigned int fsb_freq, mem_freq, is_ddr3;
 	unsigned int skl_preferred_vco_freq;
 	unsigned int max_cdclk_freq;
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 9625b5f7faf7..cfd33ca5c13f 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -878,6 +878,149 @@ int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data,
 	return 0;
 }
 
+/*
+ * Command stream (CS) timestamp rate used to convert watchdog thresholds
+ * between microseconds and timestamp counts:
+ *
+ *   BDW, CHV & SKL+: resolution 0.080 us, 12500000 counts per second,
+ *                    or ~12 counts per microsecond.
+ *   BXT/GLK:         resolution 0.052 us, 19200000 counts per second,
+ *                    or ~19 counts per microsecond.
+ *
+ * Future-proofing, some day it won't be as simple as just GEN & IS_LP.
+ */
+#define GEN8_TIMESTAMP_CNTS_PER_USEC 12
+#define GEN9_LP_TIMESTAMP_CNTS_PER_USEC 19
+
+/* Return the counts-per-us rate, caching it in i915->cs_timestamp_base. */
+u32 cs_timestamp_in_us(struct drm_i915_private *i915)
+{
+	u32 counts_per_us;
+
+	/* A non-zero cached value means the lookup was already done */
+	if (i915->cs_timestamp_base)
+		return i915->cs_timestamp_base;
+
+	/* Unknown gens are reported via MISSING_CASE(), then handled like gen9 */
+	if (INTEL_GEN(i915) != 8 && INTEL_GEN(i915) != 9)
+		MISSING_CASE(INTEL_GEN(i915));
+
+	if (INTEL_GEN(i915) != 8 && IS_GEN9_LP(i915))
+		counts_per_us = GEN9_LP_TIMESTAMP_CNTS_PER_USEC;
+	else
+		counts_per_us = GEN8_TIMESTAMP_CNTS_PER_USEC;
+
+	i915->cs_timestamp_base = counts_per_us;
+	return counts_per_us;
+}
+
+/* Convert a watchdog threshold from timestamp counts to microseconds. */
+u32 watchdog_to_us(struct drm_i915_private *i915, u32 value_in_clock_counts)
+{
+	return value_in_clock_counts / cs_timestamp_in_us(i915);
+}
+
+/* Convert *value_in_us to timestamp counts in place; -E2BIG if > 32 bits. */
+int watchdog_to_clock_counts(struct drm_i915_private *i915, u32 *value_in_us)
+{
+	/* Widen first: a u32 * u32 product would wrap before the range check */
+	u64 threshold = (u64)*value_in_us * cs_timestamp_in_us(i915);
+
+	if (overflows_type(threshold, u32))
+		return -E2BIG;
+	*value_in_us = threshold;
+	return 0;
+}
+
+/* On success copies to userspace the per-class watchdog thresholds of @ctx in
+ * microseconds and sets args->size; returns -ENODEV / -EFAULT otherwise. The
+ * array is zeroed first so unset fields never leak kernel stack to userspace.
+ */
+int i915_gem_context_get_watchdog(struct i915_gem_context *ctx,
+				  struct drm_i915_gem_context_param *args)
+{
+	struct drm_i915_private *i915 = ctx->i915;
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	struct drm_i915_gem_watchdog_timeout threshold_in_us[OTHER_CLASS];
+
+	for_each_engine(engine, i915, id) {
+		/* not supported in blitter engine */
+		if (id != BCS0 && !intel_engine_supports_watchdog(engine))
+			return -ENODEV;
+	}
+
+	memset(threshold_in_us, 0, sizeof(threshold_in_us));
+
+	for_each_engine(engine, i915, id) {
+		struct intel_context *ce = intel_context_lookup(ctx, engine);
+
+		threshold_in_us[engine->class].timeout_us =
+			watchdog_to_us(i915, ce->watchdog_threshold);
+	}
+
+	if (copy_to_user(u64_to_user_ptr(args->value),
+			 threshold_in_us, sizeof(threshold_in_us)))
+		return -EFAULT;
+
+	args->size = sizeof(threshold_in_us);
+	return 0;
+}
+
+/*
+ * Set the per-engine-class watchdog thresholds of @ctx from an array of
+ * timeouts in microseconds (args->value), converted to timestamp counts.
+ * A timeout of 0 disables the watchdog; args->size == 0 disables all classes.
+ * Returns -ENODEV without watchdog support, -EFAULT on copy error, -EINVAL
+ * for a short buffer, a blitter timeout or a value that fails conversion.
+ */
+int i915_gem_context_set_watchdog(struct i915_gem_context *ctx,
+				  struct drm_i915_gem_context_param *args)
+{
+	struct drm_i915_private *i915 = ctx->i915;
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	struct drm_i915_gem_watchdog_timeout threshold[OTHER_CLASS];
+	int i;
+
+	for_each_engine(engine, i915, id) {
+		if (id != BCS0 && !intel_engine_supports_watchdog(engine))
+			return -ENODEV;
+	}
+
+	memset(threshold, 0, sizeof(threshold));
+
+	/* shortcut to disable in all engines */
+	if (args->size == 0)
+		goto set_watchdog;
+
+	/* a short buffer is a malformed request, not a bad address */
+	if (args->size < sizeof(threshold))
+		return -EINVAL;
+
+	if (copy_from_user(threshold, u64_to_user_ptr(args->value),
+			   sizeof(threshold)))
+		return -EFAULT;
+
+	/* not supported in blitter engine */
+	if (threshold[COPY_ENGINE_CLASS].timeout_us > 0)
+		return -EINVAL;
+
+	for (i = RENDER_CLASS; i < OTHER_CLASS; i++) {
+		if (watchdog_to_clock_counts(i915, &threshold[i].timeout_us))
+			return -EINVAL;
+	}
+
+set_watchdog:
+	for_each_engine(engine, i915, id) {
+		struct intel_context *ce = intel_context_lookup(ctx, engine);
+
+		ce->watchdog_threshold = threshold[engine->class].timeout_us;
+	}
+
+	return 0;
+}
+
 static int get_sseu(struct i915_gem_context *ctx,
 		    struct drm_i915_gem_context_param *args)
 {
@@ -970,6 +1113,10 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 		args->size = 0;
 		args->value = ctx->sched.priority >> I915_USER_PRIORITY_SHIFT;
 		break;
+	case I915_CONTEXT_PARAM_WATCHDOG:
+		ret = i915_gem_context_get_watchdog(ctx, args);
+		break;
+
 	case I915_CONTEXT_PARAM_SSEU:
 		ret = get_sseu(ctx, args);
 		break;
@@ -1335,6 +1482,9 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
 	case I915_CONTEXT_PARAM_SSEU:
 		ret = set_sseu(ctx, args);
 		break;
+	case I915_CONTEXT_PARAM_WATCHDOG:
+		ret = i915_gem_context_set_watchdog(ctx, args);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 5e7bc6412880..3b1bfb9996ea 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1440,6 +1440,7 @@ struct drm_i915_reg_read {
 	 */
 	__u64 offset;
 #define I915_REG_READ_8B_WA (1ul << 0)
+#define I915_CONTEXT_PARAM_WATCHDOG	0x10
 
 	__u64 val; /* Return value */
 };
@@ -1472,6 +1473,22 @@ struct drm_i915_reset_stats {
 
 };
 
+/* Array element exchanged via I915_CONTEXT_PARAM_WATCHDOG get/setparam */
+struct drm_i915_gem_watchdog_timeout {
+	/* Engine selector; two views of the same 32 bits */
+	union {
+		struct {
+			/* Engine class & instance to be configured or queried */
+			__u16 engine_class;
+			__u16 engine_instance;
+		};
+		/* Index based addressing mode */
+		__u32 index;
+	};
+	/* GPU Engine watchdog reset timeout in us */
+	__u32 timeout_us;
+};
+
 struct drm_i915_gem_userptr {
 	__u64 user_ptr;
 	__u64 user_size;
-- 
2.17.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx




[Index of Archives]     [AMD Graphics]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux