Final enablement patch for GPU hang detection using watchdog timeout. Using the gem_context_setparam ioctl, users can specify the desired timeout value in milliseconds, and the driver will do the conversion to 'timestamps'. The _recommended_ default watchdog threshold for video engines is 60 ms, since this has been _empirically determined_ to be a good compromise for low-latency requirements and low rate of false positives. The default register value is ~106ms and the theoretical max value (all 1s) is 353 seconds. Signed-off-by: Tomas Elf <tomas.elf@xxxxxxxxx> Signed-off-by: Arun Siluvery <arun.siluvery@xxxxxxxxxxxxxxx> Signed-off-by: Michel Thierry <michel.thierry@xxxxxxxxx> --- drivers/gpu/drm/i915/i915_gem_context.c | 46 ++++++++++++++++++++++++++++++ drivers/gpu/drm/i915/i915_gem_context.h | 3 ++ drivers/gpu/drm/i915/i915_gem_execbuffer.c | 8 +----- include/uapi/drm/i915_drm.h | 1 + 4 files changed, 51 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index 99c46f4dbde6..c3748878e64c 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -440,6 +440,32 @@ i915_gem_context_create_gvt(struct drm_device *dev) return ctx; } +void i915_context_watchdog_setup(struct i915_gem_context *ctx, u32 value_in_ms) +{ + /* + * Based on time out value (ms) calculate + * timer count thresholds needed based on core frequency. + */ +#define TIMER_MILLISECOND 1000 + + /* + * Timestamp timer resolution = 0.080 uSec, + * or 12500000 counts per second + */ +#define TIMESTAMP_CNTS_PER_SEC_80NS 12500000 + + ctx->watchdog_threshold = + ((value_in_ms) * + ((TIMESTAMP_CNTS_PER_SEC_80NS) / (TIMER_MILLISECOND))); + + /* + * watchdog register must never be programmed to zero. This would + * cause the watchdog counter to exceed and not allow the engine to + * go into IDLE state + */ + GEM_BUG_ON(ctx->watchdog_threshold == 0); +} + int i915_gem_context_init(struct drm_i915_private *dev_priv) { struct i915_gem_context *ctx; @@ -1056,6 +1082,7 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data, struct drm_i915_file_private *file_priv = file->driver_priv; struct drm_i915_gem_context_param *args = data; struct i915_gem_context *ctx; + struct intel_engine_cs *engine; int ret; ret = i915_mutex_lock_interruptible(dev); @@ -1090,6 +1117,15 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data, case I915_CONTEXT_PARAM_BANNABLE: args->value = i915_gem_context_is_bannable(ctx); break; + case I915_CONTEXT_PARAM_WATCHDOG: + engine = to_i915(dev)->engine[VCS]; + if (!engine->emit_start_watchdog) + ret = -EINVAL; + else if (args->value) + ret = -EINVAL; + else + args->value = ctx->watchdog_threshold; + break; default: ret = -EINVAL; break; @@ -1105,6 +1141,7 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data, struct drm_i915_file_private *file_priv = file->driver_priv; struct drm_i915_gem_context_param *args = data; struct i915_gem_context *ctx; + struct intel_engine_cs *engine; int ret; ret = i915_mutex_lock_interruptible(dev); @@ -1147,6 +1184,15 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data, else i915_gem_context_clear_bannable(ctx); break; + case I915_CONTEXT_PARAM_WATCHDOG: + engine = to_i915(dev)->engine[VCS]; + if (!engine->emit_start_watchdog) + ret = -EINVAL; + else if (!args->value) + ret = -EINVAL; + else + i915_context_watchdog_setup(ctx, args->value); + break; default: ret = -EINVAL; break; diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h index 0ac750b90f3d..133ed7b413aa 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.h +++ b/drivers/gpu/drm/i915/i915_gem_context.h @@ -176,6 +176,9 @@ struct i915_gem_context { /** ban_score: Accumulated score of all hangs caused by this context. */ int ban_score; + /** watchdog_threshold: hw watchdog threshold value, in clock counts */ + u32 watchdog_threshold; + /** remap_slice: Bitmask of cache lines that need remapping */ u8 remap_slice; }; diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 348d81c40e81..26c50a6d6158 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -1419,12 +1419,6 @@ execbuf_submit(struct i915_execbuffer_params *params, bool watchdog_running = false; int ret; - /* - * NB: Place-holder until watchdog timeout is enabled through DRM - * interface - */ - bool enable_watchdog = false; - ret = i915_gem_execbuffer_move_to_gpu(params->request, vmas); if (ret) return ret; @@ -1488,7 +1482,7 @@ execbuf_submit(struct i915_execbuffer_params *params, } /* Start watchdog timer */ - if (enable_watchdog) { + if (params->ctx->watchdog_threshold != 0) { if (!params->engine->emit_start_watchdog) return -EINVAL; diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 3554495bef13..e318c4f53a9e 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -1289,6 +1289,7 @@ struct drm_i915_gem_context_param { #define I915_CONTEXT_PARAM_GTT_SIZE 0x3 #define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE 0x4 #define I915_CONTEXT_PARAM_BANNABLE 0x5 +#define I915_CONTEXT_PARAM_WATCHDOG 0x6 __u64 value; }; -- 2.11.0 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx