> -----Original Message----- > From: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxxxxxxxx> > Sent: Monday, February 26, 2024 1:56 AM > To: Vivi, Rodrigo <rodrigo.vivi@xxxxxxxxx> > Cc: Intel-gfx@xxxxxxxxxxxxxxxxxxxxx; dri-devel@xxxxxxxxxxxxxxxxxxxxx; Ursulin, > Tvrtko <tvrtko.ursulin@xxxxxxxxx>; Landwerlin, Lionel G > <lionel.g.landwerlin@xxxxxxxxx>; Santa, Carlos <carlos.santa@xxxxxxxxx> > Subject: Re: [PATCH 2/2] drm/i915: Support replaying GPU hangs with > captured context image > > > > On 22/02/2024 21:07, Rodrigo Vivi wrote: > > On Wed, Feb 21, 2024 at 02:22:45PM +0000, Tvrtko Ursulin wrote: > >> From: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> > >> > >> When debugging GPU hangs Mesa developers are finding it useful to > >> replay the captured error state against the simulator. But due > >> various simulator limitations which prevent replicating all hangs, > >> one step further is being able to replay against a real GPU. > >> > >> This is almost doable today with the missing part being able to > >> upload the captured context image into the driver state prior to > >> executing the uploaded hanging batch and all the buffers. > >> > >> To enable this last part we add a new context parameter called > >> I915_CONTEXT_PARAM_CONTEXT_IMAGE. It follows the existing SSEU > >> configuration pattern of being able to select which context to apply > >> against, paired with the actual image and its size. > >> > >> Since this is adding a new concept of debug only uapi, we hide it > >> behind a new kconfig option and also require activation with a module > parameter. > >> Together with a warning banner printed at driver load, all those > >> combined should be sufficient to guard against inadvertently enabling the > feature. > >> > >> In terms of implementation we allow the legacy context set param to > >> be used since that removes the need to record the per context data in > >> the proto context, while still allowing flexibility of specifying > >> context images for any context. > >> > >> Mesa MR using the uapi can be seen at: > >> https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27594 > >> > >> v2: > >> * Fix whitespace alignment as per checkpatch. > >> * Added warning on userspace misuse. > >> * Rebase for extracting ce->default_state shadowing. > >> > >> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> > >> Cc: Lionel Landwerlin <lionel.g.landwerlin@xxxxxxxxx> > >> Cc: Carlos Santa <carlos.santa@xxxxxxxxx> > >> Cc: Rodrigo Vivi <rodrigo.vivi@xxxxxxxxx> > >> Reviewed-by: Rodrigo Vivi <rodrigo.vivi@xxxxxxxxx> # v1 > > > > still valid for v2. Thanks for splitting the patch. > > Great, thanks! > > Now we need to hear from Lionel if he is still keen to have this. In which case > some acks or tested by would be good. > > Regards, > > Tvrtko I tested a setup at my end and it reproduces the GPU hang. I am on a TGL based NUC system running Ubuntu 22.04. There's a one-liner change that's needed in the corresponding Mesa but I've communicated that to Lionel already. You can use my Tested-by: Carlos Santa <carlos.santa@xxxxxxxxx> > > >> --- > >> drivers/gpu/drm/i915/Kconfig.debug | 17 +++ > >> drivers/gpu/drm/i915/gem/i915_gem_context.c | 113 > ++++++++++++++++++ > >> drivers/gpu/drm/i915/gt/intel_context.c | 2 + > >> drivers/gpu/drm/i915/gt/intel_context.h | 22 ++++ > >> drivers/gpu/drm/i915/gt/intel_context_types.h | 1 + > >> drivers/gpu/drm/i915/gt/intel_lrc.c | 3 +- > >> .../gpu/drm/i915/gt/intel_ring_submission.c | 3 +- > >> drivers/gpu/drm/i915/i915_params.c | 5 + > >> drivers/gpu/drm/i915/i915_params.h | 3 +- > >> include/uapi/drm/i915_drm.h | 27 +++++ > >> 10 files changed, 193 insertions(+), 3 deletions(-) > >> > >> diff --git a/drivers/gpu/drm/i915/Kconfig.debug > >> b/drivers/gpu/drm/i915/Kconfig.debug > >> index 5b7162076850..32e9f70e91ed 100644 > >> --- a/drivers/gpu/drm/i915/Kconfig.debug > >> +++ b/drivers/gpu/drm/i915/Kconfig.debug > >> @@ -16,6 +16,23 @@ config DRM_I915_WERROR > >> > >> If in doubt, say "N". > >> > >> +config DRM_I915_REPLAY_GPU_HANGS_API > >> + bool "Enable GPU hang replay userspace API" > >> + depends on DRM_I915 > >> + depends on EXPERT > >> + default n > >> + help > >> + Choose this option if you want to enable special and unstable > >> + userspace API used for replaying GPU hangs on a running system. > >> + > >> + This API is intended to be used by userspace graphics stack > developers > >> + and provides no stability guarantees. > >> + > >> + The API needs to be activated at boot time using the > >> + enable_debug_only_api module parameter. > >> + > >> + If in doubt, say "N". > >> + > >> config DRM_I915_DEBUG > >> bool "Enable additional driver debugging" > >> depends on DRM_I915 > >> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c > >> b/drivers/gpu/drm/i915/gem/i915_gem_context.c > >> index dcbfe32fd30c..481aacbc1772 100644 > >> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c > >> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c > >> @@ -78,6 +78,7 @@ > >> #include "gt/intel_engine_user.h" > >> #include "gt/intel_gpu_commands.h" > >> #include "gt/intel_ring.h" > >> +#include "gt/shmem_utils.h" > >> > >> #include "pxp/intel_pxp.h" > >> > >> @@ -949,6 +950,7 @@ static int set_proto_ctx_param(struct > drm_i915_file_private *fpriv, > >> case I915_CONTEXT_PARAM_NO_ZEROMAP: > >> case I915_CONTEXT_PARAM_BAN_PERIOD: > >> case I915_CONTEXT_PARAM_RINGSIZE: > >> + case I915_CONTEXT_PARAM_CONTEXT_IMAGE: > >> default: > >> ret = -EINVAL; > >> break; > >> @@ -2092,6 +2094,95 @@ static int get_protected(struct > i915_gem_context *ctx, > >> return 0; > >> } > >> > >> +static int set_context_image(struct i915_gem_context *ctx, > >> + struct drm_i915_gem_context_param *args) { > >> + struct i915_gem_context_param_context_image user; > >> + struct intel_context *ce; > >> + struct file *shmem_state; > >> + unsigned long lookup; > >> + void *state; > >> + int ret = 0; > >> + > >> + if (!IS_ENABLED(CONFIG_DRM_I915_REPLAY_GPU_HANGS_API)) > >> + return -EINVAL; > >> + > >> + if (!ctx->i915->params.enable_debug_only_api) > >> + return -EINVAL; > >> + > >> + if (args->size < sizeof(user)) > >> + return -EINVAL; > >> + > >> + if (copy_from_user(&user, u64_to_user_ptr(args->value), > sizeof(user))) > >> + return -EFAULT; > >> + > >> + if (user.mbz) > >> + return -EINVAL; > >> + > >> + if (user.flags & ~(I915_CONTEXT_IMAGE_FLAG_ENGINE_INDEX)) > >> + return -EINVAL; > >> + > >> + lookup = 0; > >> + if (user.flags & I915_CONTEXT_IMAGE_FLAG_ENGINE_INDEX) > >> + lookup |= LOOKUP_USER_INDEX; > >> + > >> + ce = lookup_user_engine(ctx, lookup, &user.engine); > >> + if (IS_ERR(ce)) > >> + return PTR_ERR(ce); > >> + > >> + if (user.size < ce->engine->context_size) { > >> + ret = -EINVAL; > >> + goto out_ce; > >> + } > >> + > >> + if (drm_WARN_ON_ONCE(&ctx->i915->drm, > >> + test_bit(CONTEXT_ALLOC_BIT, &ce->flags))) { > >> + /* > >> + * This is racy but for a debug only API, if userspace is keen > >> + * to create and configure contexts, while simultaneously > using > >> + * them from a second thread, let them suffer by potentially > not > >> + * executing with the context image they just raced to apply. > >> + */ > >> + ret = -EBUSY; > >> + goto out_ce; > >> + } > >> + > >> + state = kmalloc(ce->engine->context_size, GFP_KERNEL); > >> + if (!state) { > >> + ret = -ENOMEM; > >> + goto out_ce; > >> + } > >> + > >> + if (copy_from_user(state, u64_to_user_ptr(user.image), > >> + ce->engine->context_size)) { > >> + ret = -EFAULT; > >> + goto out_state; > >> + } > >> + > >> + shmem_state = shmem_create_from_data(ce->engine->name, > >> + state, ce->engine->context_size); > >> + if (IS_ERR(shmem_state)) { > >> + ret = PTR_ERR(shmem_state); > >> + goto out_state; > >> + } > >> + > >> + if (intel_context_set_own_state(ce)) { > >> + ret = -EBUSY; > >> + fput(shmem_state); > >> + goto out_state; > >> + } > >> + > >> + ce->default_state = shmem_state; > >> + > >> + args->size = sizeof(user); > >> + > >> +out_state: > >> + kfree(state); > >> +out_ce: > >> + intel_context_put(ce); > >> + return ret; > >> +} > >> + > >> static int ctx_setparam(struct drm_i915_file_private *fpriv, > >> struct i915_gem_context *ctx, > >> struct drm_i915_gem_context_param *args) @@ - > 2144,6 +2235,10 @@ > >> static int ctx_setparam(struct drm_i915_file_private *fpriv, > >> ret = set_persistence(ctx, args); > >> break; > >> > >> + case I915_CONTEXT_PARAM_CONTEXT_IMAGE: > >> + ret = set_context_image(ctx, args); > >> + break; > >> + > >> case I915_CONTEXT_PARAM_PROTECTED_CONTENT: > >> case I915_CONTEXT_PARAM_NO_ZEROMAP: > >> case I915_CONTEXT_PARAM_BAN_PERIOD: > >> @@ -2488,6 +2583,7 @@ int i915_gem_context_getparam_ioctl(struct > drm_device *dev, void *data, > >> case I915_CONTEXT_PARAM_BAN_PERIOD: > >> case I915_CONTEXT_PARAM_ENGINES: > >> case I915_CONTEXT_PARAM_RINGSIZE: > >> + case I915_CONTEXT_PARAM_CONTEXT_IMAGE: > >> default: > >> ret = -EINVAL; > >> break; > >> @@ -2600,5 +2696,22 @@ int __init > i915_gem_context_module_init(void) > >> if (!slab_luts) > >> return -ENOMEM; > >> > >> + if (IS_ENABLED(CONFIG_DRM_I915_REPLAY_GPU_HANGS_API)) { > >> + > pr_notice("************************************************** > ************\n"); > >> + pr_notice("** NOTICE NOTICE NOTICE NOTICE NOTICE > NOTICE NOTICE **\n"); > >> + pr_notice("** **\n"); > >> + if (i915_modparams.enable_debug_only_api) > >> + pr_notice("** i915.enable_debug_only_api is > intended to be set **\n"); > >> + else > >> + pr_notice("** > CONFIG_DRM_I915_REPLAY_GPU_HANGS_API builds are intended **\n"); > >> + pr_notice("** for specific userspace graphics stack developers > only! **\n"); > >> + pr_notice("** **\n"); > >> + pr_notice("** If you are seeing this message please report this > to the **\n"); > >> + pr_notice("** provider of your kernel build. > **\n"); > >> + pr_notice("** **\n"); > >> + pr_notice("** NOTICE NOTICE NOTICE NOTICE NOTICE > NOTICE NOTICE **\n"); > >> + > pr_notice("************************************************** > ************\n"); > >> + } > >> + > >> return 0; > >> } > >> diff --git a/drivers/gpu/drm/i915/gt/intel_context.c > >> b/drivers/gpu/drm/i915/gt/intel_context.c > >> index a2f1245741bb..b1b8695ba7c9 100644 > >> --- a/drivers/gpu/drm/i915/gt/intel_context.c > >> +++ b/drivers/gpu/drm/i915/gt/intel_context.c > >> @@ -27,6 +27,8 @@ static void rcu_context_free(struct rcu_head *rcu) > >> struct intel_context *ce = container_of(rcu, typeof(*ce), rcu); > >> > >> trace_intel_context_free(ce); > >> + if (intel_context_has_own_state(ce)) > >> + fput(ce->default_state); > >> kmem_cache_free(slab_ce, ce); > >> } > >> > >> diff --git a/drivers/gpu/drm/i915/gt/intel_context.h > >> b/drivers/gpu/drm/i915/gt/intel_context.h > >> index 25564c01507e..9f523999acd1 100644 > >> --- a/drivers/gpu/drm/i915/gt/intel_context.h > >> +++ b/drivers/gpu/drm/i915/gt/intel_context.h > >> @@ -375,6 +375,28 @@ intel_context_clear_nopreempt(struct > intel_context *ce) > >> clear_bit(CONTEXT_NOPREEMPT, &ce->flags); > >> } > >> > >> +#if IS_ENABLED(CONFIG_DRM_I915_REPLAY_GPU_HANGS_API) > >> +static inline bool intel_context_has_own_state(const struct > >> +intel_context *ce) { > >> + return test_bit(CONTEXT_OWN_STATE, &ce->flags); } > >> + > >> +static inline bool intel_context_set_own_state(struct intel_context > >> +*ce) { > >> + return test_and_set_bit(CONTEXT_OWN_STATE, &ce->flags); } #else > >> +static inline bool intel_context_has_own_state(const struct > >> +intel_context *ce) { > >> + return false; > >> +} > >> + > >> +static inline bool intel_context_set_own_state(struct intel_context > >> +*ce) { > >> + return true; > >> +} > >> +#endif > >> + > >> u64 intel_context_get_total_runtime_ns(struct intel_context *ce); > >> u64 intel_context_get_avg_runtime_ns(struct intel_context *ce); > >> > >> diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h > >> b/drivers/gpu/drm/i915/gt/intel_context_types.h > >> index b179178680a5..d91b33c3664c 100644 > >> --- a/drivers/gpu/drm/i915/gt/intel_context_types.h > >> +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h > >> @@ -132,6 +132,7 @@ struct intel_context { > >> #define CONTEXT_PERMA_PIN 11 > >> #define CONTEXT_IS_PARKING 12 > >> #define CONTEXT_EXITING 13 > >> +#define CONTEXT_OWN_STATE 14 > >> > >> struct { > >> u64 timeout_us; > >> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c > >> b/drivers/gpu/drm/i915/gt/intel_lrc.c > >> index d4eb822d20ae..1038659754f8 100644 > >> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c > >> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c > >> @@ -1173,7 +1173,8 @@ int lrc_alloc(struct intel_context *ce, struct > >> intel_engine_cs *engine) > >> > >> GEM_BUG_ON(ce->state); > >> > >> - ce->default_state = engine->default_state; > >> + if (!intel_context_has_own_state(ce)) > >> + ce->default_state = engine->default_state; > >> > >> vma = __lrc_alloc_state(ce, engine); > >> if (IS_ERR(vma)) > >> diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c > >> b/drivers/gpu/drm/i915/gt/intel_ring_submission.c > >> index 8625e88e785f..72277bc8322e 100644 > >> --- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c > >> +++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c > >> @@ -569,7 +569,8 @@ static int ring_context_alloc(struct intel_context > *ce) > >> { > >> struct intel_engine_cs *engine = ce->engine; > >> > >> - ce->default_state = engine->default_state; > >> + if (!intel_context_has_own_state(ce)) > >> + ce->default_state = engine->default_state; > >> > >> /* One ringbuffer to rule them all */ > >> GEM_BUG_ON(!engine->legacy.ring); > >> diff --git a/drivers/gpu/drm/i915/i915_params.c > >> b/drivers/gpu/drm/i915/i915_params.c > >> index de43048543e8..afd95b2b7e4b 100644 > >> --- a/drivers/gpu/drm/i915/i915_params.c > >> +++ b/drivers/gpu/drm/i915/i915_params.c > >> @@ -134,6 +134,11 @@ i915_param_named_unsafe(lmem_size, uint, > 0400, > >> i915_param_named_unsafe(lmem_bar_size, uint, 0400, > >> "Set the lmem bar size(in MiB)."); > >> > >> +#if IS_ENABLED(CONFIG_DRM_I915_REPLAY_GPU_HANGS_API) > >> +i915_param_named(enable_debug_only_api, bool, 0400, > >> + "Enable support for unstable debug only userspace API. > >> +(default:false)"); #endif > >> + > >> static void _param_print_bool(struct drm_printer *p, const char *name, > >> bool val) > >> { > >> diff --git a/drivers/gpu/drm/i915/i915_params.h > >> b/drivers/gpu/drm/i915/i915_params.h > >> index 1315d7fac850..e2cdf12ce611 100644 > >> --- a/drivers/gpu/drm/i915/i915_params.h > >> +++ b/drivers/gpu/drm/i915/i915_params.h > >> @@ -64,7 +64,8 @@ struct drm_printer; > >> /* leave bools at the end to not create holes */ \ > >> param(bool, enable_hangcheck, true, 0600) \ > >> param(bool, error_capture, true, > IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) ? 0600 : 0) \ > >> - param(bool, enable_gvt, false, IS_ENABLED(CONFIG_DRM_I915_GVT) > ? 0400 : 0) > >> + param(bool, enable_gvt, false, IS_ENABLED(CONFIG_DRM_I915_GVT) > ? 0400 : 0) \ > >> + param(bool, enable_debug_only_api, false, > >> +IS_ENABLED(CONFIG_DRM_I915_REPLAY_GPU_HANGS_API) ? 0400 : 0) > >> > >> #define MEMBER(T, member, ...) T member; > >> struct i915_params { > >> diff --git a/include/uapi/drm/i915_drm.h > >> b/include/uapi/drm/i915_drm.h index 2ee338860b7e..e194d17c3568 > 100644 > >> --- a/include/uapi/drm/i915_drm.h > >> +++ b/include/uapi/drm/i915_drm.h > >> @@ -2154,6 +2154,15 @@ struct drm_i915_gem_context_param { > >> __u64 value; > >> }; > >> > >> +/* > >> + * I915_CONTEXT_PARAM_CONTEXT_IMAGE: > >> + * > >> + * Allows userspace to provide own context images. > >> + * > >> + * Note that this is a debug API not available on production kernel builds. > >> + */ > >> +#define I915_CONTEXT_PARAM_CONTEXT_IMAGE 0xe > >> + > >> /* > >> * Context SSEU programming > >> * > >> @@ -2549,6 +2558,24 @@ struct i915_context_param_engines { > >> struct i915_engine_class_instance engines[N__]; \ > >> } __attribute__((packed)) name__ > >> > >> +struct i915_gem_context_param_context_image { > >> + /** @engine: Engine class & instance to be configured. */ > >> + struct i915_engine_class_instance engine; > >> + > >> + /** @flags: One of the supported flags or zero. */ > >> + __u32 flags; > >> +#define I915_CONTEXT_IMAGE_FLAG_ENGINE_INDEX (1u << 0) > >> + > >> + /** @size: Size of the image blob pointed to by @image. */ > >> + __u32 size; > >> + > >> + /** @mbz: Must be zero. */ > >> + __u32 mbz; > >> + > >> + /** @image: Userspace memory containing the context image. */ > >> + __u64 image; > >> +} __attribute__((packed)); > >> + > >> /** > >> * struct drm_i915_gem_context_create_ext_setparam - Context > parameter > >> * to set or query during context creation. > >> -- > >> 2.40.1 > >>