Quoting Michal Wajdeczko (2017-10-26 18:36:55) > Include GuC and HuC firmware details in captured error state > to provide additional debug information. To reuse existing > uc firmware pretty printer, introduce new drm-printer variant > that works with our i915_error_state_buf output. Also update > uc firmware pretty printer to accept const input. > > v2: don't rely on current caps (Chris) > dump correct fw info (Michal) > v3: simplify capture of custom paths (Chris) > v4: improve 'why' comment (Joonas) > trim output if no fw path (Michal) > group code around uc error state (Michal) > v5: use error in cleanup_uc (Michal) > > Suggested-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> > Signed-off-by: Michal Wajdeczko <michal.wajdeczko@xxxxxxxxx> > Cc: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> > Cc: Joonas Lahtinen <joonas.lahtinen@xxxxxxxxxxxxxxx> > --- > drivers/gpu/drm/i915/i915_drv.h | 5 +++ > drivers/gpu/drm/i915/i915_gpu_error.c | 65 +++++++++++++++++++++++++++++++++++ > drivers/gpu/drm/i915/intel_uc_fw.c | 6 +++- > drivers/gpu/drm/i915/intel_uc_fw.h | 2 +- > 4 files changed, 76 insertions(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index 366ba74..f19f0fa 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -911,6 +911,11 @@ struct i915_gpu_state { > struct intel_device_info device_info; > struct i915_params params; > > + struct i915_error_uc { > + struct intel_uc_fw guc_fw; > + struct intel_uc_fw huc_fw; > + } uc; > + > /* Generic register state */ > u32 eir; > u32 pgtbl_er; > diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c > index 653fb69..4500fc8 100644 > --- a/drivers/gpu/drm/i915/i915_gpu_error.c > +++ b/drivers/gpu/drm/i915/i915_gpu_error.c > @@ -30,6 +30,8 @@ > #include <generated/utsrelease.h> > #include <linux/stop_machine.h> > #include <linux/zlib.h> > +#include <drm/drm_print.h> > + > #include "i915_drv.h" > > static 
const char *engine_str(int engine) > @@ -175,6 +177,21 @@ static void i915_error_puts(struct drm_i915_error_state_buf *e, > #define err_printf(e, ...) i915_error_printf(e, __VA_ARGS__) > #define err_puts(e, s) i915_error_puts(e, s) > > +static void __i915_printfn_error(struct drm_printer *p, struct va_format *vaf) > +{ > + i915_error_vprintf(p->arg, vaf->fmt, *vaf->va); > +} > + > +static inline struct drm_printer > +i915_error_printer(struct drm_i915_error_state_buf *e) > +{ > + struct drm_printer p = { > + .printfn = __i915_printfn_error, > + .arg = e, > + }; > + return p; > +} > + > #ifdef CONFIG_DRM_I915_COMPRESS_ERROR > > struct compress { > @@ -589,11 +606,26 @@ static void err_print_pciid(struct drm_i915_error_state_buf *m, > pdev->subsystem_device); > } > > +static void err_print_uc(struct drm_i915_error_state_buf *m, > + const struct i915_error_uc *error_uc) > +{ > + struct drm_printer p = i915_error_printer(m); > + const struct i915_gpu_state *error = > + container_of(error_uc, typeof(*error), uc); > + > + if (!error->device_info.has_guc) > + return; I am still not keen on how derived state is mixed in with checking whether or not a piece of fw was presented to HW before the hang, but it is still better than before. 
> + > + intel_uc_fw_dump(&error_uc->guc_fw, &p); > + intel_uc_fw_dump(&error_uc->huc_fw, &p); > +} > + > int i915_error_state_to_str(struct drm_i915_error_state_buf *m, > const struct i915_gpu_state *error) > { > struct drm_i915_private *dev_priv = m->i915; > struct drm_i915_error_object *obj; > + > int i, j; > > if (!error) { > @@ -773,6 +805,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, > > err_print_capabilities(m, &error->device_info); > err_print_params(m, &error->params); > + err_print_uc(m, &error->uc); > > if (m->bytes == 0 && m->err) > return m->err; > @@ -831,6 +864,14 @@ static __always_inline void free_param(const char *type, void *x) > kfree(*(void **)x); > } > > +static void cleanup_uc_state(struct i915_gpu_state *error) > +{ > + struct i915_error_uc *error_uc = &error->uc; > + > + kfree(error_uc->guc_fw.path); > + kfree(error_uc->huc_fw.path); > +} > + > void __i915_gpu_state_free(struct kref *error_ref) > { > struct i915_gpu_state *error = > @@ -870,6 +911,8 @@ void __i915_gpu_state_free(struct kref *error_ref) > I915_PARAMS_FOR_EACH(FREE); > #undef FREE > > + cleanup_uc_state(error); > + > kfree(error); > } > > @@ -1559,6 +1602,26 @@ static void i915_capture_pinned_buffers(struct drm_i915_private *dev_priv, > error->pinned_bo = bo; > } > > +static void capture_uc_state(struct i915_gpu_state *error) > +{ > + struct drm_i915_private *i915 = error->i915; > + struct i915_error_uc *error_uc = &error->uc; > + > + /* Capturing uC state won't be useful if there is no GuC */ > + if (!error->device_info.has_guc) > + return; > + > + error_uc->guc_fw = i915->guc.fw; > + error_uc->huc_fw = i915->huc.fw; > + > + /* Non-default firmware paths will be specified by the modparam. > + * As modparams are generally accessible from the userspace make > + * explicit copies of the firmware paths. 
> + */ > + error_uc->guc_fw.path = kstrdup(i915->guc.fw.path, GFP_ATOMIC); > + error_uc->huc_fw.path = kstrdup(i915->huc.fw.path, GFP_ATOMIC); > +} > + > static void i915_gem_capture_guc_log_buffer(struct drm_i915_private *dev_priv, > struct i915_gpu_state *error) > { > @@ -1710,6 +1773,8 @@ static int capture(void *data) > I915_PARAMS_FOR_EACH(DUP); > #undef DUP > > + capture_uc_state(error); > + > i915_capture_gen_state(error->i915, error); > i915_capture_reg_state(error->i915, error); > i915_gem_record_fences(error->i915, error); > diff --git a/drivers/gpu/drm/i915/intel_uc_fw.c b/drivers/gpu/drm/i915/intel_uc_fw.c > index 973888e..79a8797 100644 > --- a/drivers/gpu/drm/i915/intel_uc_fw.c > +++ b/drivers/gpu/drm/i915/intel_uc_fw.c > @@ -299,10 +299,14 @@ void intel_uc_fw_fini(struct intel_uc_fw *uc_fw) > * > * Pretty printer for uC firmware. > */ > -void intel_uc_fw_dump(struct intel_uc_fw *uc_fw, struct drm_printer *p) > +void intel_uc_fw_dump(const struct intel_uc_fw *uc_fw, struct drm_printer *p) > { > drm_printf(p, "%s firmware: %s\n", > intel_uc_fw_type_repr(uc_fw->type), uc_fw->path); > + > + if (!uc_fw->path) > + return; This could be NULL simply due to allocation failure. You still want the status and version info. As the path isn't dereferenced here, it is safe enough to drop this chunk, as you currently don't even try and pretty-print the error state unless it is enabled. Removed the chunk and applied. -Chris _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx