On Tue, May 16, 2017 at 02:52:53PM -0700, Daniele Ceraolo Spurio wrote: > We're currently deleting the GuC logs if the FW fails to load, but those > are still useful to understand why the loading failed. Keeping the > object around allows us to access them after driver load is completed. > > v2: keep the object around instead of using kernel memory (chris) > don't store the logs in the gpu_error struct (Chris) > add a check on guc_log_level to avoid snapshotting empty logs > > v3: use separate debugfs for error log (Chris) > > v4: rebased > > v5: clean up obj selection, move err_load inside guc_log, move err_load > cleanup, rename functions (Michal) > > Cc: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> > Cc: Oscar Mateo <oscar.mateo@xxxxxxxxx> > Cc: Michal Wajdeczko <michal.wajdeczko@xxxxxxxxx> > Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@xxxxxxxxx> > --- > drivers/gpu/drm/i915/i915_debugfs.c | 38 +++++++++++++++++++++++------------- > drivers/gpu/drm/i915/intel_guc_log.c | 17 ++++++++++++++++ > drivers/gpu/drm/i915/intel_uc.c | 14 +++++++++++-- > drivers/gpu/drm/i915/intel_uc.h | 5 +++++ > 4 files changed, 58 insertions(+), 16 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c > index bd9abef..740395c 100644 > --- a/drivers/gpu/drm/i915/i915_debugfs.c > +++ b/drivers/gpu/drm/i915/i915_debugfs.c > @@ -2607,27 +2607,36 @@ static int i915_guc_stage_pool(struct seq_file *m, void *data) > > static int i915_guc_log_dump(struct seq_file *m, void *data) > { > - struct drm_i915_private *dev_priv = node_to_i915(m->private); > - struct drm_i915_gem_object *obj; > - int i = 0, pg; > - > - if (!dev_priv->guc.log.vma) > - return 0; > + struct drm_info_node *node = m->private; > + struct drm_i915_private *dev_priv = node_to_i915(node); > + bool dump_load_err = !!node->info_ent->data; > + struct drm_i915_gem_object *obj = NULL; > + u32 *log; > + int i = 0; > > - obj = dev_priv->guc.log.vma->obj; > - for (pg = 0; pg < obj->base.size / PAGE_SIZE; pg++) { > - u32 *log = kmap_atomic(i915_gem_object_get_page(obj, pg)); > + if (dump_load_err) > + obj = dev_priv->guc.log.load_err; > + else if (dev_priv->guc.log.vma) > + obj = dev_priv->guc.log.vma->obj; > > - for (i = 0; i < PAGE_SIZE / sizeof(u32); i += 4) > - seq_printf(m, "0x%08x 0x%08x 0x%08x 0x%08x\n", > - *(log + i), *(log + i + 1), > - *(log + i + 2), *(log + i + 3)); > + if (!obj) > + return 0; > > - kunmap_atomic(log); > + log = i915_gem_object_pin_map(obj, I915_MAP_WC); > + if (IS_ERR(log)) { > + seq_puts(m, "Failed to pin object\n"); Hmm, quite unusual to report internal message. Maybe: DRM_DEBUG("Failed to pin object\n"); seq_puts(m, "(log data unaccessible)\n"); > + return PTR_ERR(log); > } > > + for (i = 0; i < obj->base.size / sizeof(u32); i += 4) > + seq_printf(m, "0x%08x 0x%08x 0x%08x 0x%08x\n", > + *(log + i), *(log + i + 1), > + *(log + i + 2), *(log + i + 3)); > + > seq_putc(m, '\n'); > > + i915_gem_object_unpin_map(obj); > + > return 0; > } > > @@ -4811,6 +4820,7 @@ static int i915_hpd_storm_ctl_open(struct inode *inode, struct file *file) > {"i915_guc_info", i915_guc_info, 0}, > {"i915_guc_load_status", i915_guc_load_status_info, 0}, > {"i915_guc_log_dump", i915_guc_log_dump, 0}, > + {"i915_guc_load_err_log_dump", i915_guc_log_dump, 0, (void *)1}, bikeshed: quite long name, maybe "i915_guc_load_log_dump" ? > {"i915_guc_stage_pool", i915_guc_stage_pool, 0}, > {"i915_huc_load_status", i915_huc_load_status_info, 0}, > {"i915_frequency_info", i915_frequency_info, 0}, > diff --git a/drivers/gpu/drm/i915/intel_guc_log.c b/drivers/gpu/drm/i915/intel_guc_log.c > index 16d3b87..31e7bec 100644 > --- a/drivers/gpu/drm/i915/intel_guc_log.c > +++ b/drivers/gpu/drm/i915/intel_guc_log.c > @@ -660,3 +660,20 @@ void i915_guc_log_unregister(struct drm_i915_private *dev_priv) > guc_log_runtime_destroy(&dev_priv->guc); > mutex_unlock(&dev_priv->drm.struct_mutex); > } > + > +void intel_guc_log_capture_load_err(struct intel_guc *guc) > +{ > + if (!guc->log.vma || i915.guc_log_level < 0) > + return; > + > + if (!guc->log.load_err) > + guc->log.load_err = i915_gem_object_get(guc->log.vma->obj); > + > + return; > +} > + > +void intel_guc_log_free_load_err(struct intel_guc *guc) > +{ > + if (guc->log.load_err) > + i915_gem_object_put(guc->log.load_err); > +} Based on the discussion from IM, it looks that it would be better to keep guc->load_err object directly in struct intel_guc, and at the same time move both capture/free functions to guc_uc.c as static: void guc_capture_load_err_log(struct intel_guc *guc) void guc_free_load_err_log(struct intel_guc *guc) > diff --git a/drivers/gpu/drm/i915/intel_uc.c b/drivers/gpu/drm/i915/intel_uc.c > index 07c5658..dfea7d0 100644 > --- a/drivers/gpu/drm/i915/intel_uc.c > +++ b/drivers/gpu/drm/i915/intel_uc.c > @@ -309,6 +309,7 @@ int intel_uc_init_hw(struct drm_i915_private *dev_priv) > > guc_disable_communication(guc); > gen9_reset_guc_interrupts(dev_priv); > + intel_guc_log_free_load_err(guc); Hmm, this will free the log from previous failed load, but I was under impression that you want to keep it... Maybe log reset should be done as explicit .write operation ? > > /* We need to notify the guc whenever we change the GGTT */ > i915_ggtt_enable_guc(dev_priv); > @@ -355,11 +356,11 @@ int intel_uc_init_hw(struct drm_i915_private *dev_priv) > > /* Did we succeded or run out of retries? */ > if (ret) > - goto err_submission; > + goto err_log_capture; > > ret = guc_enable_communication(guc); > if (ret) > - goto err_submission; > + goto err_log_capture; > > intel_guc_auth_huc(dev_priv); > if (i915.enable_guc_submission) { > @@ -385,6 +386,8 @@ int intel_uc_init_hw(struct drm_i915_private *dev_priv) > err_interrupts: > guc_disable_communication(guc); > gen9_disable_guc_interrupts(dev_priv); > +err_log_capture: > + intel_guc_log_capture_load_err(guc); > err_submission: > if (i915.enable_guc_submission) > i915_guc_submission_fini(dev_priv); > @@ -407,6 +410,13 @@ int intel_uc_init_hw(struct drm_i915_private *dev_priv) > > void intel_uc_fini_hw(struct drm_i915_private *dev_priv) > { > + /* > + * The err load log exists if guc failed to load, so we need to check > + * for it before checking the module params as we might have updated > + * those to reflect the load failure. Hmm, but it looks that for now we only touch enable_guc_loading in sanitize functions. On load failure we just disable enable_guc_submission param (which btw is a bad decission IMHO) > + */ > + intel_guc_log_free_load_err(&dev_priv->guc); > + > if (!i915.enable_guc_loading) > return; > > diff --git a/drivers/gpu/drm/i915/intel_uc.h b/drivers/gpu/drm/i915/intel_uc.h > index 7618b71..05fa751 100644 > --- a/drivers/gpu/drm/i915/intel_uc.h > +++ b/drivers/gpu/drm/i915/intel_uc.h > @@ -176,6 +176,9 @@ struct intel_guc_log { > u32 prev_overflow_count[GUC_MAX_LOG_BUFFER]; > u32 total_overflow_count[GUC_MAX_LOG_BUFFER]; > u32 flush_count[GUC_MAX_LOG_BUFFER]; > + > + /* Log snapshot if GuC errors during load */ > + struct drm_i915_gem_object *load_err; As mentioned above, please move it back to guc struct, but keep close to the fw/log members. Thanks, Michal > }; > > struct intel_guc { > @@ -272,6 +275,8 @@ static inline void intel_guc_notify(struct intel_guc *guc) > int i915_guc_log_control(struct drm_i915_private *dev_priv, u64 control_val); > void i915_guc_log_register(struct drm_i915_private *dev_priv); > void i915_guc_log_unregister(struct drm_i915_private *dev_priv); > +void intel_guc_log_capture_load_err(struct intel_guc *guc); > +void intel_guc_log_free_load_err(struct intel_guc *guc); > > static inline u32 guc_ggtt_offset(struct i915_vma *vma) > { > -- > 1.9.1 > _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx