On Tue, 2025-01-28 at 10:36 -0800, Teres Alexis, Alan Previn wrote: > GuC-Err-Capture should not be storing register snapshot > nodes directly inside of the top level xe_devcoredump_snapshot > structure that it doesn't control. Furthermore, that > is not right from a driver subsystem layering perspective. > > alan:snip > diff --git a/drivers/gpu/drm/xe/xe_hw_engine.c b/drivers/gpu/drm/xe/xe_hw_engine.c > index a99e3160724b..26006d72904f 100644 > --- a/drivers/gpu/drm/xe/xe_hw_engine.c > +++ b/drivers/gpu/drm/xe/xe_hw_engine.c > @@ -25,6 +25,7 @@ > #include "xe_gt_mcr.h" > #include "xe_gt_topology.h" > #include "xe_guc_capture.h" > +#include "xe_guc_capture_snapshot_types.h" > #include "xe_hw_engine_group.h" > #include "xe_hw_fence.h" > #include "xe_irq.h" > @@ -867,22 +868,20 @@ xe_hw_engine_snapshot_capture(struct xe_hw_engine *hwe, struct xe_exec_queue *q) > return snapshot; > > if (q) { > - /* If got guc capture, set source to GuC */ > - node = xe_guc_capture_get_matching_and_lock(q); > - if (node) { > - struct xe_device *xe = gt_to_xe(hwe->gt); > - struct xe_devcoredump *coredump = &xe->devcoredump; > - > - coredump->snapshot.matched_node = node; > - xe_gt_dbg(hwe->gt, "Found and locked GuC-err-capture node"); > - return snapshot; > + /* First, retrieve the manual GuC-Error-Capture node if it exists */ > + node = xe_guc_capture_get_matching_and_lock(q, XE_ENGINE_CAPTURE_SOURCE_MANUAL); > + /* Find preferred node type sourced from firmware if available */ > + snapshot->matched_node = xe_guc_capture_get_matching_and_lock(q, XE_ENGINE_CAPTURE_SOURCE_GUC); > + if (!snapshot->matched_node) { > + xe_gt_dbg(hwe->gt, "No fw sourced GuC-Err-Capture for queue %s", q->name); > + snapshot->matched_node = node; > + } else if (node) { > + xe_guc_capture_put_matched_nodes(&hwe->gt->uc.guc, node); > } > + if (!snapshot->matched_node) > + xe_gt_warn(hwe->gt, "Can't retrieve any GuC-Err-Capture node"); alan: a couple of the CI full-test failures were caused by this. 
It turns out that we have other code paths that can attempt to generate an xe_devcoredump without being triggered by a timed-out-job event. John Harrison gave feedback that such cases are still valid, so this should be an xe_gt_dbg, not an xe_gt_warn. Additionally, we agreed that there is value in reporting such cases in the dump file. So, instead of only distinguishing "GuC source" vs "Manual source" engine dumps, we could add further differentiation: GuC-src vs Manual-early vs Manual-late. I will add that in the next rev. > } > > - /* otherwise, do manual capture */ > - xe_engine_manual_capture(hwe, snapshot); > - xe_gt_dbg(hwe->gt, "Proceeding with manual engine snapshot"); > - > return snapshot; > } > > @@ -900,12 +899,7 @@ void xe_hw_engine_snapshot_free(struct xe_hw_engine_snapshot *snapshot) > return; > > gt = snapshot->hwe->gt; > - /* > - * xe_guc_capture_put_matched_nodes is called here and from > - * xe_devcoredump_snapshot_free, to cover the 2 calling paths > - * of hw_engines - debugfs and devcoredump free. 
> - */ > - xe_guc_capture_put_matched_nodes(&gt->uc.guc); > + xe_guc_capture_put_matched_nodes(&gt->uc.guc, snapshot->matched_node); > > kfree(snapshot->name); > kfree(snapshot); > diff --git a/drivers/gpu/drm/xe/xe_hw_engine_types.h b/drivers/gpu/drm/xe/xe_hw_engine_types.h > index de69e2628f2f..de1f82c11bcf 100644 > --- a/drivers/gpu/drm/xe/xe_hw_engine_types.h > +++ b/drivers/gpu/drm/xe/xe_hw_engine_types.h > @@ -152,6 +152,7 @@ struct xe_hw_engine { > struct xe_hw_engine_group *hw_engine_group; > }; > > +struct xe_guc_capture_snapshot; > /** > * struct xe_hw_engine_snapshot - Hardware engine snapshot > * > @@ -175,6 +176,13 @@ struct xe_hw_engine_snapshot { > u32 mmio_base; > /** @kernel_reserved: Engine reserved, can't be used by userspace */ > bool kernel_reserved; > + /** > + * @matched_node: GuC Capture snapshot: > + * The matched capture node for the timedout job > + * this single-node tracker works because devcoredump will always only > + * produce one hw-engine capture per devcoredump event > + */ > + struct xe_guc_capture_snapshot *matched_node; > }; > > #endif