Re: [PATCH v6 2/6] drm/xe/guc: Don't store capture nodes in xe_devcoredump_snapshot

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, 2025-01-28 at 10:36 -0800, Teres Alexis, Alan Previn wrote:
> GuC-Err-Capture should not be storing register snapshot
> nodes directly inside of the top level xe_devcoredump_snapshot
> structure that it doesn't control. Furthermore, that
> is not right from a driver subsystem layering perspective.
> 
> 
alan:snip

> diff --git a/drivers/gpu/drm/xe/xe_hw_engine.c b/drivers/gpu/drm/xe/xe_hw_engine.c
> index a99e3160724b..26006d72904f 100644
> --- a/drivers/gpu/drm/xe/xe_hw_engine.c
> +++ b/drivers/gpu/drm/xe/xe_hw_engine.c
> @@ -25,6 +25,7 @@
>  #include "xe_gt_mcr.h"
>  #include "xe_gt_topology.h"
>  #include "xe_guc_capture.h"
> +#include "xe_guc_capture_snapshot_types.h"
>  #include "xe_hw_engine_group.h"
>  #include "xe_hw_fence.h"
>  #include "xe_irq.h"
> @@ -867,22 +868,20 @@ xe_hw_engine_snapshot_capture(struct xe_hw_engine *hwe, struct xe_exec_queue *q)
>                 return snapshot;
>  
>         if (q) {
> -               /* If got guc capture, set source to GuC */
> -               node = xe_guc_capture_get_matching_and_lock(q);
> -               if (node) {
> -                       struct xe_device *xe = gt_to_xe(hwe->gt);
> -                       struct xe_devcoredump *coredump = &xe->devcoredump;
> -
> -                       coredump->snapshot.matched_node = node;
> -                       xe_gt_dbg(hwe->gt, "Found and locked GuC-err-capture node");
> -                       return snapshot;
> +               /* First, retrieve the manual GuC-Error-Capture node if it exists */
> +               node = xe_guc_capture_get_matching_and_lock(q, XE_ENGINE_CAPTURE_SOURCE_MANUAL);
> +               /* Find preferred node type sourced from firmware if available */
> +               snapshot->matched_node = xe_guc_capture_get_matching_and_lock(q, XE_ENGINE_CAPTURE_SOURCE_GUC);
> +               if (!snapshot->matched_node) {
> +                       xe_gt_dbg(hwe->gt, "No fw sourced GuC-Err-Capture for queue %s", q->name);
> +                       snapshot->matched_node = node;
> +               } else if (node) {
> +                       xe_guc_capture_put_matched_nodes(&hwe->gt->uc.guc, node);
>                 }
> +               if (!snapshot->matched_node)
> +                       xe_gt_warn(hwe->gt, "Can't retrieve any GuC-Err-Capture node");
alan: a couple of the CI full-test failures were caused by this. It turns out that
we have other code paths that can attempt to generate an xe_devcoredump without being triggered
by a timed-out-job event. John Harrison gave feedback that such cases are still valid, so this should
be an xe_gt_dbg, not an xe_gt_warn. Additionally, we agreed that there is value in reporting
such cases in the dump file. So, as opposed to just "GuC source" vs "Manual source" engine dumps,
we could add further differentiation: GuC-src vs Manual-early vs Manual-late.

Will add that in the next rev.


>         }
>  
> -       /* otherwise, do manual capture */
> -       xe_engine_manual_capture(hwe, snapshot);
> -       xe_gt_dbg(hwe->gt, "Proceeding with manual engine snapshot");
> -
>         return snapshot;
>  }
>  
> @@ -900,12 +899,7 @@ void xe_hw_engine_snapshot_free(struct xe_hw_engine_snapshot *snapshot)
>                 return;
>  
>         gt = snapshot->hwe->gt;
> -       /*
> -        * xe_guc_capture_put_matched_nodes is called here and from
> -        * xe_devcoredump_snapshot_free, to cover the 2 calling paths
> -        * of hw_engines - debugfs and devcoredump free.
> -        */
> -       xe_guc_capture_put_matched_nodes(&gt->uc.guc);
> +       xe_guc_capture_put_matched_nodes(&gt->uc.guc, snapshot->matched_node);
>  
>         kfree(snapshot->name);
>         kfree(snapshot);
> diff --git a/drivers/gpu/drm/xe/xe_hw_engine_types.h b/drivers/gpu/drm/xe/xe_hw_engine_types.h
> index de69e2628f2f..de1f82c11bcf 100644
> --- a/drivers/gpu/drm/xe/xe_hw_engine_types.h
> +++ b/drivers/gpu/drm/xe/xe_hw_engine_types.h
> @@ -152,6 +152,7 @@ struct xe_hw_engine {
>         struct xe_hw_engine_group *hw_engine_group;
>  };
>  
> +struct xe_guc_capture_snapshot;
>  /**
>   * struct xe_hw_engine_snapshot - Hardware engine snapshot
>   *
> @@ -175,6 +176,13 @@ struct xe_hw_engine_snapshot {
>         u32 mmio_base;
>         /** @kernel_reserved: Engine reserved, can't be used by userspace */
>         bool kernel_reserved;
> +       /**
> +        * @matched_node: GuC Capture snapshot:
> +        * The matched capture node for the timedout job
> +        * this single-node tracker works because devcoredump will always only
> +        * produce one hw-engine capture per devcoredump event
> +        */
> +       struct xe_guc_capture_snapshot *matched_node;
>  };
>  
>  #endif





[Index of Archives]     [Linux DRI Users]     [Linux Intel Graphics]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]     [XFree86]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Linux Kernel]     [Linux SCSI]     [XFree86]
  Powered by Linux