Re: [PATCH v4 3/6] drm/xe/xe_drm_client: Add per drm client pagefault info

Matthew Brost <matthew.brost@xxxxxxxxx> · Tue, 25 Feb 2025 13:10:33 -0800

On Thu, Feb 20, 2025 at 08:38:29PM +0000, Jonathan Cavitt wrote:
> Add additional information to drm client so it can report up to the last
> 50 exec queues to have been banned on it, as well as the last pagefault
> seen when said exec queues were banned.  Since we cannot reasonably
> associate a pagefault to a specific exec queue, we currently report the
> last seen pagefault on the associated hw engine instead.
> 
> The last pagefault seen per exec queue is saved to the hw engine, and the
> pagefault is updated during the pagefault handling process in
> xe_gt_pagefault.  The last seen pagefault is reset when the engine is
> reset because any future exec queue bans likely were not caused by said
> pagefault after the reset.
> 
> v2: Remove exec queue from blame list on destroy and recreate (Joonas)
> v3: Do not print as part of xe_drm_client_fdinfo (Joonas)
> v4: Fix formatting and kzalloc during lock warnings
> 
> Signed-off-by: Jonathan Cavitt <jonathan.cavitt@xxxxxxxxx>
> ---
>  drivers/gpu/drm/xe/xe_drm_client.c      | 68 +++++++++++++++++++++++++
>  drivers/gpu/drm/xe/xe_drm_client.h      | 42 +++++++++++++++
>  drivers/gpu/drm/xe/xe_exec_queue.c      |  7 +++
>  drivers/gpu/drm/xe/xe_gt_pagefault.c    | 17 +++++++
>  drivers/gpu/drm/xe/xe_guc_submit.c      | 15 ++++++
>  drivers/gpu/drm/xe/xe_hw_engine.c       |  4 ++
>  drivers/gpu/drm/xe/xe_hw_engine_types.h |  8 +++
>  7 files changed, 161 insertions(+)
> 
> diff --git a/drivers/gpu/drm/xe/xe_drm_client.c b/drivers/gpu/drm/xe/xe_drm_client.c
> index 2d4874d2b922..1bc978ae4c2f 100644
> --- a/drivers/gpu/drm/xe/xe_drm_client.c
> +++ b/drivers/gpu/drm/xe/xe_drm_client.c
> @@ -17,6 +17,7 @@
>  #include "xe_exec_queue.h"
>  #include "xe_force_wake.h"
>  #include "xe_gt.h"
> +#include "xe_gt_pagefault.h"
>  #include "xe_hw_engine.h"
>  #include "xe_pm.h"
>  #include "xe_trace.h"
> @@ -97,6 +98,8 @@ struct xe_drm_client *xe_drm_client_alloc(void)
>  #ifdef CONFIG_PROC_FS
>  	spin_lock_init(&client->bos_lock);
>  	INIT_LIST_HEAD(&client->bos_list);
> +	spin_lock_init(&client->blame_lock);
> +	INIT_LIST_HEAD(&client->blame_list);
>  #endif
>  	return client;
>  }
> @@ -164,6 +167,71 @@ void xe_drm_client_remove_bo(struct xe_bo *bo)
>  	xe_drm_client_put(client);
>  }
>  
> +static void free_blame(struct blame *b)
> +{
> +	list_del(&b->list);
> +	kfree(b->pf);
> +	kfree(b);
> +}
> +
> +void xe_drm_client_add_blame(struct xe_drm_client *client,
> +			     struct xe_exec_queue *q)
> +{
> +	struct blame *b = NULL;
> +	struct pagefault *pf = NULL;
> +	struct xe_file *xef = q->xef;
> +	struct xe_hw_engine *hwe = q->hwe;
> +
> +	b = kzalloc(sizeof(*b), GFP_KERNEL);
> +	xe_assert(xef->xe, b);
> +
> +	spin_lock(&client->blame_lock);
> +	list_add_tail(&b->list, &client->blame_list);
> +	client->blame_len++;
> +	/**
> +	 * Limit the number of blames in the blame list to prevent memory overuse.
> +	 */
> +	if (client->blame_len > MAX_BLAME_LEN) {
> +		struct blame *rem = list_first_entry(&client->blame_list, struct blame, list);
> +
> +		free_blame(rem);
> +		client->blame_len--;
> +	}
> +	spin_unlock(&client->blame_lock);
> +
> +	/**
> +	 * Duplicate pagefault on engine to blame, if one may have caused the
> +	 * exec queue to be ban.
> +	 */
> +	b->pf = NULL;
> +	pf = kzalloc(sizeof(*pf), GFP_KERNEL);
> +	spin_lock(&hwe->pf.lock);
> +	if (hwe->pf.info) {
> +		memcpy(pf, hwe->pf.info, sizeof(struct pagefault));
> +		b->pf = pf;
> +	} else {
> +		kfree(pf);
> +	}
> +	spin_unlock(&hwe->pf.lock);
> +
> +	/** Save blame data to list element */
> +	b->exec_queue_id = q->id;
> +}
> +
> +void xe_drm_client_remove_blame(struct xe_drm_client *client,
> +				struct xe_exec_queue *q)
> +{
> +	struct blame *b, *tmp;
> +
> +	spin_lock(&client->blame_lock);
> +	list_for_each_entry_safe(b, tmp, &client->blame_list, list)
> +		if (b->exec_queue_id == q->id) {
> +			free_blame(b);
> +			client->blame_len--;
> +		}
> +	spin_unlock(&client->blame_lock);
> +}
> +
>  static void bo_meminfo(struct xe_bo *bo,
>  		       struct drm_memory_stats stats[TTM_NUM_MEM_TYPES])
>  {
> diff --git a/drivers/gpu/drm/xe/xe_drm_client.h b/drivers/gpu/drm/xe/xe_drm_client.h
> index a9649aa36011..b3d9b279d55f 100644
> --- a/drivers/gpu/drm/xe/xe_drm_client.h
> +++ b/drivers/gpu/drm/xe/xe_drm_client.h
> @@ -13,9 +13,22 @@
>  #include <linux/sched.h>
>  #include <linux/spinlock.h>
>  
> +#define MAX_BLAME_LEN	50
> +
>  struct drm_file;
>  struct drm_printer;
> +struct pagefault;
>  struct xe_bo;
> +struct xe_exec_queue;
> +
> +struct blame {

blame is not a great name here. How about xe_exec_queue_ban_entry?

> +	/** @exec_queue_id: ID number of banned exec queue */
> +	u32 exec_queue_id;
> +	/** @pf: pagefault on engine of banned exec queue, if any at time */
> +	struct pagefault *pf;

Maybe just embedded the 'struct pagefault' within this structure. I
understand a page fault may or may not be attached here but unsure if it
worth the extra allocation / free to save a little bit of memory. I
don't have strong opinion here, so take it as a suggestion.

> +	/** @list: link into @xe_drm_client.blame_list */
> +	struct list_head list;
> +};
>  
>  struct xe_drm_client {
>  	struct kref kref;
> @@ -31,6 +44,21 @@ struct xe_drm_client {
>  	 * Protected by @bos_lock.
>  	 */
>  	struct list_head bos_list;
> +	/**
> +	 * @blame_lock: lock protecting @blame_list
> +	 */
> +	spinlock_t blame_lock;

Again blame is bad name. See my comment above and adjust the naming
appropriately.

Also I'd use a scoped structure here. e.g.

struct {
	spinlock_t lock;
	struct list_head list;
	unsigned int len;
} blame;

> +	/**
> +	 * @blame_list: list of banned exec queues associated with this drm
> +	 *		client, as well as any pagefaults at time of ban.
> +	 *
> +	 * Protected by @blame_lock;
> +	 */
> +	struct list_head blame_list;
> +	/**
> +	 * @blame_len: length of @blame_list
> +	 */
> +	unsigned int blame_len;
>  #endif
>  };
>  
> @@ -57,6 +85,10 @@ void xe_drm_client_fdinfo(struct drm_printer *p, struct drm_file *file);
>  void xe_drm_client_add_bo(struct xe_drm_client *client,
>  			  struct xe_bo *bo);
>  void xe_drm_client_remove_bo(struct xe_bo *bo);
> +void xe_drm_client_add_blame(struct xe_drm_client *client,
> +			     struct xe_exec_queue *q);
> +void xe_drm_client_remove_blame(struct xe_drm_client *client,
> +				struct xe_exec_queue *q);
>  #else
>  static inline void xe_drm_client_add_bo(struct xe_drm_client *client,
>  					struct xe_bo *bo)
> @@ -66,5 +98,15 @@ static inline void xe_drm_client_add_bo(struct xe_drm_client *client,
>  static inline void xe_drm_client_remove_bo(struct xe_bo *bo)
>  {
>  }
> +
> +static inline void xe_drm_client_add_blame(struct xe_drm_client *client,
> +					   struct xe_exec_queue *q)
> +{
> +}
> +
> +static inline void xe_drm_client_remove_blame(struct xe_drm_client *client,
> +					      struct xe_exec_queue *q)
> +{
> +}
>  #endif
>  #endif
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
> index 4a98a5d0e405..f8bcf43b2a0e 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> @@ -13,6 +13,7 @@
>  #include <uapi/drm/xe_drm.h>
>  
>  #include "xe_device.h"
> +#include "xe_drm_client.h"
>  #include "xe_gt.h"
>  #include "xe_hw_engine_class_sysfs.h"
>  #include "xe_hw_engine_group.h"
> @@ -712,6 +713,12 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
>  	q->id = id;
>  	args->exec_queue_id = id;
>  
> +	/**
> +	 * If an exec queue in the blame list shares the same exec queue
> +	 * ID, remove it from the blame list to avoid confusion.
> +	 */
> +	xe_drm_client_remove_blame(q->xef->client, q);
> +
>  	return 0;
>  
>  kill_exec_queue:
> diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> index fe18e3ec488a..b95501076569 100644
> --- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
> +++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> @@ -330,6 +330,21 @@ int xe_guc_pagefault_handler(struct xe_guc *guc, u32 *msg, u32 len)
>  	return full ? -ENOSPC : 0;
>  }
>  
> +static void save_pagefault_to_engine(struct xe_gt *gt, struct pagefault *pf)
> +{
> +	struct xe_hw_engine *hwe;
> +

This doesn't work with virtual exec queues - i.e. exec queues that have
logical mask of more than 1 bit set. In this case the q->hwe is a
pointer to a hwe in logical mask but not necessarily where it is
running - the GuC controls that. With that, the best we can do here is
record the fault per hwe class and then try to associated it with next
exec queue banned.

> +	hwe = xe_gt_hw_engine(gt, pf->engine_class, pf->engine_instance, false);
> +	if (hwe) {
> +		spin_lock(&hwe->pf.lock);
> +		/** Info initializes as NULL, so alloc if first pagefault */
> +		if (!hwe->pf.info)
> +			hwe->pf.info = kzalloc(sizeof(*pf), GFP_KERNEL);
> +		memcpy(hwe->pf.info, pf, sizeof(*pf));
> +		spin_unlock(&hwe->pf.lock);
> +	}
> +}
> +
>  #define USM_QUEUE_MAX_RUNTIME_MS	20
>  
>  static void pf_queue_work_func(struct work_struct *w)
> @@ -352,6 +367,8 @@ static void pf_queue_work_func(struct work_struct *w)
>  			drm_dbg(&xe->drm, "Fault response: Unsuccessful %d\n", ret);
>  		}
>  
> +		save_pagefault_to_engine(gt, &pf);
> +

I think we really only want to save this upon a fault failing, right?
Either the VM is not in fault more or for some reason page fault service
fails.

>  		reply.dw0 = FIELD_PREP(PFR_VALID, 1) |
>  			FIELD_PREP(PFR_SUCCESS, pf.fault_unsuccessful) |
>  			FIELD_PREP(PFR_REPLY, PFR_ACCESS) |
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> index 913c74d6e2ae..92de926bd505 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -20,11 +20,13 @@
>  #include "xe_assert.h"
>  #include "xe_devcoredump.h"
>  #include "xe_device.h"
> +#include "xe_drm_client.h"
>  #include "xe_exec_queue.h"
>  #include "xe_force_wake.h"
>  #include "xe_gpu_scheduler.h"
>  #include "xe_gt.h"
>  #include "xe_gt_clock.h"
> +#include "xe_gt_pagefault.h"
>  #include "xe_gt_printk.h"
>  #include "xe_guc.h"
>  #include "xe_guc_capture.h"
> @@ -146,6 +148,7 @@ static bool exec_queue_banned(struct xe_exec_queue *q)
>  static void set_exec_queue_banned(struct xe_exec_queue *q)
>  {
>  	atomic_or(EXEC_QUEUE_STATE_BANNED, &q->guc->state);
> +	xe_drm_client_add_blame(q->xef->client, q);

I think this is not the correct place to do this, rather in
xe_guc_exec_queue_memory_cat_error_handler. With above, when we NACK a
page fault it will immediately followed by a
XE_GUC_ACTION_NOTIFY_MEMORY_CAT_ERROR G2H. Assuming we only want to
capture bad page faults, not ones serviced normally, my suggestion here
seems like the way to go.

For an example of the above follow in IGT, run xe_exec_reset --r
cat-error:

[12725.778776] [IGT] xe_exec_reset: starting subtest cat-error
[12725.783727] xe 0000:03:00.0: [drm:pf_queue_work_func [xe]]
                ASID: 4368
                VFID: 0
                PDATA: 0x0490
                Faulted Address: 0x00000000002a0000
                FaultType: 0
                AccessType: 0
                FaultLevel: 1
                EngineClass: 0 rcs
                EngineInstance: 0
[12725.783760] xe 0000:03:00.0: [drm:pf_queue_work_func [xe]] Fault response: Unsuccessful -22
[12725.783942] xe 0000:03:00.0: [drm:xe_guc_exec_queue_memory_cat_error_handler [xe]] GT0: Engine memory cat error: engine_class=rcs, logical_mask: 0x1, guc_id=2
[12725.784528] xe 0000:03:00.0: [drm:xe_hw_engine_snapshot_capture [xe]] GT0: Proceeding with manual engine snapshot
[12725.787350] xe 0000:03:00.0: [drm] GT0: Engine reset: engine_class=rcs, logical_mask: 0x1, guc_id=2
[12725.787377] xe 0000:03:00.0: [drm] GT0: Timedout job: seqno=4294967169, lrc_seqno=4294967169, guc_id=2, flags=0x0 in xe_exec_reset [26019]
[12725.815980] xe 0000:03:00.0: [drm] Xe device coredump has been created

>  }
>  
>  static bool exec_queue_suspended(struct xe_exec_queue *q)
> @@ -1971,6 +1974,7 @@ int xe_guc_deregister_done_handler(struct xe_guc *guc, u32 *msg, u32 len)
>  int xe_guc_exec_queue_reset_handler(struct xe_guc *guc, u32 *msg, u32 len)
>  {
>  	struct xe_gt *gt = guc_to_gt(guc);
> +	struct xe_hw_engine *hwe;
>  	struct xe_exec_queue *q;
>  	u32 guc_id;
>  
> @@ -1983,11 +1987,22 @@ int xe_guc_exec_queue_reset_handler(struct xe_guc *guc, u32 *msg, u32 len)
>  	if (unlikely(!q))
>  		return -EPROTO;
>  
> +	hwe = q->hwe;
> +
>  	xe_gt_info(gt, "Engine reset: engine_class=%s, logical_mask: 0x%x, guc_id=%d",
>  		   xe_hw_engine_class_to_str(q->class), q->logical_mask, guc_id);
>  
>  	trace_xe_exec_queue_reset(q);
>  
> +	/**
> +	 * Clear last pagefault from engine.  Any future exec queue bans likely were
> +	 * not caused by said pagefault now that the engine has reset.
> +	 */
> +	spin_lock(&hwe->pf.lock);
> +	kfree(hwe->pf.info);
> +	hwe->pf.info = NULL;
> +	spin_unlock(&hwe->pf.lock);

I'd clear this cache when associated a PF with an exec queue.

Matt

> +
>  	/*
>  	 * A banned engine is a NOP at this point (came from
>  	 * guc_exec_queue_timedout_job). Otherwise, kick drm scheduler to cancel
> diff --git a/drivers/gpu/drm/xe/xe_hw_engine.c b/drivers/gpu/drm/xe/xe_hw_engine.c
> index fc447751fe78..69f61e4905e2 100644
> --- a/drivers/gpu/drm/xe/xe_hw_engine.c
> +++ b/drivers/gpu/drm/xe/xe_hw_engine.c
> @@ -21,6 +21,7 @@
>  #include "xe_gsc.h"
>  #include "xe_gt.h"
>  #include "xe_gt_ccs_mode.h"
> +#include "xe_gt_pagefault.h"
>  #include "xe_gt_printk.h"
>  #include "xe_gt_mcr.h"
>  #include "xe_gt_topology.h"
> @@ -557,6 +558,9 @@ static void hw_engine_init_early(struct xe_gt *gt, struct xe_hw_engine *hwe,
>  		hwe->eclass->defaults = hwe->eclass->sched_props;
>  	}
>  
> +	hwe->pf.info = NULL;
> +	spin_lock_init(&hwe->pf.lock);
> +
>  	xe_reg_sr_init(&hwe->reg_sr, hwe->name, gt_to_xe(gt));
>  	xe_tuning_process_engine(hwe);
>  	xe_wa_process_engine(hwe);
> diff --git a/drivers/gpu/drm/xe/xe_hw_engine_types.h b/drivers/gpu/drm/xe/xe_hw_engine_types.h
> index e4191a7a2c31..2e1be9481d9b 100644
> --- a/drivers/gpu/drm/xe/xe_hw_engine_types.h
> +++ b/drivers/gpu/drm/xe/xe_hw_engine_types.h
> @@ -64,6 +64,7 @@ enum xe_hw_engine_id {
>  struct xe_bo;
>  struct xe_execlist_port;
>  struct xe_gt;
> +struct pagefault;
>  
>  /**
>   * struct xe_hw_engine_class_intf - per hw engine class struct interface
> @@ -150,6 +151,13 @@ struct xe_hw_engine {
>  	struct xe_oa_unit *oa_unit;
>  	/** @hw_engine_group: the group of hw engines this one belongs to */
>  	struct xe_hw_engine_group *hw_engine_group;
> +	/** @pf: the last pagefault seen on this engine */
> +	struct {
> +		/** @pf.info: info containing last seen pagefault details */
> +		struct pagefault *info;
> +		/** @pf.lock: lock protecting @pf.info */
> +		spinlock_t lock;
> +	} pf;
>  };
>  
>  enum xe_hw_engine_snapshot_source_id {
> -- 
> 2.43.0
>