*/
@@ -97,6 +113,8 @@ struct xe_drm_client *xe_drm_client_alloc(void)
#ifdef CONFIG_PROC_FS
spin_lock_init(&client->bos_lock);
INIT_LIST_HEAD(&client->bos_list);
+ spin_lock_init(&client->blame_lock);
+ INIT_LIST_HEAD(&client->blame_list);
#endif
return client;
}
@@ -164,6 +182,72 @@ void xe_drm_client_remove_bo(struct xe_bo *bo)
xe_drm_client_put(client);
}
+static void free_blame(struct blame *b)
+{
+ list_del(&b->list);
+ kfree(b->pf);
+ kfree(b);
+}
+
+void xe_drm_client_add_blame(struct xe_drm_client *client,
+ struct xe_exec_queue *q)
+{
+ struct blame *b = NULL;
+ struct list_head *h;
+ struct pagefault *pf = NULL;
+ struct xe_file *xef = q->xef;
+ struct xe_hw_engine *hwe = q->hwe;
+ unsigned long count;
+
+ b = kzalloc(sizeof(struct blame), GFP_KERNEL);
+ xe_assert(xef->xe, b);
+
+ spin_lock(&client->blame_lock);
+ list_add_tail(&b->list, &client->blame_list);
+ /**
+ * Limit the number of blames in the blame list to prevent memory overuse.
+ *
+ * TODO: Parameterize max blame list size.
+ */
+ count = 0;
+ list_for_each(h, &client->blame_list)
+ count++;
+ if (count >= 50) {
+ struct blame *rem = list_first_entry(&client->blame_list, struct blame, list);
+ free_blame(rem);
+ }
+ spin_unlock(&client->blame_lock);
+
+ /**
+ * Duplicate pagefault on engine to blame, if one may have caused the
+ * exec queue to be banned.
+ */
+ b->pf = NULL;
+ spin_lock(&hwe->pf.lock);
+ if (hwe->pf.info) {
+ pf = kzalloc(sizeof(struct pagefault), GFP_KERNEL);
+ memcpy(pf, hwe->pf.info, sizeof(struct pagefault));
+ }
+ spin_unlock(&hwe->pf.lock);
+
+ /** Save blame data to list element */
+ b->exec_queue_id = q->id;
+ b->pf = pf;
+}
+
+void xe_drm_client_remove_blame(struct xe_drm_client *client,
+ struct xe_exec_queue *q)
+{
+ struct blame *b, *tmp;
+
+ spin_lock(&client->blame_lock);
+ list_for_each_entry_safe(b, tmp, &client->blame_list, list)
+ if (b->exec_queue_id == q->id)
+ free_blame(b);
+ spin_unlock(&client->blame_lock);
+
+}
+
static void bo_meminfo(struct xe_bo *bo,
struct drm_memory_stats stats[TTM_NUM_MEM_TYPES])
{
@@ -380,6 +464,49 @@ static void show_run_ticks(struct drm_printer *p, struct drm_file *file)
}
}
+static void print_pagefault(struct drm_printer *p, struct pagefault *pf)
+{
+ drm_printf(p, "\n\t\tASID: %d\n"
+ "\t\tVFID: %d\n"
+ "\t\tPDATA: 0x%04x\n"
+ "\t\tFaulted Address: 0x%08x%08x\n"
+ "\t\tFaultType: %s\n"
+ "\t\tAccessType: %s\n"
+ "\t\tFaultLevel: %d\n"
+ "\t\tEngineClass: %d %s\n"
+ "\t\tEngineInstance: %d\n",
+ pf->asid, pf->vfid, pf->pdata, upper_32_bits(pf->page_addr),
+ lower_32_bits(pf->page_addr),
+ fault_type_to_str(pf->fault_type),
+ access_type_to_str(pf->access_type),
+ pf->fault_level, pf->engine_class,
+ xe_hw_engine_class_to_str(pf->engine_class),
+ pf->engine_instance);
+}
+
+static void show_blames(struct drm_printer *p, struct drm_file *file)
+{
+ struct xe_file *xef = file->driver_priv;
+ struct xe_drm_client *client;
+ struct blame *b;
+
+ client = xef->client;
+
+ drm_printf(p, "\n");
+ drm_printf(p, "- Exec queue ban list -\n");
+ spin_lock(&client->blame_lock);
+ list_for_each_entry(b, &client->blame_list, list) {
+ struct pagefault *pf = b->pf;
+ drm_printf(p, "\n\tExec queue %u banned:\n", b->exec_queue_id);
+ drm_printf(p, "\t\tAssociated pagefault:\n");
+ if (pf)
+ print_pagefault(p, pf);
+ else
+ drm_printf(p, "\t\t- No associated pagefault -\n");
+ }
+ spin_unlock(&client->blame_lock);
+}
+
/**
* xe_drm_client_fdinfo() - Callback for fdinfo interface
* @p: The drm_printer ptr
@@ -394,5 +521,6 @@ void xe_drm_client_fdinfo(struct drm_printer *p, struct drm_file *file)
{
show_meminfo(p, file);
show_run_ticks(p, file);
+ show_blames(p, file);
}
#endif
diff --git a/drivers/gpu/drm/xe/xe_drm_client.h b/drivers/gpu/drm/xe/xe_drm_client.h
index a9649aa36011..d21fd0b90742 100644
--- a/drivers/gpu/drm/xe/xe_drm_client.h
+++ b/drivers/gpu/drm/xe/xe_drm_client.h
@@ -15,7 +15,18 @@
struct drm_file;
struct drm_printer;
+struct pagefault;
struct xe_bo;
+struct xe_exec_queue;
+
+struct blame {
+ /** @exec_queue_id: ID number of banned exec queue */
+ u32 exec_queue_id;
+ /** @pf: pagefault on engine of banned exec queue, if any at time */
+ struct pagefault *pf;
+ /** @list: link into @xe_drm_client.blame_list */
+ struct list_head list;
+};
struct xe_drm_client {
struct kref kref;
@@ -31,6 +42,17 @@ struct xe_drm_client {
* Protected by @bos_lock.
*/
struct list_head bos_list;
+ /**
+ * @blame_lock: lock protecting @blame_list
+ */
+ spinlock_t blame_lock;
+ /**
+ * @blame_list: list of banned exec queues associated with this drm
+ * client, as well as any pagefaults at time of ban.
+ *
+ * Protected by @blame_lock;
+ */
+ struct list_head blame_list;
#endif
};
@@ -57,6 +79,10 @@ void xe_drm_client_fdinfo(struct drm_printer *p, struct drm_file *file);
void xe_drm_client_add_bo(struct xe_drm_client *client,
struct xe_bo *bo);
void xe_drm_client_remove_bo(struct xe_bo *bo);
+void xe_drm_client_add_blame(struct xe_drm_client *client,
+ struct xe_exec_queue *q);
+void xe_drm_client_remove_blame(struct xe_drm_client *client,
+ struct xe_exec_queue *q);
#else
static inline void xe_drm_client_add_bo(struct xe_drm_client *client,
struct xe_bo *bo)
@@ -66,5 +92,15 @@ static inline void xe_drm_client_add_bo(struct xe_drm_client *client,
static inline void xe_drm_client_remove_bo(struct xe_bo *bo)
{
}
+
+static inline void xe_drm_client_add_blame(struct xe_drm_client *client,
+ struct xe_exec_queue *q)
+{
+}
+
+static inline void xe_drm_client_remove_blame(struct xe_drm_client *client,
+ struct xe_exec_queue *q)
+{
+}
#endif
#endif
diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
index a02e62465e01..9c9bc617020c 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue.c
+++ b/drivers/gpu/drm/xe/xe_exec_queue.c
@@ -13,6 +13,7 @@
#include <uapi/drm/xe_drm.h>
#include "xe_device.h"
+#include "xe_drm_client.h"
#include "xe_gt.h"
#include "xe_hw_engine_class_sysfs.h"
#include "xe_hw_engine_group.h"
@@ -714,6 +715,12 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
q->id = id;
args->exec_queue_id = id;
+ /**
+ * If an exec queue in the blame list shares the same exec queue
+ * ID, remove it from the blame list to avoid confusion.
+ */
+ xe_drm_client_remove_blame(q->xef->client, q);
+
return 0;
kill_exec_queue:
diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c
index fe18e3ec488a..a0e6f2281e37 100644
--- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
+++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
@@ -330,6 +330,23 @@ int xe_guc_pagefault_handler(struct xe_guc *guc, u32 *msg, u32 len)
return full ? -ENOSPC : 0;
}
+static void save_pagefault_to_engine(struct xe_gt *gt, struct pagefault *pf)
+{
+ struct xe_hw_engine *hwe;
+
+ hwe = xe_gt_hw_engine(gt, pf->engine_class, pf->engine_instance, false);
+ if (hwe) {
+ spin_lock(&hwe->pf.lock);
+ /** The latest pagefault is pf, so remove old pf info from engine */
+ if (hwe->pf.info)
+ kfree(hwe->pf.info);
+ hwe->pf.info = kzalloc(sizeof(struct pagefault), GFP_KERNEL);
+ if (hwe->pf.info)
+ memcpy(hwe->pf.info, pf, sizeof(struct pagefault));
+ spin_unlock(&hwe->pf.lock);
+ }
+}
+
#define USM_QUEUE_MAX_RUNTIME_MS 20
static void pf_queue_work_func(struct work_struct *w)
@@ -352,6 +369,8 @@ static void pf_queue_work_func(struct work_struct *w)
drm_dbg(&xe->drm, "Fault response: Unsuccessful %d\n", ret);
}
+ save_pagefault_to_engine(gt, &pf);
+
reply.dw0 = FIELD_PREP(PFR_VALID, 1) |
FIELD_PREP(PFR_SUCCESS, pf.fault_unsuccessful) |
FIELD_PREP(PFR_REPLY, PFR_ACCESS) |
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index 913c74d6e2ae..d9da5c89429e 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -20,11 +20,13 @@
#include "xe_assert.h"
#include "xe_devcoredump.h"
#include "xe_device.h"
+#include "xe_drm_client.h"
#include "xe_exec_queue.h"
#include "xe_force_wake.h"
#include "xe_gpu_scheduler.h"
#include "xe_gt.h"
#include "xe_gt_clock.h"
+#include "xe_gt_pagefault.h"
#include "xe_gt_printk.h"
#include "xe_guc.h"
#include "xe_guc_capture.h"
@@ -146,6 +148,7 @@ static bool exec_queue_banned(struct xe_exec_queue *q)
static void set_exec_queue_banned(struct xe_exec_queue *q)
{
atomic_or(EXEC_QUEUE_STATE_BANNED, &q->guc->state);
+ xe_drm_client_add_blame(q->xef->client, q);
}
static bool exec_queue_suspended(struct xe_exec_queue *q)
@@ -1971,6 +1974,7 @@ int xe_guc_deregister_done_handler(struct xe_guc *guc, u32 *msg, u32 len)
int xe_guc_exec_queue_reset_handler(struct xe_guc *guc, u32 *msg, u32 len)
{
struct xe_gt *gt = guc_to_gt(guc);
+ struct xe_hw_engine *hwe;
struct xe_exec_queue *q;
u32 guc_id;
@@ -1983,11 +1987,24 @@ int xe_guc_exec_queue_reset_handler(struct xe_guc *guc, u32 *msg, u32 len)
if (unlikely(!q))
return -EPROTO;
+ hwe = q->hwe;
+
xe_gt_info(gt, "Engine reset: engine_class=%s, logical_mask: 0x%x, guc_id=%d",
xe_hw_engine_class_to_str(q->class), q->logical_mask, guc_id);
trace_xe_exec_queue_reset(q);
+ /**
+ * Clear last pagefault from engine. Any future exec queue bans likely were
+ * not caused by said pagefault now that the engine has reset.
+ */
+ spin_lock(&hwe->pf.lock);
+ if (hwe->pf.info) {
+ kfree(hwe->pf.info);
+ hwe->pf.info = kzalloc(sizeof(struct pagefault), GFP_KERNEL);
+ }
+ spin_unlock(&hwe->pf.lock);
+
/*
* A banned engine is a NOP at this point (came from
* guc_exec_queue_timedout_job). Otherwise, kick drm scheduler to cancel
diff --git a/drivers/gpu/drm/xe/xe_hw_engine.c b/drivers/gpu/drm/xe/xe_hw_engine.c
index fc447751fe78..69f61e4905e2 100644
--- a/drivers/gpu/drm/xe/xe_hw_engine.c
+++ b/drivers/gpu/drm/xe/xe_hw_engine.c
@@ -21,6 +21,7 @@
#include "xe_gsc.h"
#include "xe_gt.h"
#include "xe_gt_ccs_mode.h"
+#include "xe_gt_pagefault.h"
#include "xe_gt_printk.h"
#include "xe_gt_mcr.h"
#include "xe_gt_topology.h"
@@ -557,6 +558,9 @@ static void hw_engine_init_early(struct xe_gt *gt, struct xe_hw_engine *hwe,
hwe->eclass->defaults = hwe->eclass->sched_props;
}
+ hwe->pf.info = NULL;
+ spin_lock_init(&hwe->pf.lock);
+
xe_reg_sr_init(&hwe->reg_sr, hwe->name, gt_to_xe(gt));
xe_tuning_process_engine(hwe);
xe_wa_process_engine(hwe);
diff --git a/drivers/gpu/drm/xe/xe_hw_engine_types.h b/drivers/gpu/drm/xe/xe_hw_engine_types.h
index e4191a7a2c31..2e1be9481d9b 100644
--- a/drivers/gpu/drm/xe/xe_hw_engine_types.h
+++ b/drivers/gpu/drm/xe/xe_hw_engine_types.h
@@ -64,6 +64,7 @@ enum xe_hw_engine_id {
struct xe_bo;
struct xe_execlist_port;
struct xe_gt;
+struct pagefault;
/**
* struct xe_hw_engine_class_intf - per hw engine class struct interface
@@ -150,6 +151,13 @@ struct xe_hw_engine {
struct xe_oa_unit *oa_unit;
/** @hw_engine_group: the group of hw engines this one belongs to */
struct xe_hw_engine_group *hw_engine_group;
+ /** @pf: the last pagefault seen on this engine */
+ struct {
+ /** @pf.info: info containing last seen pagefault details */
+ struct pagefault *info;
+ /** @pf.lock: lock protecting @pf.info */
+ spinlock_t lock;
+ } pf;
};
enum xe_hw_engine_snapshot_source_id {
--
2.43.0