Re: [PATCH 2/2] drm/amdkfd: add reset cause in gpu pre-reset smi event

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Thanks for your review Lijo, I will send a patch with reset source in another places.

Regards,
Eric

On 2024-06-04 03:26, Lazar, Lijo wrote:

On 6/3/2024 11:42 PM, Eric Huang wrote:
reset cause is requested by customer as additional
info for gpu reset smi event.

v2: integerate reset sources suggested by Lijo Lazar

Signed-off-by: Eric Huang <jinhuieric.huang@xxxxxxx>
This series is
	Reviewed-by: Lijo Lazar <lijo.lazar@xxxxxxx>

I think SMI needs to get all reset cause descriptions. Are you planning
to fill reset source at other places also?

Thanks,
Lijo
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c  |  3 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h  | 10 +++++++---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +-
  drivers/gpu/drm/amd/amdkfd/kfd_device.c     |  7 ++++---
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 16 ++++++++++++++--
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h |  5 ++++-
  6 files changed, 33 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index e3738d417245..eb601b41d9d5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -133,6 +133,9 @@ static void amdgpu_amdkfd_reset_work(struct work_struct *work)
reset_context.method = AMD_RESET_METHOD_NONE;
  	reset_context.reset_req_dev = adev;
+	reset_context.src = adev->enable_mes ?
+			    AMDGPU_RESET_SRC_MES :
+			    AMDGPU_RESET_SRC_HWS;
  	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
amdgpu_device_gpu_recover(adev, NULL, &reset_context);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 1de021ebdd46..7e945a4790bb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -47,6 +47,7 @@ enum TLB_FLUSH_TYPE {
  };
struct amdgpu_device;
+struct amdgpu_reset_context;
enum kfd_mem_attachment_type {
  	KFD_MEM_ATT_SHARED,	/* Share kgd_mem->bo or another attachment's */
@@ -170,7 +171,8 @@ bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev);
bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, u32 vmid); -int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev);
+int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev,
+			    struct amdgpu_reset_context *reset_context);
int amdgpu_amdkfd_post_reset(struct amdgpu_device *adev); @@ -416,7 +418,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
  void kgd2kfd_device_exit(struct kfd_dev *kfd);
  void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm);
  int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm);
-int kgd2kfd_pre_reset(struct kfd_dev *kfd);
+int kgd2kfd_pre_reset(struct kfd_dev *kfd,
+		      struct amdgpu_reset_context *reset_context);
  int kgd2kfd_post_reset(struct kfd_dev *kfd);
  void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry);
  void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
@@ -459,7 +462,8 @@ static inline int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
  	return 0;
  }
-static inline int kgd2kfd_pre_reset(struct kfd_dev *kfd)
+static inline int kgd2kfd_pre_reset(struct kfd_dev *kfd,
+				struct amdgpu_reset_context *reset_context)
  {
  	return 0;
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 6711836054f9..4096cb3e937e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5775,7 +5775,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
cancel_delayed_work_sync(&tmp_adev->delayed_init_work); - amdgpu_amdkfd_pre_reset(tmp_adev);
+		amdgpu_amdkfd_pre_reset(tmp_adev, reset_context);
/*
  		 * Mark these ASICs to be reseted as untracked first
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index fba9b9a258a5..52be4e340fb1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -924,7 +924,8 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
  	kfree(kfd);
  }
-int kgd2kfd_pre_reset(struct kfd_dev *kfd)
+int kgd2kfd_pre_reset(struct kfd_dev *kfd,
+		      struct amdgpu_reset_context *reset_context)
  {
  	struct kfd_node *node;
  	int i;
@@ -934,7 +935,7 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
for (i = 0; i < kfd->num_nodes; i++) {
  		node = kfd->nodes[i];
-		kfd_smi_event_update_gpu_reset(node, false);
+		kfd_smi_event_update_gpu_reset(node, false, reset_context);
  		node->dqm->ops.pre_reset(node->dqm);
  	}
@@ -974,7 +975,7 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
  	for (i = 0; i < kfd->num_nodes; i++) {
  		node = kfd->nodes[i];
  		atomic_set(&node->sram_ecc_flag, 0);
-		kfd_smi_event_update_gpu_reset(node, true);
+		kfd_smi_event_update_gpu_reset(node, true, NULL);
  	}
return 0;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index 06ac835190f9..ea6a8e43bd5b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -29,6 +29,7 @@
  #include "amdgpu_vm.h"
  #include "kfd_priv.h"
  #include "kfd_smi_events.h"
+#include "amdgpu_reset.h"
struct kfd_smi_client {
  	struct list_head list;
@@ -215,9 +216,11 @@ static void kfd_smi_event_add(pid_t pid, struct kfd_node *dev,
  	add_event_to_kfifo(pid, dev, event, fifo_in, len);
  }
-void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset)
+void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset,
+				    struct amdgpu_reset_context *reset_context)
  {
  	unsigned int event;
+	char reset_cause[64];
if (post_reset) {
  		event = KFD_SMI_EVENT_GPU_POST_RESET;
@@ -225,7 +228,16 @@ void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset)
  		event = KFD_SMI_EVENT_GPU_PRE_RESET;
  		++(dev->reset_seq_num);
  	}
-	kfd_smi_event_add(0, dev, event, "%x\n", dev->reset_seq_num);
+
+	memset(reset_cause, 0, sizeof(reset_cause));
+
+	if (reset_context)
+		amdgpu_reset_get_desc(reset_context, reset_cause,
+				      sizeof(reset_cause));
+
+	kfd_smi_event_add(0, dev, event, "%x %s\n",
+			  dev->reset_seq_num,
+			  reset_cause);
  }
void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
index fa95c2dfd587..85010b8307f8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
@@ -24,11 +24,14 @@
  #ifndef KFD_SMI_EVENTS_H_INCLUDED
  #define KFD_SMI_EVENTS_H_INCLUDED
+struct amdgpu_reset_context;
+
  int kfd_smi_event_open(struct kfd_node *dev, uint32_t *fd);
  void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid);
  void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev,
  					     uint64_t throttle_bitmask);
-void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset);
+void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset,
+				    struct amdgpu_reset_context *reset_context);
  void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid,
  				    unsigned long address, bool write_fault,
  				    ktime_t ts);




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux