On 6/3/2024 11:42 PM, Eric Huang wrote: > reset cause is requested by customer as additional > info for gpu reset smi event. > > v2: integrate reset sources suggested by Lijo Lazar > > Signed-off-by: Eric Huang <jinhuieric.huang@xxxxxxx> This series is Reviewed-by: Lijo Lazar <lijo.lazar@xxxxxxx> I think SMI needs to get all reset cause descriptions. Are you planning to fill in the reset source in other places as well? Thanks, Lijo > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 3 +++ > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 10 +++++++--- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- > drivers/gpu/drm/amd/amdkfd/kfd_device.c | 7 ++++--- > drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 16 ++++++++++++++-- > drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 5 ++++- > 6 files changed, 33 insertions(+), 10 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c > index e3738d417245..eb601b41d9d5 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c > @@ -133,6 +133,9 @@ static void amdgpu_amdkfd_reset_work(struct work_struct *work) > > reset_context.method = AMD_RESET_METHOD_NONE; > reset_context.reset_req_dev = adev; > + reset_context.src = adev->enable_mes ? 
> + AMDGPU_RESET_SRC_MES : > + AMDGPU_RESET_SRC_HWS; > clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); > > amdgpu_device_gpu_recover(adev, NULL, &reset_context); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > index 1de021ebdd46..7e945a4790bb 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > @@ -47,6 +47,7 @@ enum TLB_FLUSH_TYPE { > }; > > struct amdgpu_device; > +struct amdgpu_reset_context; > > enum kfd_mem_attachment_type { > KFD_MEM_ATT_SHARED, /* Share kgd_mem->bo or another attachment's */ > @@ -170,7 +171,8 @@ bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev); > > bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, u32 vmid); > > -int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev); > +int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev, > + struct amdgpu_reset_context *reset_context); > > int amdgpu_amdkfd_post_reset(struct amdgpu_device *adev); > > @@ -416,7 +418,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, > void kgd2kfd_device_exit(struct kfd_dev *kfd); > void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm); > int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm); > -int kgd2kfd_pre_reset(struct kfd_dev *kfd); > +int kgd2kfd_pre_reset(struct kfd_dev *kfd, > + struct amdgpu_reset_context *reset_context); > int kgd2kfd_post_reset(struct kfd_dev *kfd); > void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry); > void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd); > @@ -459,7 +462,8 @@ static inline int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm) > return 0; > } > > -static inline int kgd2kfd_pre_reset(struct kfd_dev *kfd) > +static inline int kgd2kfd_pre_reset(struct kfd_dev *kfd, > + struct amdgpu_reset_context *reset_context) > { > return 0; > } > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 
6711836054f9..4096cb3e937e 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -5775,7 +5775,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, > > cancel_delayed_work_sync(&tmp_adev->delayed_init_work); > > - amdgpu_amdkfd_pre_reset(tmp_adev); > + amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); > > /* > * Mark these ASICs to be reseted as untracked first > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c > index fba9b9a258a5..52be4e340fb1 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c > @@ -924,7 +924,8 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd) > kfree(kfd); > } > > -int kgd2kfd_pre_reset(struct kfd_dev *kfd) > +int kgd2kfd_pre_reset(struct kfd_dev *kfd, > + struct amdgpu_reset_context *reset_context) > { > struct kfd_node *node; > int i; > @@ -934,7 +935,7 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd) > > for (i = 0; i < kfd->num_nodes; i++) { > node = kfd->nodes[i]; > - kfd_smi_event_update_gpu_reset(node, false); > + kfd_smi_event_update_gpu_reset(node, false, reset_context); > node->dqm->ops.pre_reset(node->dqm); > } > > @@ -974,7 +975,7 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd) > for (i = 0; i < kfd->num_nodes; i++) { > node = kfd->nodes[i]; > atomic_set(&node->sram_ecc_flag, 0); > - kfd_smi_event_update_gpu_reset(node, true); > + kfd_smi_event_update_gpu_reset(node, true, NULL); > } > > return 0; > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c > index 06ac835190f9..ea6a8e43bd5b 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c > @@ -29,6 +29,7 @@ > #include "amdgpu_vm.h" > #include "kfd_priv.h" > #include "kfd_smi_events.h" > +#include "amdgpu_reset.h" > > struct kfd_smi_client { > struct list_head list; > @@ -215,9 +216,11 @@ static void 
kfd_smi_event_add(pid_t pid, struct kfd_node *dev, > add_event_to_kfifo(pid, dev, event, fifo_in, len); > } > > -void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset) > +void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset, > + struct amdgpu_reset_context *reset_context) > { > unsigned int event; > + char reset_cause[64]; > > if (post_reset) { > event = KFD_SMI_EVENT_GPU_POST_RESET; > @@ -225,7 +228,16 @@ void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset) > event = KFD_SMI_EVENT_GPU_PRE_RESET; > ++(dev->reset_seq_num); > } > - kfd_smi_event_add(0, dev, event, "%x\n", dev->reset_seq_num); > + > + memset(reset_cause, 0, sizeof(reset_cause)); > + > + if (reset_context) > + amdgpu_reset_get_desc(reset_context, reset_cause, > + sizeof(reset_cause)); > + > + kfd_smi_event_add(0, dev, event, "%x %s\n", > + dev->reset_seq_num, > + reset_cause); > } > > void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev, > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h > index fa95c2dfd587..85010b8307f8 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h > @@ -24,11 +24,14 @@ > #ifndef KFD_SMI_EVENTS_H_INCLUDED > #define KFD_SMI_EVENTS_H_INCLUDED > > +struct amdgpu_reset_context; > + > int kfd_smi_event_open(struct kfd_node *dev, uint32_t *fd); > void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid); > void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev, > uint64_t throttle_bitmask); > -void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset); > +void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset, > + struct amdgpu_reset_context *reset_context); > void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid, > unsigned long address, bool write_fault, > ktime_t ts);