This patch adds 'ring hang' events to the driver. This is done by adding a 'reset_ring_hang' bool variable to the struct 'amdgpu_reset_context' in the amdgpu_reset.h file. The purpose for this 'reset_ring_hang' variable is whenever a GPU reset is initiated due to a ring hang, the reset_ring_hang should be set to 'true'. Additionally, the reset cause is passed into the kfd smi event as a string, and is displayed in dmesg as an error. This 'amdgpu_reset_context' struct is now also passed through across all relevant functions, and another event type "KFD_SMI_EVENT_RING_HANG" is added to the kfd_smi_event enum. Signed-off-by: Ori Messinger <Ori.Messinger@xxxxxxx> Change-Id: I6af3022eb1b4514201c9430d635ff87f167ad6f7 --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 7 +++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 9 ++++++--- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 16 ++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 ++++ drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 2 ++ drivers/gpu/drm/amd/amdkfd/kfd_device.c | 7 ++++--- drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 7 ++++++- drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 5 ++++- include/uapi/linux/kfd_ioctl.h | 15 ++++++++------- 10 files changed, 56 insertions(+), 18 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index e3738d417245..f1c6dc939cc3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -133,6 +133,9 @@ static void amdgpu_amdkfd_reset_work(struct work_struct *work) reset_context.method = AMD_RESET_METHOD_NONE; reset_context.reset_req_dev = adev; + reset_context.reset_ring_hang = true; + strscpy(reset_context.reset_cause, "hws_hang", sizeof(reset_context.reset_cause)); + DRM_ERROR("Reset cause: %s\n", reset_context.reset_cause); clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); amdgpu_device_gpu_recover(adev, NULL, &reset_context); @@ -261,12 +264,12 @@ int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool run_pm) return r; } -int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev) +int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev, struct amdgpu_reset_context *reset_context) { int r = 0; if (adev->kfd.dev) - r = kgd2kfd_pre_reset(adev->kfd.dev); + r = kgd2kfd_pre_reset(adev->kfd.dev, reset_context); return r; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 1de021ebdd46..c9030d8b8308 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -47,6 +47,7 @@ enum TLB_FLUSH_TYPE { }; struct amdgpu_device; +struct amdgpu_reset_context; enum kfd_mem_attachment_type { KFD_MEM_ATT_SHARED, /* Share kgd_mem->bo or another attachment's */ @@ -170,7 +171,8 @@ bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev); bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, u32 vmid); -int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev); +int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev, + struct amdgpu_reset_context *reset_context); int amdgpu_amdkfd_post_reset(struct amdgpu_device *adev); @@ -416,7 +418,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, void kgd2kfd_device_exit(struct kfd_dev *kfd); void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm); int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm); -int kgd2kfd_pre_reset(struct kfd_dev *kfd); +int kgd2kfd_pre_reset(struct kfd_dev *kfd, + struct amdgpu_reset_context *reset_context); int kgd2kfd_post_reset(struct kfd_dev *kfd); void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry); void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd); @@ -459,7 +462,7 @@ static inline int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm) return 0; } -static inline int kgd2kfd_pre_reset(struct kfd_dev *kfd) +static inline int kgd2kfd_pre_reset(struct kfd_dev *kfd, struct amdgpu_reset_context *reset_context) { return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 00fe3c2d5431..b18f37426b5e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5772,7 +5772,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, cancel_delayed_work_sync(&tmp_adev->delayed_init_work); - amdgpu_amdkfd_pre_reset(tmp_adev); + amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); /* * Mark these ASICs to be reseted as untracked first diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index e4742b65032d..c3e32f21aa49 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -77,6 +77,22 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) reset_context.method = AMD_RESET_METHOD_NONE; reset_context.reset_req_dev = adev; + reset_context.reset_ring_hang = true; + if (ring->name) { + /* Ensure buffer length of 64 is not exceeded during copy of ring->name */ + size_t name_length; + + for (name_length = 0; name_length < sizeof(reset_context.reset_cause) - 6 && + ring->name[name_length] != '\0'; name_length++) { + reset_context.reset_cause[name_length] = ring->name[name_length]; + } + strscpy(reset_context.reset_cause + name_length, "_hang", + sizeof(reset_context.reset_cause) - name_length); + } else { + strscpy(reset_context.reset_cause, "unknown_hang", + sizeof(reset_context.reset_cause)); + } + DRM_ERROR("Reset cause: %s\n", reset_context.reset_cause); clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 1dd13ed3b7b5..e2d65c5c17c6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2479,6 +2479,10 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) reset_context.method = AMD_RESET_METHOD_NONE; reset_context.reset_req_dev = adev; + reset_context.reset_ring_hang = true; + strscpy(reset_context.reset_cause, "ras_hang", sizeof(reset_context.reset_cause)); + DRM_ERROR("Reset cause: %s\n", reset_context.reset_cause); + /* Perform full reset in fatal error mode */ if (!amdgpu_ras_is_poison_mode_supported(ras->adev)) set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h index 5a9cc043b858..757284ab36e0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h @@ -43,6 +43,8 @@ struct amdgpu_reset_context { struct amdgpu_hive_info *hive; struct list_head *reset_device_list; unsigned long flags; + bool reset_ring_hang; + char reset_cause[64]; }; struct amdgpu_reset_handler { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 6b15e55811b6..88171f24496b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -35,6 +35,7 @@ #include "kfd_migrate.h" #include "amdgpu.h" #include "amdgpu_xcp.h" +#include "amdgpu_reset.h" #define MQD_SIZE_ALIGNED 768 @@ -931,7 +932,7 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd) kfree(kfd); } -int kgd2kfd_pre_reset(struct kfd_dev *kfd) +int kgd2kfd_pre_reset(struct kfd_dev *kfd, struct amdgpu_reset_context *reset_context) { struct kfd_node *node; int i; @@ -941,7 +942,7 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd) for (i = 0; i < kfd->num_nodes; i++) { node = kfd->nodes[i]; - kfd_smi_event_update_gpu_reset(node, false); + kfd_smi_event_update_gpu_reset(node, false, reset_context); node->dqm->ops.pre_reset(node->dqm); } @@ -981,7 +982,7 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd) for (i = 0; i < kfd->num_nodes; i++) { node = kfd->nodes[i]; atomic_set(&node->sram_ecc_flag, 0); - kfd_smi_event_update_gpu_reset(node, true); + kfd_smi_event_update_gpu_reset(node, true, NULL); } return 0; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index 06ac835190f9..3ffe4b61fe4f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -29,6 +29,7 @@ #include "amdgpu_vm.h" #include "kfd_priv.h" #include "kfd_smi_events.h" +#include "amdgpu_reset.h" struct kfd_smi_client { struct list_head list; @@ -215,7 +216,8 @@ static void kfd_smi_event_add(pid_t pid, struct kfd_node *dev, add_event_to_kfifo(pid, dev, event, fifo_in, len); } -void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset) +void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset, + struct amdgpu_reset_context *reset_context) { unsigned int event; @@ -224,6 +226,9 @@ void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset) } else { event = KFD_SMI_EVENT_GPU_PRE_RESET; ++(dev->reset_seq_num); + if (reset_context && reset_context->reset_ring_hang) + kfd_smi_event_add(0, dev, KFD_SMI_EVENT_RING_HANG, "%s\n", + reset_context->reset_cause); } kfd_smi_event_add(0, dev, event, "%x\n", dev->reset_seq_num); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h index fa95c2dfd587..85010b8307f8 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h @@ -24,11 +24,14 @@ #ifndef KFD_SMI_EVENTS_H_INCLUDED #define KFD_SMI_EVENTS_H_INCLUDED +struct amdgpu_reset_context; + int kfd_smi_event_open(struct kfd_node *dev, uint32_t *fd); void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid); void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev, uint64_t throttle_bitmask); -void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset); +void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset, + struct amdgpu_reset_context *reset_context); void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid, unsigned long address, bool write_fault, ktime_t ts); diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 285a36601dc9..7c94d2c7da13 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -519,13 +519,14 @@ enum kfd_smi_event { KFD_SMI_EVENT_THERMAL_THROTTLE = 2, KFD_SMI_EVENT_GPU_PRE_RESET = 3, KFD_SMI_EVENT_GPU_POST_RESET = 4, - KFD_SMI_EVENT_MIGRATE_START = 5, - KFD_SMI_EVENT_MIGRATE_END = 6, - KFD_SMI_EVENT_PAGE_FAULT_START = 7, - KFD_SMI_EVENT_PAGE_FAULT_END = 8, - KFD_SMI_EVENT_QUEUE_EVICTION = 9, - KFD_SMI_EVENT_QUEUE_RESTORE = 10, - KFD_SMI_EVENT_UNMAP_FROM_GPU = 11, + KFD_SMI_EVENT_RING_HANG = 5, + KFD_SMI_EVENT_MIGRATE_START = 6, + KFD_SMI_EVENT_MIGRATE_END = 7, + KFD_SMI_EVENT_PAGE_FAULT_START = 8, + KFD_SMI_EVENT_PAGE_FAULT_END = 9, + KFD_SMI_EVENT_QUEUE_EVICTION = 10, + KFD_SMI_EVENT_QUEUE_RESTORE = 11, + KFD_SMI_EVENT_UNMAP_FROM_GPU = 12, /* * max event number, as a flag bit to get events from all processes, -- 2.25.1