Am 2020-08-28 um 1:53 p.m. schrieb Mukul Joshi: > Add support for reporting GPU reset events through SMI. KFD > would report both pre and post GPU reset events. > > Signed-off-by: Mukul Joshi <mukul.joshi@xxxxxxx> Minor coding-style nit-picks inline. With those fixed, this patch is Reviewed-by: Felix Kuehling <Felix.Kuehling@xxxxxxx> And I noticed a coding style issue from a previous commit that could be cleaned up in a follow up. See inline. > --- > drivers/gpu/drm/amd/amdkfd/kfd_device.c | 5 +++ > drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 ++ > drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 35 +++++++++++++++++++-- > drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 1 + > include/uapi/linux/kfd_ioctl.h | 2 ++ > 5 files changed, 43 insertions(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c > index e1cd6599529f..f5e1b3aaa10c 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c > @@ -812,6 +812,8 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd) > if (!kfd->init_complete) > return 0; > > + kfd_smi_event_update_gpu_reset(kfd, false); > + > kfd->dqm->ops.pre_reset(kfd->dqm); > > kgd2kfd_suspend(kfd, false); > @@ -833,6 +835,7 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd) > if (!kfd->init_complete) > return 0; > > + Unnecessary white-space change. > ret = kfd_resume(kfd); > if (ret) > return ret; > @@ -840,6 +843,8 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd) > > atomic_set(&kfd->sram_ecc_flag, 0); > > + kfd_smi_event_update_gpu_reset(kfd, true); > + > return 0; > } > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > index 18bc711f97ae..7e8767934748 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > @@ -312,6 +312,8 @@ struct kfd_dev { > /* Clients watching SMI events */ > struct list_head smi_clients; > spinlock_t smi_lock; > + > + uint32_t reset_seq_num; > }; > > enum kfd_mempool { > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c > index 4d4b6e3ab697..001cacb09467 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c > @@ -174,6 +174,37 @@ static void add_event_to_kfifo(struct kfd_dev *dev, unsigned int smi_event, > rcu_read_unlock(); > } > > +void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset) > +{ > + /* > + * GpuReset msg = Reset seq number (incremented for > + * every reset message sent before GPU reset). > + * 1 byte event + 1 byte space + 8 bytes seq num + > + * 1 byte \n + 1 byte \0 = 12 > + */ > + char fifo_in[12]; > + int len; > + unsigned int event; > + > + if (list_empty(&dev->smi_clients)) { > + return; > + } You don't need {} for a single statement. > + > + memset(fifo_in, 0x0, sizeof(fifo_in)); > + > + if (post_reset) { The {} are OK (and recommended) here for consistency with the else-branch. > + event = KFD_SMI_EVENT_GPU_POST_RESET; > + } else { > + event = KFD_SMI_EVENT_GPU_PRE_RESET; > + ++(dev->reset_seq_num); > + } > + > + len = snprintf(fifo_in, sizeof(fifo_in), "%x %x\n", event, > + dev->reset_seq_num); > + > + add_event_to_kfifo(dev, event, fifo_in, len); > +} > + > void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev, > uint32_t throttle_bitmask) > { > @@ -191,7 +222,7 @@ void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev, > if (list_empty(&dev->smi_clients)) > return; > > - len = snprintf(fifo_in, 29, "%x %x:%llx\n", > + len = snprintf(fifo_in, sizeof(fifo_in), "%x %x:%llx\n", > KFD_SMI_EVENT_THERMAL_THROTTLE, throttle_bitmask, > atomic64_read(&adev->smu.throttle_int_counter)); > > @@ -218,7 +249,7 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid) > if (!task_info.pid) > return; > > - len = snprintf(fifo_in, 29, "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT, > + len = snprintf(fifo_in, sizeof(fifo_in), "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT, > task_info.pid, task_info.task_name); > > add_event_to_kfifo(dev, KFD_SMI_EVENT_VMFAULT, fifo_in, len); > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h > index 15537b2cccb5..b9b0438202e2 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h > @@ -27,5 +27,6 @@ int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd); > void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid); > void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev, > uint32_t throttle_bitmask); > +void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset); > > #endif > diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h > index cb1f963a84e0..8b7368bfbd84 100644 > --- a/include/uapi/linux/kfd_ioctl.h > +++ b/include/uapi/linux/kfd_ioctl.h > @@ -453,6 +453,8 @@ enum kfd_smi_event { > KFD_SMI_EVENT_NONE = 0, /* not used */ > KFD_SMI_EVENT_VMFAULT = 1, /* event start counting at 1 */ > KFD_SMI_EVENT_THERMAL_THROTTLE = 2, Looks like a previous commit didn't use TABs for indentation here. Can you make a fix for that? Thanks, Felix > + KFD_SMI_EVENT_GPU_PRE_RESET = 3, > + KFD_SMI_EVENT_GPU_POST_RESET = 4, > }; > > #define KFD_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1)) _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx