Am 2020-07-21 um 5:01 p.m. schrieb Mukul Joshi: > Add support for reporting thermal throttling events through SMI. > Also, add a counter to count the number of throttling interrupts > observed and report the count in the SMI event message. > > Signed-off-by: Mukul Joshi <mukul.joshi@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 4 ++ > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 1 + > drivers/gpu/drm/amd/amdkfd/kfd_device.c | 7 ++ > drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 70 ++++++++++++++----- > drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 2 + > drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 1 + > drivers/gpu/drm/amd/powerplay/arcturus_ppt.c | 1 + > .../gpu/drm/amd/powerplay/inc/amdgpu_smu.h | 1 + > drivers/gpu/drm/amd/powerplay/smu_v11_0.c | 6 ++ > include/uapi/linux/kfd_ioctl.h | 1 + > 10 files changed, 77 insertions(+), 17 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c > index 1b865fed74ca..19e4658756d8 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c > @@ -755,4 +755,8 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) > void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd) > { > } > + > +void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask) > +{ > +} > #endif > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > index 3f2b695cf19e..e8b0258aae24 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > @@ -269,5 +269,6 @@ int kgd2kfd_resume_mm(struct mm_struct *mm); > int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, > struct dma_fence *fence); > void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd); > +void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask); > > #endif /* AMDGPU_AMDKFD_H_INCLUDED */ > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c > index 4bfedaab183f..d5e790f046b4 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c > @@ -29,6 +29,7 @@ > #include "cwsr_trap_handler.h" > #include "kfd_iommu.h" > #include "amdgpu_amdkfd.h" > +#include "kfd_smi_events.h" > > #define MQD_SIZE_ALIGNED 768 > > @@ -1245,6 +1246,12 @@ void kfd_dec_compute_active(struct kfd_dev *kfd) > WARN_ONCE(count < 0, "Compute profile ref. count error"); > } > > +void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask) > +{ > + if (kfd) > + kfd_smi_event_update_thermal_throttling(kfd, throttle_bitmask); > +} > + > #if defined(CONFIG_DEBUG_FS) > > /* This function will send a package to HIQ to hang the HWS > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c > index 7b348bf9df21..247538bccba2 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c > @@ -24,6 +24,7 @@ > #include <linux/wait.h> > #include <linux/anon_inodes.h> > #include <uapi/linux/kfd_ioctl.h> > +#include "amdgpu.h" > #include "amdgpu_vm.h" > #include "kfd_priv.h" > #include "kfd_smi_events.h" > @@ -148,6 +149,56 @@ static int kfd_smi_ev_release(struct inode *inode, struct file *filep) > return 0; > } > > +static int add_event_to_kfifo(struct kfd_dev *dev, long long smi_event, > + char *event_msg, int len) > +{ > + struct kfd_smi_client *client; > + int ret = 0; > + > + rcu_read_lock(); > + > + list_for_each_entry_rcu(client, &dev->smi_clients, list) { > + if (!(READ_ONCE(client->events) & smi_event)) > + continue; > + spin_lock(&client->lock); > + if (kfifo_avail(&client->fifo) >= len) { > + kfifo_in(&client->fifo, event_msg, len); > + wake_up_all(&client->wait_queue); > + } > + else "else" should be on the same line as the closing brace. Also, if one if-else branch uses braces, all branches should for readability and maintainability. > + ret = -ENOSPC; > + spin_unlock(&client->lock); > + } > + > + rcu_read_unlock(); > + > + return ret; > +} > + > +void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev, > + uint32_t throttle_bitmask) > +{ > + struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd; > + /* > + * ThermalThrottle msg = gpu_id(4):thermal_interrupt_count(4): > + * throttle_bitmask(4) > + * 16 bytes event + 1 byte space + 4 bytes gpu_id + 1 byte : + > + * 4 byte thermal_interupt_counter + 1 byte : + The interrupt counter can be more than 4 bytes. It's a 32bit counter (or maybe 64bit, see below). > + * 4 byte throttle_bitmask + 1 byte \n = 32 > + */ > + char fifo_in[32]; > + int len; > + > + if (list_empty(&dev->smi_clients)) > + return; > + > + len = snprintf(fifo_in, 32, "%x %d:%d:%x\n", KFD_SMI_EVENT_THERMAL, > + dev->id, READ_ONCE(adev->smu.throttle_int_counter), throttle_bitmask); I'd recommend using hexadecimal for everything. It's more compact and avoids mistakes parsing it in user mode. > + > + if (add_event_to_kfifo(dev, KFD_SMI_EVENT_THERMAL, fifo_in, len)) > + pr_debug("smi_event(throttle): no space left\n"); I'd move the message into add_event_to_kfifo to avoid duplicating it in multiple functions and to avoid guessing the cause of the problem (no space left). > +} > + > void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid) > { > struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd; > @@ -156,7 +207,6 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid) > /* 16 bytes event + 1 byte space + 25 bytes msg + 1 byte \n = 43 > */ > char fifo_in[43]; > - struct kfd_smi_client *client; > int len; > > if (list_empty(&dev->smi_clients)) > @@ -171,22 +221,8 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid) > len = snprintf(fifo_in, 43, "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT, > task_info.pid, task_info.task_name); > > - rcu_read_lock(); > - > - list_for_each_entry_rcu(client, &dev->smi_clients, list) { > - if (!(READ_ONCE(client->events) & KFD_SMI_EVENT_VMFAULT)) > - continue; > - spin_lock(&client->lock); > - if (kfifo_avail(&client->fifo) >= len) { > - kfifo_in(&client->fifo, fifo_in, len); > - wake_up_all(&client->wait_queue); > - } > - else > - pr_debug("smi_event(vmfault): no space left\n"); > - spin_unlock(&client->lock); > - } > - > - rcu_read_unlock(); > + if (add_event_to_kfifo(dev, KFD_SMI_EVENT_VMFAULT, fifo_in, len)) > + pr_debug("smi_event(vmfault): no space left\n"); > } > > int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd) > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h > index a9cb218fef96..15537b2cccb5 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h > @@ -25,5 +25,7 @@ > > int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd); > void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid); > +void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev, > + uint32_t throttle_bitmask); > > #endif > diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c > index b197dcaed064..26f4f28848c9 100644 > --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c > +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c > @@ -639,6 +639,7 @@ static int smu_sw_init(void *handle) > mutex_init(&smu->message_lock); > > INIT_WORK(&smu->throttling_logging_work, smu_throttling_logging_work_fn); > + smu->throttle_int_counter = 0; > smu->watermarks_bitmap = 0; > smu->power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT; > smu->default_power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT; > diff --git a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c > index 9b68760dd35b..eb3a57593f69 100644 > --- a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c > +++ b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c > @@ -2251,6 +2251,7 @@ static void arcturus_log_thermal_throttling_event(struct smu_context *smu) > > dev_warn(adev->dev, "WARN: GPU thermal throttling temperature reached, expect performance decrease. %s.\n", > log_buf); > + kgd2kfd_smi_event_throttle(smu->adev->kfd.dev, throttler_status); > } > > static const struct pptable_funcs arcturus_ppt_funcs = { > diff --git a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h > index 896b443f1ce8..0d8c70eca39d 100644 > --- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h > +++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h > @@ -446,6 +446,7 @@ struct smu_context > bool dc_controlled_by_gpio; > > struct work_struct throttling_logging_work; > + uint32_t throttle_int_counter; // Throttle interrupt counter The comment adds no useful information. The variable name is already quite descriptive. > }; > > struct i2c_adapter; > diff --git a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c > index fd82402065e6..67653dd7862d 100644 > --- a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c > +++ b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c > @@ -1311,6 +1311,12 @@ static int smu_v11_0_irq_process(struct amdgpu_device *adev, > smu_v11_0_ack_ac_dc_interrupt(&adev->smu); > break; > case 0x7: > + /* > + * Increment the throttle interrupt counter > + */ > + WRITE_ONCE(smu->throttle_int_counter, > + smu->throttle_int_counter+1); > + This is not atomic. Use atomic_t or atomic64_t if 32-bit may overflow in a reasonable time frame. Then you'd use atomic_set to initialize it, atomic_read to read it and atomic_inc_return to increment it. > if (!atomic_read(&adev->throttling_logging_enabled)) > return 0; > > diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h > index f738c3b53f4e..1e083e20c23d 100644 > --- a/include/uapi/linux/kfd_ioctl.h > +++ b/include/uapi/linux/kfd_ioctl.h > @@ -451,6 +451,7 @@ struct kfd_ioctl_import_dmabuf_args { > */ > /* Event type (defined by bitmask) */ > #define KFD_SMI_EVENT_VMFAULT 0x0000000000000001 > +#define KFD_SMI_EVENT_THERMAL 0x0000000000000002 Maybe call it THERMAL_THROTTLE explicitly. There may be other types of thermal events in the future (e.g. exceeding a critical temperature or thermal shutdown). Regards, Felix > > struct kfd_ioctl_smi_events_args { > __u32 gpuid; /* to KFD */ _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx