Re: [PATCH v5 3/11] drm/amdkfd: Add GPU recoverable fault SMI event

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 




On 2022-06-30 10:19, Felix Kuehling wrote:

Am 2022-06-28 um 10:50 schrieb Philip Yang:
Use ktime_get_boottime_ns() as the timestamp to correlate with other
APIs. Output a timestamp when recovery of a GPU recoverable fault starts
and when it ends, along with the fault address, whether it was a read or
write fault, and whether a migration happened or only the GPU page table
was updated to recover.

Signed-off-by: Philip Yang <Philip.Yang@xxxxxxx>
---
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 17 +++++++++++++++++
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h |  6 +++++-
  drivers/gpu/drm/amd/amdkfd/kfd_svm.c        | 17 +++++++++++++----
  drivers/gpu/drm/amd/amdkfd/kfd_svm.h        |  2 +-
  4 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index 55ed026435e2..b7e68283925f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -244,6 +244,23 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
                task_info.pid, task_info.task_name);
  }
  +void kfd_smi_event_page_fault_start(struct kfd_dev *dev, pid_t pid,
+                    unsigned long address, bool write_fault,
+                    ktime_t ts)
+{
+    kfd_smi_event_add(pid, dev, KFD_SMI_EVENT_PAGE_FAULT_START,
+              "%lld -%d @%lx(%x) %c\n", ktime_to_ns(ts), pid,
+              address, dev->id, write_fault ? 'W' : 'R');
+}
+
+void kfd_smi_event_page_fault_end(struct kfd_dev *dev, pid_t pid,
+                  unsigned long address, bool migration)
+{
+    kfd_smi_event_add(pid, dev, KFD_SMI_EVENT_PAGE_FAULT_END,
+              "%lld -%d @%lx(%x) %c\n", ktime_get_boottime_ns(),
+              pid, address, dev->id, migration ? 'M' : 'U');
+}
+
  int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd)
  {
      struct kfd_smi_client *client;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
index dfe101c21166..7903718cd9eb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
@@ -29,5 +29,9 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
  void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
                           uint64_t throttle_bitmask);
  void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset);
-
+void kfd_smi_event_page_fault_start(struct kfd_dev *dev, pid_t pid,
+                    unsigned long address, bool write_fault,
+                    ktime_t ts);
+void kfd_smi_event_page_fault_end(struct kfd_dev *dev, pid_t pid,
+                  unsigned long address, bool migration);
  #endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index d6fc00d51c8c..2ad08a1f38dd 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -32,6 +32,7 @@
  #include "kfd_priv.h"
  #include "kfd_svm.h"
  #include "kfd_migrate.h"
+#include "kfd_smi_events.h"
    #ifdef dev_fmt
  #undef dev_fmt
@@ -1617,7 +1618,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
      svm_range_unreserve_bos(&ctx);
        if (!r)
-        prange->validate_timestamp = ktime_to_us(ktime_get());
+        prange->validate_timestamp = ktime_get_boottime();
        return r;
  }
@@ -2694,11 +2695,12 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
      struct svm_range_list *svms;
      struct svm_range *prange;
      struct kfd_process *p;
-    uint64_t timestamp;
+    ktime_t timestamp = ktime_get_boottime();
      int32_t best_loc;
      int32_t gpuidx = MAX_GPU_INSTANCE;
      bool write_locked = false;
      struct vm_area_struct *vma;
+    bool migration = false;
      int r = 0;
        if (!KFD_IS_SVM_API_SUPPORTED(adev->kfd.dev)) {
@@ -2775,9 +2777,9 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
          goto out_unlock_range;
      }
  -    timestamp = ktime_to_us(ktime_get()) - prange->validate_timestamp;
      /* skip duplicate vm fault on different pages of same range */
-    if (timestamp < AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING) {
+    if (ktime_before(timestamp, ktime_add_ns(prange->validate_timestamp,
+                AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING))) {

You changed the timestamp units from microseconds (us) to nanoseconds (ns). I think you'll need to update AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING (multiply it by 1000) to account for that.

Thanks for catching this. The change was in v4; I removed it by mistake when updating comments.

Philip


Other than that, this patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling@xxxxxxx>


          pr_debug("svms 0x%p [0x%lx %lx] already restored\n",
               svms, prange->start, prange->last);
          r = 0;
@@ -2813,7 +2815,11 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
           svms, prange->start, prange->last, best_loc,
           prange->actual_loc);
  +    kfd_smi_event_page_fault_start(adev->kfd.dev, p->lead_thread->pid, addr,
+                       write_fault, timestamp);
+
      if (prange->actual_loc != best_loc) {
+        migration = true;
          if (best_loc) {
              r = svm_migrate_to_vram(prange, best_loc, mm);
              if (r) {
@@ -2842,6 +2848,9 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
          pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n",
               r, svms, prange->start, prange->last);
  +    kfd_smi_event_page_fault_end(adev->kfd.dev, p->lead_thread->pid, addr,
+                     migration);
+
  out_unlock_range:
      mutex_unlock(&prange->migrate_mutex);
  out_unlock_svms:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
index 2d54147b4dda..eab7f6d3b13c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
@@ -125,7 +125,7 @@ struct svm_range {
      uint32_t            actual_loc;
      uint8_t                granularity;
      atomic_t            invalid;
-    uint64_t            validate_timestamp;
+    ktime_t                validate_timestamp;
      struct mmu_interval_notifier    notifier;
      struct svm_work_list_item    work_item;
      struct list_head        deferred_list;

[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux