By using the time stamp of each page fault we no longer need to drain
page faults in the deferred work. The period during which kfd does not
handle page faults is thus reduced and can be controlled.
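
The fault handler drops a retry fault when its interrupt time stamp is
older than the per-GPU checkpoint time stamp recorded at unmap time;
amdgpu_ih_ts_after() compares the two 48-bit time stamps with
wrap-around. A standalone sketch of that comparison (the helper below
only reimplements, for illustration, what the amdgpu_ih_ts_after()
macro does; the sample values are made up):

/* Illustration of the 48-bit IH time stamp comparison this patch relies on. */
#include <stdint.h>
#include <stdio.h>

/* true if t2 is after t1, handling 48-bit wrap-around by shifting both
 * values into the top of a signed 64-bit integer before subtracting
 */
static int ih_ts_after(uint64_t t1, uint64_t t2)
{
        return ((int64_t)(t2 << 16) - (int64_t)(t1 << 16)) > 0;
}

int main(void)
{
        uint64_t checkpoint_ts = 0x0000ffffffffff00ULL; /* taken at unmap time */
        uint64_t stale_fault   = 0x0000fffffffffe00ULL; /* raised before the unmap */
        uint64_t new_fault     = 0x0000000000000100ULL; /* raised after a 48-bit wrap */

        /* checkpoint is after the stale fault -> the fault gets dropped */
        printf("drop stale fault: %d\n", ih_ts_after(stale_fault, checkpoint_ts));
        /* checkpoint is not after the new fault -> the fault gets handled */
        printf("drop new fault:   %d\n", ih_ts_after(new_fault, checkpoint_ts));
        return 0;
}
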
Signed-off-by: Xiaogang.Chen <Xiaogang.Chen@xxxxxxx>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 4 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +-
drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 3 +-
drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 4 +-
drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 5 +-
drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 102 ++++++++++++++++---------
drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 2 +-
7 files changed, 79 insertions(+), 43 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 3abfa66d72a2..d90b7ea3f020 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2763,7 +2763,7 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
* shouldn't be reported any more.
*/
bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
- u32 vmid, u32 node_id, uint64_t addr,
+ u32 vmid, u32 node_id, uint64_t addr, uint64_t ts,
bool write_fault)
{
bool is_compute_context = false;
@@ -2789,7 +2789,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
addr /= AMDGPU_GPU_PAGE_SIZE;
if (is_compute_context && !svm_range_restore_pages(adev, pasid, vmid,
- node_id, addr, write_fault)) {
+ node_id, addr, ts, write_fault)) {
amdgpu_bo_unref(&root);
return true;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 312a408b80d3..1d6a1381ede9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -548,7 +548,7 @@ amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm);
void amdgpu_vm_put_task_info(struct amdgpu_task_info *task_info);
bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
- u32 vmid, u32 node_id, uint64_t addr,
+ u32 vmid, u32 node_id, uint64_t addr, uint64_t ts,
bool write_fault);
void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index d933e19e0cf5..3596cc2ee7e5 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -132,7 +132,8 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
/* Try to handle the recoverable page faults by filling page
* tables
*/
- if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, write_fault))
+ if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr,
+ entry->timestamp, write_fault))
return 1;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 350f6b6676f1..ac08d9424feb 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -595,7 +595,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
cam_index = entry->src_data[2] & 0x3ff;
ret = amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
- addr, write_fault);
+ addr, entry->timestamp, write_fault);
WDOORBELL32(adev->irq.retry_cam_doorbell_index, cam_index);
if (ret)
return 1;
@@ -618,7 +618,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
* tables
*/
if (amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
- addr, write_fault))
+ addr, entry->timestamp, write_fault))
return 1;
}
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index c51e908f6f19..771c98e104ee 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -850,10 +850,13 @@ struct svm_range_list {
struct list_head criu_svm_metadata_list;
spinlock_t deferred_list_lock;
atomic_t evicted_ranges;
- atomic_t drain_pagefaults;
+ /* stop page fault recovery for this process */
+ atomic_t stop_pf_recovery;
struct delayed_work restore_work;
DECLARE_BITMAP(bitmap_supported, MAX_GPU_INSTANCE);
struct task_struct *faulting_task;
+ /* checkpoint ts decides if page fault recovery needs to be dropped */
+ uint64_t checkpoint_ts[MAX_GPU_INSTANCE];
};
/* Process data */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 407636a68814..fb0e883868b4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -2263,16 +2263,10 @@ static void svm_range_drain_retry_fault(struct svm_range_list *svms)
{
struct kfd_process_device *pdd;
struct kfd_process *p;
- int drain;
uint32_t i;
p = container_of(svms, struct kfd_process, svms);
-restart:
- drain = atomic_read(&svms->drain_pagefaults);
- if (!drain)
- return;
-
for_each_set_bit(i, svms->bitmap_supported, p->n_pdds) {
pdd = p->pdds[i];
if (!pdd)
@@ -2292,8 +2286,6 @@ static void svm_range_drain_retry_fault(struct svm_range_list *svms)
pr_debug("drain retry fault gpu %d svms 0x%p done\n",
i, svms);
}
- if (atomic_cmpxchg(&svms->drain_pagefaults, drain, 0) != drain)
- goto restart;
}
static void svm_range_deferred_list_work(struct work_struct *work)
@@ -2315,17 +2307,8 @@ static void svm_range_deferred_list_work(struct work_struct *work)
prange->start, prange->last, prange->work_item.op);
mm = prange->work_item.mm;
-retry:
- mmap_write_lock(mm);
- /* Checking for the need to drain retry faults must be inside
- * mmap write lock to serialize with munmap notifiers.
- */
- if (unlikely(atomic_read(&svms->drain_pagefaults))) {
- mmap_write_unlock(mm);
- svm_range_drain_retry_fault(svms);
- goto retry;
- }
+ mmap_write_lock(mm);
/* Remove from deferred_list must be inside mmap write lock, for
* two race cases:
@@ -2446,6 +2429,7 @@ svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange,
struct kfd_process *p;
unsigned long s, l;
bool unmap_parent;
+ uint32_t i;
p = kfd_lookup_process_by_mm(mm);
if (!p)
@@ -2455,11 +2439,37 @@ svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange,
pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n",
svms,
prange, prange->start, prange->last, start, last);
- /* Make sure pending page faults are drained in the deferred worker
- * before the range is freed to avoid straggler interrupts on
- * unmapped memory causing "phantom faults".
+ /* calculate time stamps that are used to decide which page faults need to
+ * be dropped or handled before unmapping pages from gpu vm
*/
- atomic_inc(&svms->drain_pagefaults);
+ for_each_set_bit(i, svms->bitmap_supported, p->n_pdds) {
+ struct kfd_process_device *pdd;
+ struct amdgpu_device *adev;
+ struct amdgpu_ih_ring *ih;
+ uint32_t checkpoint_wptr;
+
+ pdd = p->pdds[i];
+ if (!pdd)
+ continue;
+
+ adev = pdd->dev->adev;
+
+ /* check if adev->irq.ih1 is not empty */
+ ih = &adev->irq.ih1;
+ checkpoint_wptr = amdgpu_ih_get_wptr(adev, ih);
+ if (ih->rptr != checkpoint_wptr) {
+ WRITE_ONCE(svms->checkpoint_ts[i], amdgpu_ih_decode_iv_ts(adev, ih, checkpoint_wptr, -1));
+ continue;
+ }
+
+ /* check if dev->irq.ih_soft is not empty */
+ ih = &adev->irq.ih_soft;
+ checkpoint_wptr = amdgpu_ih_get_wptr(adev, ih);
+ if (ih->rptr != checkpoint_wptr) {
+ WRITE_ONCE(svms->checkpoint_ts[i], amdgpu_ih_decode_iv_ts(adev, ih, checkpoint_wptr, -1));
+ continue;
+ }
+ }
unmap_parent = start <= prange->start && last >= prange->last;
@@ -2900,7 +2910,7 @@ svm_fault_allowed(struct vm_area_struct *vma, bool write_fault)
int
svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
uint32_t vmid, uint32_t node_id,
- uint64_t addr, bool write_fault)
+ uint64_t addr, uint64_t ts, bool write_fault)
{
unsigned long start, last, size;
struct mm_struct *mm = NULL;
@@ -2910,7 +2920,7 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
ktime_t timestamp = ktime_get_boottime();
struct kfd_node *node;
int32_t best_loc;
- int32_t gpuidx = MAX_GPU_INSTANCE;
+ int32_t gpuid, gpuidx = MAX_GPU_INSTANCE;
bool write_locked = false;
struct vm_area_struct *vma;
bool migration = false;
@@ -2930,12 +2940,39 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
pr_debug("restoring svms 0x%p fault address 0x%llx\n", svms,
addr);
- if (atomic_read(&svms->drain_pagefaults)) {
- pr_debug("draining retry fault, drop fault 0x%llx\n",
addr);
+ /* kfd page fault recovery is disabled */
+ if (atomic_read(&svms->stop_pf_recovery)) {
+ pr_debug("page fault handing disabled, drop fault
0x%llx\n", addr);
r = 0;
goto out;
}
+ node = kfd_node_by_irq_ids(adev, node_id, vmid);
+ if (!node) {
+ pr_debug("kfd node does not exist node_id: %d, vmid:
%d\n", node_id,
+ vmid);
+ r = -EFAULT;
+ goto out;
+ }
+
+ if (kfd_process_gpuid_from_node(p, node, &gpuid, &gpuidx)) {
+ pr_debug("failed to get gpuid/gpuidex for node_id: %d
\n", node_id);
+ r = -EFAULT;
+ goto out;
+ }
+
+ /* check if this page fault time stamp is before svms->checkpoint_ts */
+ if (READ_ONCE(svms->checkpoint_ts[gpuidx]) != 0 &&
+ amdgpu_ih_ts_after(ts, READ_ONCE(svms->checkpoint_ts[gpuidx]))) {
+ pr_debug("draining retry fault, drop fault 0x%llx\n",
addr);
+ r = 0;
+ goto out;
+ } else
+ /* ts is after svms->checkpoint_ts now, reset svms->checkpoint_ts
+ * to zero to avoid a later ts wrap-around giving a wrong comparison
+ */
+ WRITE_ONCE(svms->checkpoint_ts[gpuidx], 0);