On 10/10/2023 9:40 AM, Philip Yang wrote:
Remove prange validate_timestamp which is not accurate for multiple
GPUs.
Use the bitmap_mapped flag to skip the retry fault from different pages
of the same range if the range is already mapped on the specific GPU.
This should be "different pages of the same granularity range", to match the comment
added inside svm_range_restore_pages() below.
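For context, this is how I read the change: a minimal sketch contrasting the removed
time-window check with the per-GPU mapped check. The struct and helpers below are made
up purely for illustration, they are not the actual kfd_svm code.

#include <stdbool.h>
#include <stdint.h>

struct fake_range {
	uint64_t validate_timestamp;   /* old scheme: one timestamp shared by all GPUs */
	unsigned long mapped_gpus;     /* new scheme: one "mapped" bit per GPU */
};

/*
 * Old scheme: any fault arriving within a fixed window after the last
 * validate is treated as a duplicate.  With multiple GPUs, a fault from
 * GPU B can land inside the window opened by GPU A's restore and get
 * skipped even though the range is not mapped on GPU B yet.
 */
bool skip_fault_by_time(const struct fake_range *r, uint64_t now_ns)
{
	return now_ns < r->validate_timestamp + 2 * 1000 * 1000; /* 2 ms window */
}

/*
 * New scheme: skip the fault only if this particular GPU already has the
 * range mapped, independent of when any other GPU restored it.
 */
bool skip_fault_by_state(const struct fake_range *r, int gpuidx)
{
	return r->mapped_gpus & (1UL << gpuidx);
}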
Signed-off-by: Philip Yang <Philip.Yang@xxxxxxx>
Reviewed-by: Felix Kuehling <Felix.Kuehling@xxxxxxx>
---
drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 24 ++++++++----------------
drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 1 -
2 files changed, 8 insertions(+), 17 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index daa996d7039d..0ee5633c8972 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -43,10 +43,6 @@
#define AMDGPU_SVM_RANGE_RESTORE_DELAY_MS 1
-/* Long enough to ensure no retry fault comes after svm range is restored and
- * page table is updated.
- */
-#define AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING (2UL * NSEC_PER_MSEC)
#if IS_ENABLED(CONFIG_DYNAMIC_DEBUG)
#define dynamic_svm_range_dump(svms) \
_dynamic_func_call_no_desc("svm_range_dump", svm_range_debug_dump, svms)
@@ -369,7 +365,6 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start,
INIT_LIST_HEAD(&prange->deferred_list);
INIT_LIST_HEAD(&prange->child_list);
atomic_set(&prange->invalid, 0);
- prange->validate_timestamp = 0;
prange->vram_pages = 0;
mutex_init(&prange->migrate_mutex);
mutex_init(&prange->lock);
@@ -1938,8 +1933,6 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
}
svm_range_unreserve_bos(ctx);
- if (!r)
- prange->validate_timestamp = ktime_get_boottime();
free_ctx:
kfree(ctx);
@@ -3214,15 +3207,6 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
goto out_unlock_range;
}
- /* skip duplicate vm fault on different pages of same range */
- if (ktime_before(timestamp, ktime_add_ns(prange->validate_timestamp,
- AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING))) {
- pr_debug("svms 0x%p [0x%lx %lx] already restored\n",
- svms, prange->start, prange->last);
- r = 0;
- goto out_unlock_range;
- }
-
/* __do_munmap removed VMA, return success as we are handling stale
* retry fault.
*/
@@ -3248,6 +3232,14 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
goto out_unlock_range;
}
+ /* skip duplicate vm fault on different pages of same granularity range */
+ if (svm_range_partial_mapped_dev(gpuidx, prange, addr, addr)) {
Should this use svm_range_complete_mapped() on gpuidx instead of
svm_range_partial_mapped_dev()?
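To make the question concrete, a toy sketch of the distinction I mean. I am assuming
"partial" means some page of the interval is mapped on that GPU and "complete" means
every page is; the real svm_range_partial_mapped_dev()/svm_range_complete_mapped()
helpers may well behave differently.

#include <stdbool.h>

/* pages[i] is true when page i of the interval is mapped on the GPU of interest */
bool any_page_mapped(const bool *pages, unsigned long start, unsigned long last)
{
	for (unsigned long i = start; i <= last; i++)
		if (pages[i])
			return true;
	return false;
}

bool all_pages_mapped(const bool *pages, unsigned long start, unsigned long last)
{
	for (unsigned long i = start; i <= last; i++)
		if (!pages[i])
			return false;
	return true;
}

If I read the call right, passing addr for both ends makes the two agree for that one
page, so the difference only shows up when the whole granularity range is considered.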
Regards
Xiaogang
+ pr_debug("svms 0x%p [0x%lx %lx] addr 0x%llx already mapped on gpu %d\n",
+ svms, prange->start, prange->last, addr, gpuidx);
+ r = 0;
+ goto out_unlock_range;
+ }
+
pr_debug("svms %p [0x%lx 0x%lx] best restore 0x%x, actual loc 0x%x\n",
svms, prange->start, prange->last, best_loc,
prange->actual_loc);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
index 7e165854bc0e..7a12be42cf16 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
@@ -129,7 +129,6 @@ struct svm_range {
uint32_t actual_loc;
uint8_t granularity;
atomic_t invalid;
- ktime_t validate_timestamp;
struct mmu_interval_notifier notifier;
struct svm_work_list_item work_item;
struct list_head deferred_list;
--
2.35.1