On 2022-09-26 15:40, Philip Yang wrote:
Unified memory usage with xnack off is tracked to avoid oversubscribe
system memory. When switching xnack mode from off to on, subsequent
free ranges allocated with xnack off will not unreserve memory when
xnack is on, cause memory accounting unbalanced.
Here you describe half the problem.
When switching xnack mode from on to off, need reserve already allocated
svm range memory because subsequent free ranges will unreserve memory
with xnack off.
But here you describe the _other_ half of the solution. That's a bit
confusing. Maybe state the other half of the problem first and then
explain the solution that fixes both halves of the problem.
Signed-off-by: Philip Yang <Philip.Yang@xxxxxxx>
---
drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 30 ++++++++++++++++++++----
drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 20 +++++++++++++++-
drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 2 ++
3 files changed, 46 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 56f7307c21d2..938095478707 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1594,16 +1594,36 @@ static int kfd_ioctl_set_xnack_mode(struct file *filep,
if (args->xnack_enabled >= 0) {
if (!list_empty(&p->pqm.queues)) {
pr_debug("Process has user queues running\n");
- mutex_unlock(&p->mutex);
- return -EBUSY;
+ r = -EBUSY;
+ goto out_unlock;
}
- if (args->xnack_enabled && !kfd_process_xnack_mode(p, true))
+
+ if (p->xnack_enabled == args->xnack_enabled)
+ goto out_unlock;
+
+ if (args->xnack_enabled && !kfd_process_xnack_mode(p, true)) {
r = -EPERM;
- else
- p->xnack_enabled = args->xnack_enabled;
+ goto out_unlock;
+ }
+
+ pr_debug("switching xnack from %d to %d\n", p->xnack_enabled,
+ args->xnack_enabled);
+
+ mutex_lock(&p->svms.lock);
It would be cleaner to do the locking inside svm_range_list_unreserve_mem.
+
+ /* Switching to XNACK on/off, unreserve/reserve memory of all
+ * svm ranges. Change xnack must be inside svms lock, to avoid
+ * race with svm_range_deferred_list_work unreserve memory.
+ */
+ p->xnack_enabled = args->xnack_enabled;
+ svm_range_list_unreserve_mem(p, p->xnack_enabled);
+
+ mutex_unlock(&p->svms.lock);
} else {
args->xnack_enabled = p->xnack_enabled;
}
+
+out_unlock:
mutex_unlock(&p->mutex);
return r;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index cf5b4005534c..5a82d5660470 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -278,7 +278,7 @@ static void svm_range_free(struct svm_range *prange, bool update_mem_usage)
svm_range_free_dma_mappings(prange);
if (update_mem_usage && !p->xnack_enabled) {
- pr_debug("unreserve mem limit: %lld\n", size);
+ pr_debug("unreserve prange 0x%p size: 0x%llx\n", prange, size);
amdgpu_amdkfd_unreserve_mem_limit(NULL, size,
KFD_IOC_ALLOC_MEM_FLAGS_USERPTR);
}
@@ -2956,6 +2956,24 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
return r;
}
+void svm_range_list_unreserve_mem(struct kfd_process *p, bool unreserve)
+{
+ struct svm_range *prange;
+ uint64_t size;
+
+ list_for_each_entry(prange, &p->svms.list, list) {
+ size = (prange->last - prange->start + 1) << PAGE_SHIFT;
+ pr_debug("svms 0x%p %s prange 0x%p size 0x%llx\n", &p->svms,
+ unreserve ? "unreserve" : "reserve", prange, size);
+ if (unreserve)
+ amdgpu_amdkfd_unreserve_mem_limit(NULL, size,
+ KFD_IOC_ALLOC_MEM_FLAGS_USERPTR);
+ else
+ amdgpu_amdkfd_reserve_mem_limit(NULL, size,
+ KFD_IOC_ALLOC_MEM_FLAGS_USERPTR);
You are assuming that this will succeed. If it fails, you will end up
with unbalanced accounting. You'll need to detect failures and roll back
any reservations when a failure happens. Then fail the entire XNACK mode
change.
+ }
+}
+
void svm_range_list_fini(struct kfd_process *p)
{
struct svm_range *prange;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
index 012c53729516..05a2135cd56e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
@@ -203,10 +203,12 @@ void svm_range_list_lock_and_flush_work(struct svm_range_list *svms, struct mm_s
void svm_range_bo_unref_async(struct svm_range_bo *svm_bo);
void svm_range_set_max_pages(struct amdgpu_device *adev);
+void svm_range_list_unreserve_mem(struct kfd_process *p, bool unreserve);
#else
struct kfd_process;
+void svm_range_list_unreserve_mem(struct kfd_process *p, bool unreserve) { }
I think it would be cleaner to put the entire kfd_ioctl_set_xnack_mode
under an #ifdef, similar to kfd_ioctl_svm.
Regards,
Felix
static inline int svm_range_list_init(struct kfd_process *p)
{