Re: [PATCH 1/1] drm/amdkfd: evict svm bo worker handle error

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Am 2022-03-14 um 10:50 schrieb Philip Yang:
Migrating vram to ram may fail to find the vma if the process is exiting
and the vma has already been removed. The evict svm bo worker now sets
prange->svm_bo to NULL, and warns about svm_bo ref count != 1, only if
migrating vram to ram succeeded.

Signed-off-by: Philip Yang <Philip.Yang@xxxxxxx>
---
  drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 27 +++++++++++++++++++-----
  drivers/gpu/drm/amd/amdkfd/kfd_svm.c     | 20 ++++++++++--------
  2 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index 7f689094be5a..c8aef2fe459b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -638,6 +638,22 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
  	return r;
  }
+/**
+ * svm_migrate_vma_to_ram - migrate range inside one vma from device to system
+ *
+ * @adev: amdgpu device to migrate from
+ * @prange: svm range structure
+ * @vma: vm_area_struct that range [start, end] belongs to
+ * @start: range start virtual address in pages
+ * @end: range end virtual address in pages
+ *
+ * Context: Process context, caller hold mmap read lock, svms lock, prange lock
+ *
+ * Return:
+ *   0 - success with all pages migrated
+ *   negative values - indicate error
+ *   positive values - partial migration, number of pages not migrated
+ */
  static long
  svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
  		       struct vm_area_struct *vma, uint64_t start, uint64_t end)
@@ -709,8 +725,6 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
  		pdd = svm_range_get_pdd_by_adev(prange, adev);
  		if (pdd)
  			WRITE_ONCE(pdd->page_out, pdd->page_out + cpages);
-
-		return upages;
  	}
  	return r ? r : upages;
  }
@@ -759,13 +773,16 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm)
  		unsigned long next;
vma = find_vma(mm, addr);
-		if (!vma || addr < vma->vm_start)
+		if (!vma || addr < vma->vm_start) {
+			pr_debug("failed to find vma for prange %p\n", prange);
+			r = -EFAULT;
  			break;
+		}
next = min(vma->vm_end, end);
  		r = svm_migrate_vma_to_ram(adev, prange, vma, addr, next);
  		if (r < 0) {
-			pr_debug("failed %ld to migrate\n", r);
+			pr_debug("failed %ld to migrate prange %p\n", r, prange);
  			break;
  		} else {
  			upages += r;
@@ -773,7 +790,7 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm)
  		addr = next;
  	}
- if (!upages) {
+	if (r >= 0 && !upages) {
  		svm_range_vram_node_free(prange);
  		prange->actual_loc = 0;
  	}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 509d915cbe69..215943424c06 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -3155,6 +3155,7 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work)
  	struct svm_range_bo *svm_bo;
  	struct kfd_process *p;
  	struct mm_struct *mm;
+	int r = 0;
svm_bo = container_of(work, struct svm_range_bo, eviction_work);
  	if (!svm_bo_ref_unless_zero(svm_bo))
@@ -3170,7 +3171,7 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work)
mmap_read_lock(mm);
  	spin_lock(&svm_bo->list_lock);
-	while (!list_empty(&svm_bo->range_list)) {
+	while (!list_empty(&svm_bo->range_list) && !r) {
  		struct svm_range *prange =
  				list_first_entry(&svm_bo->range_list,
  						struct svm_range, svm_bo_list);
@@ -3184,15 +3185,15 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work)
mutex_lock(&prange->migrate_mutex);
  		do {
-			svm_migrate_vram_to_ram(prange,
+			r = svm_migrate_vram_to_ram(prange,
  						svm_bo->eviction_fence->mm);
-		} while (prange->actual_loc && --retries);
-		WARN(prange->actual_loc, "Migration failed during eviction");
-
-		mutex_lock(&prange->lock);
-		prange->svm_bo = NULL;
-		mutex_unlock(&prange->lock);
+		} while (!r && prange->actual_loc && --retries);

I think it would still be good to have a WARN here for cases other than process termination. Is there a way to distinguish the process-termination case from the error code? Maybe we could even skip the retries in that case.

Other than that, this patch is a good improvement on the error handling.

Regards,
  Felix


+ if (!prange->actual_loc) {
+			mutex_lock(&prange->lock);
+			prange->svm_bo = NULL;
+			mutex_unlock(&prange->lock);
+		}
  		mutex_unlock(&prange->migrate_mutex);
spin_lock(&svm_bo->list_lock);
@@ -3201,10 +3202,11 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work)
  	mmap_read_unlock(mm);
dma_fence_signal(&svm_bo->eviction_fence->base);
+
  	/* This is the last reference to svm_bo, after svm_range_vram_node_free
  	 * has been called in svm_migrate_vram_to_ram
  	 */
-	WARN_ONCE(kref_read(&svm_bo->kref) != 1, "This was not the last reference\n");
+	WARN_ONCE(!r && kref_read(&svm_bo->kref) != 1, "This was not the last reference\n");
  	svm_range_bo_unref(svm_bo);
  }



[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux