Re: [PATCH 1/3] amd/amdkfd: Add granularity bitmap mapped to gpu flag

Philip Yang <yangp@xxxxxxx> · Thu, 5 Oct 2023 15:28:12 -0400

On 2023-10-02 14:35, Felix Kuehling
      wrote:

    
    
      
      

      
      On 2023-09-29 10:11, Philip Yang
        wrote:

      
      
        Replace prange->mapped_to_gpu with prange->bitmap_mapped[], which is
based on prange granularity, updated when map to GPUS or unmap from
GPUs, to optimize multiple GPU map, unmap and retry fault recover.

svm_range_is_mapped is false only if no parital range mapping on any
GPUs.

Split the bitmap_mapped when unmap from cpu to split the prange.

Signed-off-by: Philip Yang <Philip.Yang@xxxxxxx>
---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 218 ++++++++++++++++++++++-----
 drivers/gpu/drm/amd/amdkfd/kfd_svm.h |   4 +-
 2 files changed, 184 insertions(+), 38 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 040dc32ad475..626e0dd4ec79 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -292,12 +292,12 @@ static void svm_range_free(struct svm_range *prange, bool do_unmap)
 					KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0);
 	}
 
-	/* free dma_addr array for each gpu */
+	/* free dma_addr array, bitmap_mapped for each gpu */
 	for (gpuidx = 0; gpuidx < MAX_GPU_INSTANCE; gpuidx++) {
-		if (prange->dma_addr[gpuidx]) {
+		if (prange->dma_addr[gpuidx])
 			kvfree(prange->dma_addr[gpuidx]);
-				prange->dma_addr[gpuidx] = NULL;
-		}
+		if (prange->bitmap_mapped[gpuidx])
+			bitmap_free(prange->bitmap_mapped[gpuidx]);
 	}
 
 	mutex_destroy(&prange->lock);
@@ -323,19 +323,38 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start,
 	uint64_t size = last - start + 1;
 	struct svm_range *prange;
 	struct kfd_process *p;
-
-	prange = kzalloc(sizeof(*prange), GFP_KERNEL);
-	if (!prange)
-		return NULL;
+	unsigned int nbits;
+	uint32_t gpuidx;
 
 	p = container_of(svms, struct kfd_process, svms);
 	if (!p->xnack_enabled && update_mem_usage &&
 	    amdgpu_amdkfd_reserve_mem_limit(NULL, size << PAGE_SHIFT,
 				    KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0)) {
 		pr_info("SVM mapping failed, exceeds resident system memory limit\n");
-		kfree(prange);
 		return NULL;
 	}
+
+	prange = kzalloc(sizeof(*prange), GFP_KERNEL);
+	if (!prange)
+		return NULL;
+
+	svm_range_set_default_attributes(&prange->preferred_loc,
+					 &prange->prefetch_loc,
+					 &prange->granularity, &prange->flags);
+
+	nbits = svm_range_mapped_nbits(size, prange->granularity);
+	pr_debug("prange 0x%p [0x%llx 0x%llx] bitmap_mapped nbits %d\n", prange,
+		 start, last, nbits);
+	for_each_set_bit(gpuidx, p->svms.bitmap_supported, p->n_pdds) {
+		prange->bitmap_mapped[gpuidx] = bitmap_zalloc(nbits, GFP_KERNEL);
+		if (!prange->bitmap_mapped[gpuidx]) {
+			while (gpuidx--)
+				bitmap_free(prange->bitmap_mapped[gpuidx]);
+			kfree(prange);
+			return NULL;
+		}
+	}
+
 	prange->npages = size;
 	prange->svms = svms;
 	prange->start = start;
@@ -354,10 +373,6 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start,
 		bitmap_copy(prange->bitmap_access, svms->bitmap_supported,
 			    MAX_GPU_INSTANCE);
 
-	svm_range_set_default_attributes(&prange->preferred_loc,
-					 &prange->prefetch_loc,
-					 &prange->granularity, &prange->flags);
-
 	pr_debug("svms 0x%p [0x%llx 0x%llx]\n", svms, start, last);
 
 	return prange;
@@ -972,6 +987,48 @@ svm_range_split_nodes(struct svm_range *new, struct svm_range *old,
 	return 0;
 }
 
+static int
+svm_range_split_bitmap_mapped(struct svm_range *new, struct svm_range *old,
+			      uint64_t start, uint64_t last)
+{
+	struct kfd_process *p = container_of(new->svms, struct kfd_process, svms);
+	unsigned int nbits, old_nbits, old_nbits2;
+	unsigned long *bits;
+	uint32_t gpuidx;
+
+	nbits = svm_range_mapped_nbits(new->npages, new->granularity);
+	old_nbits = svm_range_mapped_nbits(old->npages, old->granularity);
+	old_nbits2 = svm_range_mapped_nbits(last - start + 1, old->granularity);
      
      This may be off by one if start and last are not aligned on
        granularity boundaries. I think you need to calculate the index
        for each of start and last and subtract the indices. E.g.
        granularity = 9, start = 511, last = 512. last - start + 1 is 2
        and the division tells you you need one bit. But this range
        touches two different granules, so you need two bits.

      
    
    right, thanks, will check granularity boundary to calculate nbits.

    
       
      

      
      
        +
+	pr_debug("old 0x%p [0x%lx 0x%lx] => [0x%llx 0x%llx] nbits %d => %d\n",
+		 old, old->start, old->last, start, last, old_nbits, old_nbits2);
+	pr_debug("new 0x%p [0x%lx 0x%lx] nbits %d\n", new, new->start, new->last,
+		 nbits);
+
+	for_each_set_bit(gpuidx, p->svms.bitmap_supported, p->n_pdds) {
+		bits = bitmap_alloc(old_nbits2, GFP_KERNEL);
+		if (!bits)
+			return -ENOMEM;
+
+		if (start == old->start) {
+			bitmap_shift_right(new->bitmap_mapped[gpuidx],
+					   old->bitmap_mapped[gpuidx],
+					   old_nbits2, old_nbits);
+			bitmap_shift_right(bits, old->bitmap_mapped[gpuidx], 0,
+					   old_nbits2);
      
      Isn't this (shift = 0) the same as bitmap_copy?
    
    yes, this should use bitmap_copy.

    
      

      
      
        +		} else {
+			bitmap_copy(new->bitmap_mapped[gpuidx],
+				    old->bitmap_mapped[gpuidx], nbits);
+			bitmap_shift_right(bits, old->bitmap_mapped[gpuidx],
+					   nbits, old_nbits);
+		}
+		bitmap_free(old->bitmap_mapped[gpuidx]);
+		old->bitmap_mapped[gpuidx] = bits;
+	}
+
+	return 0;
+}
+
 /**
  * svm_range_split_adjust - split range and adjust
  *
@@ -1012,6 +1069,10 @@ svm_range_split_adjust(struct svm_range *new, struct svm_range *old,
 			return r;
 	}
 
+	r = svm_range_split_bitmap_mapped(new, old, start, last);
+	if (r)
+		return r;
+
 	old->npages = last - start + 1;
 	old->start = start;
 	old->last = last;
@@ -1020,7 +1081,6 @@ svm_range_split_adjust(struct svm_range *new, struct svm_range *old,
 	new->prefetch_loc = old->prefetch_loc;
 	new->actual_loc = old->actual_loc;
 	new->granularity = old->granularity;
-	new->mapped_to_gpu = old->mapped_to_gpu;
 	bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE);
 	bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE);
 
@@ -1310,6 +1370,84 @@ svm_range_unmap_from_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 				      fence);
 }
 
+/**
+ * svm_range_partial_mapped_dev - check if prange mapped on the specific gpu
+ *
+ * @gpuidx: the gpu to check
+ * @prange: prange to check
+ * @start: the start address in pages
+ * @last: the last address in pages
+ *
+ * Return:
+ * true: if any partial range mapped on gpu based on granularity boundary
+ * false: if the entire range not mapped
+ */
+static bool
+svm_range_partial_mapped_dev(uint32_t gpuidx, struct svm_range *prange,
+			     unsigned long start, unsigned long last)
+{
+	unsigned long index, off;
+	bool mapped = false;
+
+	start = max(start, prange->start);
+	last = min(last, prange->last);
+	if (last < start)
+		goto out;
+
+	for (off = start; off <= last; off += (1UL << prange->granularity)) {
      
      It would be easier to just iterate over indexes instead of
        offsets. And even more efficient to avoid testing individual
        bits by using a higher level bitmap function, e.g.
        bitmap_find_next_bit E.g.
      	unsigned long start_index, last_index;

	start = max(start, prange->start);
	last = min(last, prange->last);
	if (last < start)
		goto out;

	start_index = (start - prange->start) >> prange->granularity;
	last_index = (last - prange->start) >> prange->granularity;
	return find_next_bit(prange->bitmap_mapped[gpuidx], last_index + 1, start_index) <= last_index;
    
    thanks, it is better.

    
       

      
        +		index = (off - prange->start) >> prange->granularity;
+		if (test_bit(index, prange->bitmap_mapped[gpuidx])) {
+			mapped = true;
+			break;
+		}
+	}
+out:
+	pr_debug("prange 0x%p [0x%lx 0x%lx] mapped %d on gpu %d\n", prange,
+		 start, last, mapped, gpuidx);
+	return mapped;
+}
+
+static bool
+svm_range_partial_mapped(struct svm_range *prange, unsigned long start,
+			 unsigned long last)
+{
+	struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms);
+	struct svm_range *pchild;
+	uint32_t gpuidx;
+
+	for_each_set_bit(gpuidx, p->svms.bitmap_supported, p->n_pdds) {
+		list_for_each_entry(pchild, &prange->child_list, child_list) {
+			if (svm_range_partial_mapped_dev(gpuidx, pchild, start, last))
+				return true;
+		}
+
+		if (svm_range_partial_mapped_dev(gpuidx, prange, start, last))
+			return true;
+	}
+	return false;
+}
+
+static bool svm_range_is_mapped(struct svm_range *prange)
+{
+	return svm_range_partial_mapped(prange, prange->start, prange->last);
+}
+
+static void
+svm_range_update_mapped(uint32_t gpuidx, struct svm_range *prange,
+			unsigned long start, unsigned long last, bool mapped)
+{
+	unsigned long index, nbits;
+
+	index = (start - prange->start) >> prange->granularity;
+	nbits = svm_range_mapped_nbits(last - start + 1, prange->granularity);
      
      This may be off by one if start and last are not aligned on
        granularity boundaries. I think you need to calculate the index
        for each of start and last and subtract the indices.

      
    
    yes, will correct it in function svm_range_mapped_nbits
    
       
      

      
      
        +	if (mapped)
+		bitmap_set(prange->bitmap_mapped[gpuidx], index, nbits);
+	else
+		bitmap_clear(prange->bitmap_mapped[gpuidx], index, nbits);
+	pr_debug("prange 0x%p [0x%lx 0x%lx] update mapped %d nbits %ld gpu %d\n",
+		 prange, start, last, mapped, nbits, gpuidx);
+}
+
 static int
 svm_range_unmap_from_gpus(struct svm_range *prange, unsigned long start,
 			  unsigned long last, uint32_t trigger)
@@ -1321,29 +1459,28 @@ svm_range_unmap_from_gpus(struct svm_range *prange, unsigned long start,
 	uint32_t gpuidx;
 	int r = 0;
 
-	if (!prange->mapped_to_gpu) {
-		pr_debug("prange 0x%p [0x%lx 0x%lx] not mapped to GPU\n",
-			 prange, prange->start, prange->last);
-		return 0;
-	}
-
-	if (prange->start == start && prange->last == last) {
-		pr_debug("unmap svms 0x%p prange 0x%p\n", prange->svms, prange);
-		prange->mapped_to_gpu = false;
-	}
-
 	bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip,
 		  MAX_GPU_INSTANCE);
 	p = container_of(prange->svms, struct kfd_process, svms);
 
 	for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
-		pr_debug("unmap from gpu idx 0x%x\n", gpuidx);
 		pdd = kfd_process_device_from_gpuidx(p, gpuidx);
 		if (!pdd) {
 			pr_debug("failed to find device idx %d\n", gpuidx);
-			return -EINVAL;
+			continue;
+		}
+
+		if (!svm_range_partial_mapped_dev(gpuidx, prange, start, last)) {
+			pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] not mapped on gpu %d\n",
+				 prange->svms, prange, start, last, gpuidx);
+			continue;
 		}
 
+		svm_range_update_mapped(gpuidx, prange, start, last, false);
+
+		pr_debug("unmap svms 0x%p prange 0x%p [0x%lx 0x%lx] from gpu %d\n",
+			 prange->svms, prange, start, last, gpuidx);
+
 		kfd_smi_event_unmap_from_gpu(pdd->dev, p->lead_thread->pid,
 					     start, last, trigger);
 
@@ -1483,6 +1620,10 @@ svm_range_map_to_gpus(struct svm_range *prange, unsigned long offset,
 		if (r)
 			break;
 
+		if (!r)
+			svm_range_update_mapped(gpuidx, prange, prange->start + offset,
+						prange->start + offset + npages - 1, true);
+
 		if (fence) {
 			r = dma_fence_wait(fence, false);
 			dma_fence_put(fence);
@@ -1648,7 +1789,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
 
 	if (bitmap_empty(ctx->bitmap, MAX_GPU_INSTANCE)) {
 		bitmap_copy(ctx->bitmap, prange->bitmap_access, MAX_GPU_INSTANCE);
-		if (!prange->mapped_to_gpu ||
+		if (!svm_range_is_mapped(prange) ||
      
      I think this gives you the wrong answer. As I understand it,
        svm_range_is_mapped returns true, if any part of the range is
        currently mapped on any GPU. But you'd only want to skip
        validate_and_map is all of the range is currently mapped on the
        subset of GPUs you're interested in.

      
    
    Here is to skip update mapping if prange is NOT mapped or prange as
    no GPU access nor access-in-place attribute. if
    (!svm_range_is_mapped(prange)) means no any part of prange is mapped
    on any GPU, this is correct condition.

    
       
      

      
      
         		    bitmap_empty(ctx->bitmap, MAX_GPU_INSTANCE)) {
 			r = 0;
 			goto free_ctx;
@@ -1729,9 +1870,6 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
 			r = svm_range_map_to_gpus(prange, offset, npages, readonly,
 						  ctx->bitmap, flush_tlb);
 
-		if (!r && next == end)
-			prange->mapped_to_gpu = true;
-
 		svm_range_unlock(prange);
 
 		addr = next;
@@ -1900,10 +2038,10 @@ svm_range_evict(struct svm_range *prange, struct mm_struct *mm,
 	if (!p->xnack_enabled ||
 	    (prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)) {
 		int evicted_ranges;
-		bool mapped = prange->mapped_to_gpu;
+		bool mapped = svm_range_is_mapped(prange);
 
 		list_for_each_entry(pchild, &prange->child_list, child_list) {
-			if (!pchild->mapped_to_gpu)
+			if (!svm_range_is_mapped(pchild))
 				continue;
 			mapped = true;
      
      Doesn't svm_range_is_mapped already consider child ranges? So you
      don't need to set mapped here.

    
    yes, this is duplicate checking, will remove it.

    
      

      
      
         			mutex_lock_nested(&pchild->lock, 1);
@@ -1966,7 +2104,9 @@ svm_range_evict(struct svm_range *prange, struct mm_struct *mm,
 
 static struct svm_range *svm_range_clone(struct svm_range *old)
 {
+	struct kfd_process *p = container_of(old->svms, struct kfd_process, svms);
 	struct svm_range *new;
+	uint32_t gpuidx;
 
 	new = svm_range_new(old->svms, old->start, old->last, false);
 	if (!new)
@@ -1988,7 +2128,11 @@ static struct svm_range *svm_range_clone(struct svm_range *old)
 	new->prefetch_loc = old->prefetch_loc;
 	new->actual_loc = old->actual_loc;
 	new->granularity = old->granularity;
-	new->mapped_to_gpu = old->mapped_to_gpu;
+	for_each_set_bit(gpuidx, p->svms.bitmap_supported, p->n_pdds) {
+		bitmap_copy(new->bitmap_mapped[gpuidx], old->bitmap_mapped[gpuidx],
+			    svm_range_mapped_nbits(new->last - new->start + 1,
+						   new->granularity));
+	};
 	bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE);
 	bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE);
 
@@ -2107,7 +2251,7 @@ svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size,
 		next_start = min(node->last, last) + 1;
 
 		if (svm_range_is_same_attrs(p, prange, nattr, attrs) &&
-		    prange->mapped_to_gpu) {
+		    svm_range_is_mapped(prange)) {
      
      This is probably wrong too. I think you need a check, whether a
        specific range is completely mapped on all GPUs that need
        access.

      
    
    Will correct this retry handling, there is case that previous
      mapping may not finish on all GPUs completely.
    Regards,
    Philip

    
    
       
      Regards,

          Felix
      

      
      
         			/* nothing to do */
 		} else if (node->start < start || node->last > last) {
 			/* node intersects the update range and its attributes
@@ -3587,7 +3731,7 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
 
 		if (migrated && (!p->xnack_enabled ||
 		    (prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)) &&
-		    prange->mapped_to_gpu) {
+		    svm_range_is_mapped(prange)) {
 			pr_debug("restore_work will update mappings of GPUs\n");
 			mutex_unlock(&prange->migrate_mutex);
 			continue;
@@ -3598,7 +3742,7 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
 			continue;
 		}
 
-		flush_tlb = !migrated && update_mapping && prange->mapped_to_gpu;
+		flush_tlb = !migrated && update_mapping && svm_range_is_mapped(prange);
 
 		r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE,
 					       true, flush_tlb);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
index d2e94d8fb4be..10c92c5e23a7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
@@ -132,7 +132,7 @@ struct svm_range {
 	struct list_head		child_list;
 	DECLARE_BITMAP(bitmap_access, MAX_GPU_INSTANCE);
 	DECLARE_BITMAP(bitmap_aip, MAX_GPU_INSTANCE);
-	bool				mapped_to_gpu;
+	unsigned long			*bitmap_mapped[MAX_GPU_INSTANCE];
 };
 
 static inline void svm_range_lock(struct svm_range *prange)
@@ -163,6 +163,8 @@ static inline struct svm_range_bo *svm_range_bo_ref(struct svm_range_bo *svm_bo)
 		return NULL;
 }
 
+#define svm_range_mapped_nbits(size, granularity) DIV_ROUND_UP((size), 1UL << (granularity))
+
 int svm_range_list_init(struct kfd_process *p);
 void svm_range_list_fini(struct kfd_process *p);
 int svm_ioctl(struct kfd_process *p, enum kfd_ioctl_svm_op op, uint64_t start,