Replace prange->mapped_to_gpu with prange->bitmap_mapped[], which is based on prange granularity, updated when map to GPUS or unmap from GPUs, to optimize multiple GPU map, unmap and retry fault recover. svm_range_is_mapped is false only if no parital range mapping on any GPUs. Split the bitmap_mapped when unmap from cpu to split the prange. Signed-off-by: Philip Yang <Philip.Yang@xxxxxxx> --- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 218 ++++++++++++++++++++++----- drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 4 +- 2 files changed, 184 insertions(+), 38 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 040dc32ad475..626e0dd4ec79 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -292,12 +292,12 @@ static void svm_range_free(struct svm_range *prange, bool do_unmap) KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0); } - /* free dma_addr array for each gpu */ + /* free dma_addr array, bitmap_mapped for each gpu */ for (gpuidx = 0; gpuidx < MAX_GPU_INSTANCE; gpuidx++) { - if (prange->dma_addr[gpuidx]) { + if (prange->dma_addr[gpuidx]) kvfree(prange->dma_addr[gpuidx]); - prange->dma_addr[gpuidx] = NULL; - } + if (prange->bitmap_mapped[gpuidx]) + bitmap_free(prange->bitmap_mapped[gpuidx]); } mutex_destroy(&prange->lock); @@ -323,19 +323,38 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start, uint64_t size = last - start + 1; struct svm_range *prange; struct kfd_process *p; - - prange = kzalloc(sizeof(*prange), GFP_KERNEL); - if (!prange) - return NULL; + unsigned int nbits; + uint32_t gpuidx; p = container_of(svms, struct kfd_process, svms); if (!p->xnack_enabled && update_mem_usage && amdgpu_amdkfd_reserve_mem_limit(NULL, size << PAGE_SHIFT, KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0)) { pr_info("SVM mapping failed, exceeds resident system memory limit\n"); - kfree(prange); return NULL; } + + prange = kzalloc(sizeof(*prange), GFP_KERNEL); + if (!prange) + return NULL; + + 
svm_range_set_default_attributes(&prange->preferred_loc, + &prange->prefetch_loc, + &prange->granularity, &prange->flags); + + nbits = svm_range_mapped_nbits(size, prange->granularity); + pr_debug("prange 0x%p [0x%llx 0x%llx] bitmap_mapped nbits %d\n", prange, + start, last, nbits); + for_each_set_bit(gpuidx, p->svms.bitmap_supported, p->n_pdds) { + prange->bitmap_mapped[gpuidx] = bitmap_zalloc(nbits, GFP_KERNEL); + if (!prange->bitmap_mapped[gpuidx]) { + while (gpuidx--) + bitmap_free(prange->bitmap_mapped[gpuidx]); + kfree(prange); + return NULL; + } + } + prange->npages = size; prange->svms = svms; prange->start = start; @@ -354,10 +373,6 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start, bitmap_copy(prange->bitmap_access, svms->bitmap_supported, MAX_GPU_INSTANCE); - svm_range_set_default_attributes(&prange->preferred_loc, - &prange->prefetch_loc, - &prange->granularity, &prange->flags); - pr_debug("svms 0x%p [0x%llx 0x%llx]\n", svms, start, last); return prange; @@ -972,6 +987,48 @@ svm_range_split_nodes(struct svm_range *new, struct svm_range *old, return 0; } +static int +svm_range_split_bitmap_mapped(struct svm_range *new, struct svm_range *old, + uint64_t start, uint64_t last) +{ + struct kfd_process *p = container_of(new->svms, struct kfd_process, svms); + unsigned int nbits, old_nbits, old_nbits2; + unsigned long *bits; + uint32_t gpuidx; + + nbits = svm_range_mapped_nbits(new->npages, new->granularity); + old_nbits = svm_range_mapped_nbits(old->npages, old->granularity); + old_nbits2 = svm_range_mapped_nbits(last - start + 1, old->granularity);
This may be off by one if start and last are not aligned on
granularity boundaries. I think you need to calculate the index
for each of start and last and subtract the indices. E.g.
granularity = 9, start = 511, last = 512. last - start + 1 is 2
and the division tells you you need one bit. But this range
touches two different granules, so you need two bits.
+ + pr_debug("old 0x%p [0x%lx 0x%lx] => [0x%llx 0x%llx] nbits %d => %d\n", + old, old->start, old->last, start, last, old_nbits, old_nbits2); + pr_debug("new 0x%p [0x%lx 0x%lx] nbits %d\n", new, new->start, new->last, + nbits); + + for_each_set_bit(gpuidx, p->svms.bitmap_supported, p->n_pdds) { + bits = bitmap_alloc(old_nbits2, GFP_KERNEL); + if (!bits) + return -ENOMEM; + + if (start == old->start) { + bitmap_shift_right(new->bitmap_mapped[gpuidx], + old->bitmap_mapped[gpuidx], + old_nbits2, old_nbits); + bitmap_shift_right(bits, old->bitmap_mapped[gpuidx], 0, + old_nbits2);
Isn't this (shift = 0) the same as bitmap_copy?
+ } else { + bitmap_copy(new->bitmap_mapped[gpuidx], + old->bitmap_mapped[gpuidx], nbits); + bitmap_shift_right(bits, old->bitmap_mapped[gpuidx], + nbits, old_nbits); + } + bitmap_free(old->bitmap_mapped[gpuidx]); + old->bitmap_mapped[gpuidx] = bits; + } + + return 0; +} + /** * svm_range_split_adjust - split range and adjust * @@ -1012,6 +1069,10 @@ svm_range_split_adjust(struct svm_range *new, struct svm_range *old, return r; } + r = svm_range_split_bitmap_mapped(new, old, start, last); + if (r) + return r; + old->npages = last - start + 1; old->start = start; old->last = last; @@ -1020,7 +1081,6 @@ svm_range_split_adjust(struct svm_range *new, struct svm_range *old, new->prefetch_loc = old->prefetch_loc; new->actual_loc = old->actual_loc; new->granularity = old->granularity; - new->mapped_to_gpu = old->mapped_to_gpu; bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE); bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE); @@ -1310,6 +1370,84 @@ svm_range_unmap_from_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm, fence); } +/** + * svm_range_partial_mapped_dev - check if prange mapped on the specific gpu + * + * @gpuidx: the gpu to check + * @prange: prange to check + * @start: the start address in pages + * @last: the last address in pages + * + * Return: + * true: if any partial range mapped on gpu based on granularity boundary + * false: if the entire range not mapped + */ +static bool +svm_range_partial_mapped_dev(uint32_t gpuidx, struct svm_range *prange, + unsigned long start, unsigned long last) +{ + unsigned long index, off; + bool mapped = false; + + start = max(start, prange->start); + last = min(last, prange->last); + if (last < start) + goto out; + + for (off = start; off <= last; off += (1UL << prange->granularity)) {
It would be easier to just iterate over indexes instead of offsets. And it would be even more efficient to avoid testing individual bits by using a higher-level bitmap function such as find_next_bit(). E.g.:
	unsigned long start_index, last_index;

	start = max(start, prange->start);
	last = min(last, prange->last);
	if (last < start)
		goto out;

	start_index = (start - prange->start) >> prange->granularity;
	last_index = (last - prange->start) >> prange->granularity;

	return find_next_bit(prange->bitmap_mapped[gpuidx], last_index + 1,
			     start_index) <= last_index;
+ index = (off - prange->start) >> prange->granularity; + if (test_bit(index, prange->bitmap_mapped[gpuidx])) { + mapped = true; + break; + } + } +out: + pr_debug("prange 0x%p [0x%lx 0x%lx] mapped %d on gpu %d\n", prange, + start, last, mapped, gpuidx); + return mapped; +} + +static bool +svm_range_partial_mapped(struct svm_range *prange, unsigned long start, + unsigned long last) +{ + struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms); + struct svm_range *pchild; + uint32_t gpuidx; + + for_each_set_bit(gpuidx, p->svms.bitmap_supported, p->n_pdds) { + list_for_each_entry(pchild, &prange->child_list, child_list) { + if (svm_range_partial_mapped_dev(gpuidx, pchild, start, last)) + return true; + } + + if (svm_range_partial_mapped_dev(gpuidx, prange, start, last)) + return true; + } + return false; +} + +static bool svm_range_is_mapped(struct svm_range *prange) +{ + return svm_range_partial_mapped(prange, prange->start, prange->last); +} + +static void +svm_range_update_mapped(uint32_t gpuidx, struct svm_range *prange, + unsigned long start, unsigned long last, bool mapped) +{ + unsigned long index, nbits; + + index = (start - prange->start) >> prange->granularity; + nbits = svm_range_mapped_nbits(last - start + 1, prange->granularity);
This may be off by one if start and last are not aligned on
granularity boundaries. I think you need to calculate the index
for each of start and last and subtract the indices.
+ if (mapped) + bitmap_set(prange->bitmap_mapped[gpuidx], index, nbits); + else + bitmap_clear(prange->bitmap_mapped[gpuidx], index, nbits); + pr_debug("prange 0x%p [0x%lx 0x%lx] update mapped %d nbits %ld gpu %d\n", + prange, start, last, mapped, nbits, gpuidx); +} + static int svm_range_unmap_from_gpus(struct svm_range *prange, unsigned long start, unsigned long last, uint32_t trigger) @@ -1321,29 +1459,28 @@ svm_range_unmap_from_gpus(struct svm_range *prange, unsigned long start, uint32_t gpuidx; int r = 0; - if (!prange->mapped_to_gpu) { - pr_debug("prange 0x%p [0x%lx 0x%lx] not mapped to GPU\n", - prange, prange->start, prange->last); - return 0; - } - - if (prange->start == start && prange->last == last) { - pr_debug("unmap svms 0x%p prange 0x%p\n", prange->svms, prange); - prange->mapped_to_gpu = false; - } - bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip, MAX_GPU_INSTANCE); p = container_of(prange->svms, struct kfd_process, svms); for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) { - pr_debug("unmap from gpu idx 0x%x\n", gpuidx); pdd = kfd_process_device_from_gpuidx(p, gpuidx); if (!pdd) { pr_debug("failed to find device idx %d\n", gpuidx); - return -EINVAL; + continue; + } + + if (!svm_range_partial_mapped_dev(gpuidx, prange, start, last)) { + pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] not mapped on gpu %d\n", + prange->svms, prange, start, last, gpuidx); + continue; } + svm_range_update_mapped(gpuidx, prange, start, last, false); + + pr_debug("unmap svms 0x%p prange 0x%p [0x%lx 0x%lx] from gpu %d\n", + prange->svms, prange, start, last, gpuidx); + kfd_smi_event_unmap_from_gpu(pdd->dev, p->lead_thread->pid, start, last, trigger); @@ -1483,6 +1620,10 @@ svm_range_map_to_gpus(struct svm_range *prange, unsigned long offset, if (r) break; + if (!r) + svm_range_update_mapped(gpuidx, prange, prange->start + offset, + prange->start + offset + npages - 1, true); + if (fence) { r = dma_fence_wait(fence, false); dma_fence_put(fence); @@ -1648,7 
+1789,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm, if (bitmap_empty(ctx->bitmap, MAX_GPU_INSTANCE)) { bitmap_copy(ctx->bitmap, prange->bitmap_access, MAX_GPU_INSTANCE); - if (!prange->mapped_to_gpu || + if (!svm_range_is_mapped(prange) ||
I think this gives you the wrong answer. As I understand it,
svm_range_is_mapped returns true, if any part of the range is
currently mapped on any GPU. But you'd only want to skip
validate_and_map if all of the range is currently mapped on the
subset of GPUs you're interested in.
Doesn't svm_range_is_mapped already consider child ranges? So you don't need to set mapped here.
bitmap_empty(ctx->bitmap, MAX_GPU_INSTANCE)) { r = 0; goto free_ctx; @@ -1729,9 +1870,6 @@ static int svm_range_validate_and_map(struct mm_struct *mm, r = svm_range_map_to_gpus(prange, offset, npages, readonly, ctx->bitmap, flush_tlb); - if (!r && next == end) - prange->mapped_to_gpu = true; - svm_range_unlock(prange); addr = next; @@ -1900,10 +2038,10 @@ svm_range_evict(struct svm_range *prange, struct mm_struct *mm, if (!p->xnack_enabled || (prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)) { int evicted_ranges; - bool mapped = prange->mapped_to_gpu; + bool mapped = svm_range_is_mapped(prange); list_for_each_entry(pchild, &prange->child_list, child_list) { - if (!pchild->mapped_to_gpu) + if (!svm_range_is_mapped(pchild)) continue; mapped = true;
mutex_lock_nested(&pchild->lock, 1); @@ -1966,7 +2104,9 @@ svm_range_evict(struct svm_range *prange, struct mm_struct *mm, static struct svm_range *svm_range_clone(struct svm_range *old) { + struct kfd_process *p = container_of(old->svms, struct kfd_process, svms); struct svm_range *new; + uint32_t gpuidx; new = svm_range_new(old->svms, old->start, old->last, false); if (!new) @@ -1988,7 +2128,11 @@ static struct svm_range *svm_range_clone(struct svm_range *old) new->prefetch_loc = old->prefetch_loc; new->actual_loc = old->actual_loc; new->granularity = old->granularity; - new->mapped_to_gpu = old->mapped_to_gpu; + for_each_set_bit(gpuidx, p->svms.bitmap_supported, p->n_pdds) { + bitmap_copy(new->bitmap_mapped[gpuidx], old->bitmap_mapped[gpuidx], + svm_range_mapped_nbits(new->last - new->start + 1, + new->granularity)); + }; bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE); bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE); @@ -2107,7 +2251,7 @@ svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size, next_start = min(node->last, last) + 1; if (svm_range_is_same_attrs(p, prange, nattr, attrs) && - prange->mapped_to_gpu) { + svm_range_is_mapped(prange)) {
This is probably wrong too. I think you need a check, whether a
specific range is completely mapped on all GPUs that need access.
Regards,
Felix
/* nothing to do */ } else if (node->start < start || node->last > last) { /* node intersects the update range and its attributes @@ -3587,7 +3731,7 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm, if (migrated && (!p->xnack_enabled || (prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)) && - prange->mapped_to_gpu) { + svm_range_is_mapped(prange)) { pr_debug("restore_work will update mappings of GPUs\n"); mutex_unlock(&prange->migrate_mutex); continue; @@ -3598,7 +3742,7 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm, continue; } - flush_tlb = !migrated && update_mapping && prange->mapped_to_gpu; + flush_tlb = !migrated && update_mapping && svm_range_is_mapped(prange); r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE, true, flush_tlb); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h index d2e94d8fb4be..10c92c5e23a7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h @@ -132,7 +132,7 @@ struct svm_range { struct list_head child_list; DECLARE_BITMAP(bitmap_access, MAX_GPU_INSTANCE); DECLARE_BITMAP(bitmap_aip, MAX_GPU_INSTANCE); - bool mapped_to_gpu; + unsigned long *bitmap_mapped[MAX_GPU_INSTANCE]; }; static inline void svm_range_lock(struct svm_range *prange) @@ -163,6 +163,8 @@ static inline struct svm_range_bo *svm_range_bo_ref(struct svm_range_bo *svm_bo) return NULL; } +#define svm_range_mapped_nbits(size, granularity) DIV_ROUND_UP((size), 1UL << (granularity)) + int svm_range_list_init(struct kfd_process *p); void svm_range_list_fini(struct kfd_process *p); int svm_ioctl(struct kfd_process *p, enum kfd_ioctl_svm_op op, uint64_t start,