Add more generic function amdgpu_copy_ttm_mem_to_mem() that supports arbitrary copy size, offsets and two BOs (source & dest.). This is useful for KFD Cross Memory Attach feature where data needs to be copied from BOs from different processes Change-Id: I848d541a84a1c2d12827d9dcf6d9054d854b4159 Signed-off-by: Harish Kasiviswanathan <Harish.Kasiviswanathan at amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 159 ++++++++++++++++++++------------ drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 7 ++ 2 files changed, 107 insertions(+), 59 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 1086f03..e5415fc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -289,97 +289,138 @@ static uint64_t amdgpu_mm_node_addr(struct ttm_buffer_object *bo, return addr; } -static int amdgpu_move_blit(struct ttm_buffer_object *bo, - bool evict, bool no_wait_gpu, - struct ttm_mem_reg *new_mem, - struct ttm_mem_reg *old_mem) +/** + * amdgpu_copy_ttm_mem_to_mem - Helper function for copy + * + * @bo, @mem and @offset: All are array of 2 items. The function copies @size + * bytes from {mem[0] + offset[0]} to {mem[1] + offset[1]}. bo[0] and bo[1] + * could be same BO for a move and different for a BO to BO copy. + * + * @f: Returns the last fence if multiple jobs are submitted. + */ +int amdgpu_copy_ttm_mem_to_mem(struct amdgpu_device *adev, + struct ttm_buffer_object *bo[2], + struct ttm_mem_reg *mem[2], + uint64_t offset[2], + uint64_t size, + struct reservation_object *resv, + struct dma_fence **f) { - struct amdgpu_device *adev = amdgpu_ttm_adev(bo->bdev); struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring; - - struct drm_mm_node *old_mm, *new_mm; - uint64_t old_start, old_size, new_start, new_size; - unsigned long num_pages; + struct drm_mm_node *mm[2]; + uint64_t node_start[2], node_size[2], page_offset[2]; struct dma_fence *fence = NULL; - int r; - - BUILD_BUG_ON((PAGE_SIZE % AMDGPU_GPU_PAGE_SIZE) != 0); + int i, r = 0; + const uint64_t GTT_MAX_BYTES = (AMDGPU_GTT_MAX_TRANSFER_SIZE * + AMDGPU_GPU_PAGE_SIZE); if (!ring->ready) { DRM_ERROR("Trying to move memory with ring turned off.\n"); return -EINVAL; } - old_mm = old_mem->mm_node; - old_size = old_mm->size; - old_start = amdgpu_mm_node_addr(bo, old_mm, old_mem); - - new_mm = new_mem->mm_node; - new_size = new_mm->size; - new_start = amdgpu_mm_node_addr(bo, new_mm, new_mem); + for (i = 0; i < 2; i++) { + mm[i] = mem[i]->mm_node; + while (offset[i] >= (mm[i]->size << PAGE_SHIFT)) { + offset[i] -= (mm[i]->size << PAGE_SHIFT); + ++mm[i]; + } + node_start[i] = amdgpu_mm_node_addr(bo[i], mm[i], mem[i]) + + offset[i]; + node_size[i] = (mm[i]->size << PAGE_SHIFT) - offset[i]; + page_offset[i] = node_start[i] & (PAGE_SIZE - 1); + } - num_pages = new_mem->num_pages; mutex_lock(&adev->mman.gtt_window_lock); - while (num_pages) { - unsigned long cur_pages = min(min(old_size, new_size), - (u64)AMDGPU_GTT_MAX_TRANSFER_SIZE); - uint64_t from = old_start, to = new_start; - struct dma_fence *next; - if (old_mem->mem_type == TTM_PL_TT && - !amdgpu_gtt_mgr_is_allocated(old_mem)) { - r = amdgpu_map_buffer(bo, old_mem, cur_pages, - old_start, 0, ring, &from); - if (r) - goto error; - } + while (size) { + unsigned long cur_size; + uint64_t from_to[2] = {node_start[0], node_start[1]}; + struct dma_fence *next; - if (new_mem->mem_type == TTM_PL_TT && - !amdgpu_gtt_mgr_is_allocated(new_mem)) { - r = amdgpu_map_buffer(bo, new_mem, cur_pages, - new_start, 1, ring, &to); - if (r) - goto error; + /* Copy size cannot exceed GTT_MAX_BYTES. So if src or dst + * begins at an offset, then adjust the size accordingly + */ + cur_size = min3(min(node_size[0], node_size[1]), size, + GTT_MAX_BYTES); + if (cur_size + page_offset[0] > GTT_MAX_BYTES || + cur_size + page_offset[1] > GTT_MAX_BYTES) + cur_size -= max(page_offset[0], page_offset[1]); + + /* Map only what needs to be accessed */ + for (i = 0; i < 2; i++) { + if (mem[i]->mem_type == TTM_PL_TT && + !amdgpu_gtt_mgr_is_allocated(mem[i])) { + r = amdgpu_map_buffer(bo[i], mem[i], + PFN_UP(cur_size + page_offset[i]), + node_start[i], i, ring, + &from_to[i]); + if (r) + goto error; + /* Adjust the offset because amdgpu_map_buffer + * returns start of mapped page + */ + from_to[i] += page_offset[i]; + } } - r = amdgpu_copy_buffer(ring, from, to, - cur_pages * PAGE_SIZE, - bo->resv, &next, false, true); + r = amdgpu_copy_buffer(ring, from_to[0], from_to[1], cur_size, + resv, &next, false, true); if (r) goto error; dma_fence_put(fence); fence = next; - num_pages -= cur_pages; - if (!num_pages) + size -= cur_size; + if (!size) break; - old_size -= cur_pages; - if (!old_size) { - old_start = amdgpu_mm_node_addr(bo, ++old_mm, old_mem); - old_size = old_mm->size; - } else { - old_start += cur_pages * PAGE_SIZE; - } - - new_size -= cur_pages; - if (!new_size) { - new_start = amdgpu_mm_node_addr(bo, ++new_mm, new_mem); - new_size = new_mm->size; - } else { - new_start += cur_pages * PAGE_SIZE; + for (i = 0; i < 2; i++) { + node_size[i] -= cur_size; + if (!node_size[i]) { + node_start[i] = amdgpu_mm_node_addr(bo[i], + ++mm[i], mem[i]); + node_size[i] = (mm[i]->size << PAGE_SHIFT); + } else { + node_start[i] += cur_size; + page_offset[i] = node_start[i] & + (PAGE_SIZE - 1); + } } } +error: mutex_unlock(&adev->mman.gtt_window_lock); + if (f) + *f = dma_fence_get(fence); + dma_fence_put(fence); + return r; +} + + +static int amdgpu_move_blit(struct ttm_buffer_object *bo, + bool evict, bool no_wait_gpu, + struct ttm_mem_reg *new_mem, + struct ttm_mem_reg *old_mem) +{ + struct amdgpu_device *adev = amdgpu_ttm_adev(bo->bdev); + struct ttm_buffer_object *bo_arr[2] = {bo, bo}; + struct ttm_mem_reg *mem[2] = {old_mem, new_mem}; + uint64_t offset[2] = {0, 0}; + struct dma_fence *fence = NULL; + int r; + + r = amdgpu_copy_ttm_mem_to_mem(adev, bo_arr, mem, offset, + new_mem->num_pages << PAGE_SHIFT, + bo->resv, &fence); + if (r) + goto error; r = ttm_bo_pipeline_move(bo, fence, evict, new_mem); dma_fence_put(fence); return r; error: - mutex_unlock(&adev->mman.gtt_window_lock); - if (fence) dma_fence_wait(fence, false); dma_fence_put(fence); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h index 7abae68..baab6a8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h @@ -72,6 +72,13 @@ int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t src_offset, struct reservation_object *resv, struct dma_fence **fence, bool direct_submit, bool vm_needs_flush); +int amdgpu_copy_ttm_mem_to_mem(struct amdgpu_device *adev, + struct ttm_buffer_object *bo[2], + struct ttm_mem_reg *mem[2], + uint64_t offset[2], + uint64_t size, + struct reservation_object *resv, + struct dma_fence **f); int amdgpu_fill_buffer(struct amdgpu_bo *bo, uint64_t src_data, struct reservation_object *resv, -- 1.9.1