-----Original Message----- From: Deucher, Alexander Sent: Thursday, October 05, 2017 3:10 PM To: Kasiviswanathan, Harish <Harish.Kasiviswanathan at amd.com>; amd-gfx at lists.freedesktop.org Cc: Kasiviswanathan, Harish <Harish.Kasiviswanathan at amd.com> Subject: RE: [PATCH] drm/amdgpu: Refactor amdgpu_move_blit > -----Original Message----- > From: amd-gfx [mailto:amd-gfx-bounces at lists.freedesktop.org] On Behalf > Of Harish Kasiviswanathan > Sent: Thursday, October 05, 2017 2:30 PM > To: amd-gfx at lists.freedesktop.org > Cc: Kasiviswanathan, Harish > Subject: [PATCH] drm/amdgpu: Refactor amdgpu_move_blit > > Add more generic function amdgpu_copy_ttm_mem_to_mem() that supports > arbitrary copy size, offsets and two BOs (source & dest.). > > This is useful for KFD Cross Memory Attach feature where data needs to > be copied from BOs from different processes > > Change-Id: I848d541a84a1c2d12827d9dcf6d9054d854b4159 > Signed-off-by: Harish Kasiviswanathan <Harish.Kasiviswanathan at amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 159 > ++++++++++++++++++++------------ > drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 7 ++ > 2 files changed, 107 insertions(+), 59 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c > index 1086f03..e5415fc 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c > @@ -289,97 +289,138 @@ static uint64_t amdgpu_mm_node_addr(struct > ttm_buffer_object *bo, > return addr; > } > > -static int amdgpu_move_blit(struct ttm_buffer_object *bo, > - bool evict, bool no_wait_gpu, > - struct ttm_mem_reg *new_mem, > - struct ttm_mem_reg *old_mem) > +/** > + * amdgpu_copy_ttm_mem_to_mem - Helper function for copy > + * > + * @bo, @mem and @offset: All are array of 2 items. The function > +copies > @size > + * bytes from {mem[0] + offset[0]} to {mem[1] + offset[1]}. bo[0] and > +bo[1] > + * could be same BO for a move and different for a BO to BO copy. > + * > + * @f: Returns the last fence if multiple jobs are submitted. > + */ > +int amdgpu_copy_ttm_mem_to_mem(struct amdgpu_device *adev, > + struct ttm_buffer_object *bo[2], > + struct ttm_mem_reg *mem[2], > + uint64_t offset[2], > + uint64_t size, > + struct reservation_object *resv, > + struct dma_fence **f) Please document which is src and which is dst. I think from a readability standpoint, it would be better to pass explicit src and dst parameters rather than an array. Alex [HK]: I will add the comment. Yes I thought of using src & dst but the number of parameters will increase (already a large number) from 7 to 10. > { > - struct amdgpu_device *adev = amdgpu_ttm_adev(bo->bdev); > struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring; > - > - struct drm_mm_node *old_mm, *new_mm; > - uint64_t old_start, old_size, new_start, new_size; > - unsigned long num_pages; > + struct drm_mm_node *mm[2]; > + uint64_t node_start[2], node_size[2], page_offset[2]; > struct dma_fence *fence = NULL; > - int r; > - > - BUILD_BUG_ON((PAGE_SIZE % AMDGPU_GPU_PAGE_SIZE) != 0); > + int i, r = 0; > + const uint64_t GTT_MAX_BYTES = > (AMDGPU_GTT_MAX_TRANSFER_SIZE * > + AMDGPU_GPU_PAGE_SIZE); > > if (!ring->ready) { > DRM_ERROR("Trying to move memory with ring turned off.\n"); > return -EINVAL; > } > > - old_mm = old_mem->mm_node; > - old_size = old_mm->size; > - old_start = amdgpu_mm_node_addr(bo, old_mm, old_mem); > - > - new_mm = new_mem->mm_node; > - new_size = new_mm->size; > - new_start = amdgpu_mm_node_addr(bo, new_mm, new_mem); > + for (i = 0; i < 2; i++) { > + mm[i] = mem[i]->mm_node; > + while (offset[i] >= (mm[i]->size << PAGE_SHIFT)) { > + offset[i] -= (mm[i]->size << PAGE_SHIFT); > + ++mm[i]; > + } > + node_start[i] = amdgpu_mm_node_addr(bo[i], mm[i], > mem[i]) + > + offset[i]; > + node_size[i] = (mm[i]->size << PAGE_SHIFT) - offset[i]; > + page_offset[i] = node_start[i] & (PAGE_SIZE - 1); > + } > > - num_pages = new_mem->num_pages; > mutex_lock(&adev->mman.gtt_window_lock); > - while (num_pages) { > - unsigned long cur_pages = min(min(old_size, new_size), > - > (u64)AMDGPU_GTT_MAX_TRANSFER_SIZE); > - uint64_t from = old_start, to = new_start; > - struct dma_fence *next; > > - if (old_mem->mem_type == TTM_PL_TT && > - !amdgpu_gtt_mgr_is_allocated(old_mem)) { > - r = amdgpu_map_buffer(bo, old_mem, cur_pages, > - old_start, 0, ring, &from); > - if (r) > - goto error; > - } > + while (size) { > + unsigned long cur_size; > + uint64_t from_to[2] = {node_start[0], node_start[1]}; > + struct dma_fence *next; > > - if (new_mem->mem_type == TTM_PL_TT && > - !amdgpu_gtt_mgr_is_allocated(new_mem)) { > - r = amdgpu_map_buffer(bo, new_mem, cur_pages, > - new_start, 1, ring, &to); > - if (r) > - goto error; > + /* Copy size cannot exceed GTT_MAX_BYTES. So if src or dst > + * begins at an offset, then adjust the size accordingly > + */ > + cur_size = min3(min(node_size[0], node_size[1]), size, > + GTT_MAX_BYTES); > + if (cur_size + page_offset[0] > GTT_MAX_BYTES || > + cur_size + page_offset[1] > GTT_MAX_BYTES) > + cur_size -= max(page_offset[0], page_offset[1]); > + > + /* Map only what needs to be accessed */ > + for (i = 0; i < 2; i++) { > + if (mem[i]->mem_type == TTM_PL_TT && > + !amdgpu_gtt_mgr_is_allocated(mem[i])) { > + r = amdgpu_map_buffer(bo[i], mem[i], > + PFN_UP(cur_size + page_offset[i]), > + node_start[i], i, ring, > + &from_to[i]); > + if (r) > + goto error; > + /* Adjust the offset because > amdgpu_map_buffer > + * returns start of mapped page > + */ > + from_to[i] += page_offset[i]; > + } > } > > - r = amdgpu_copy_buffer(ring, from, to, > - cur_pages * PAGE_SIZE, > - bo->resv, &next, false, true); > + r = amdgpu_copy_buffer(ring, from_to[0], from_to[1], > cur_size, > + resv, &next, false, true); > if (r) > goto error; > > dma_fence_put(fence); > fence = next; > > - num_pages -= cur_pages; > - if (!num_pages) > + size -= cur_size; > + if (!size) > break; > > - old_size -= cur_pages; > - if (!old_size) { > - old_start = amdgpu_mm_node_addr(bo, ++old_mm, > old_mem); > - old_size = old_mm->size; > - } else { > - old_start += cur_pages * PAGE_SIZE; > - } > - > - new_size -= cur_pages; > - if (!new_size) { > - new_start = amdgpu_mm_node_addr(bo, > ++new_mm, new_mem); > - new_size = new_mm->size; > - } else { > - new_start += cur_pages * PAGE_SIZE; > + for (i = 0; i < 2; i++) { > + node_size[i] -= cur_size; > + if (!node_size[i]) { > + node_start[i] = > amdgpu_mm_node_addr(bo[i], > + ++mm[i], mem[i]); > + node_size[i] = (mm[i]->size << PAGE_SHIFT); > + } else { > + node_start[i] += cur_size; > + page_offset[i] = node_start[i] & > + (PAGE_SIZE - 1); > + } > } > } > +error: > mutex_unlock(&adev->mman.gtt_window_lock); > + if (f) > + *f = dma_fence_get(fence); > + dma_fence_put(fence); > + return r; > +} > + > + > +static int amdgpu_move_blit(struct ttm_buffer_object *bo, > + bool evict, bool no_wait_gpu, > + struct ttm_mem_reg *new_mem, > + struct ttm_mem_reg *old_mem) > +{ > + struct amdgpu_device *adev = amdgpu_ttm_adev(bo->bdev); > + struct ttm_buffer_object *bo_arr[2] = {bo, bo}; > + struct ttm_mem_reg *mem[2] = {old_mem, new_mem}; > + uint64_t offset[2] = {0, 0}; > + struct dma_fence *fence = NULL; > + int r; > + > + r = amdgpu_copy_ttm_mem_to_mem(adev, bo_arr, mem, offset, > + new_mem->num_pages << PAGE_SHIFT, > + bo->resv, &fence); > + if (r) > + goto error; > > r = ttm_bo_pipeline_move(bo, fence, evict, new_mem); > dma_fence_put(fence); > return r; > > error: > - mutex_unlock(&adev->mman.gtt_window_lock); > - > if (fence) > dma_fence_wait(fence, false); > dma_fence_put(fence); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h > index 7abae68..baab6a8 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h > @@ -72,6 +72,13 @@ int amdgpu_copy_buffer(struct amdgpu_ring *ring, > uint64_t src_offset, > struct reservation_object *resv, > struct dma_fence **fence, bool direct_submit, > bool vm_needs_flush); > +int amdgpu_copy_ttm_mem_to_mem(struct amdgpu_device *adev, > + struct ttm_buffer_object *bo[2], > + struct ttm_mem_reg *mem[2], > + uint64_t offset[2], > + uint64_t size, > + struct reservation_object *resv, > + struct dma_fence **f); > int amdgpu_fill_buffer(struct amdgpu_bo *bo, > uint64_t src_data, > struct reservation_object *resv, > -- > 1.9.1 > > _______________________________________________ > amd-gfx mailing list > amd-gfx at lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/amd-gfx