On 07.10.2017 at 00:08, Harish Kasiviswanathan wrote:
> Add a more generic function amdgpu_copy_ttm_mem_to_mem() that supports
> arbitrary copy sizes, offsets and two BOs (source & dest.).
>
> This is useful for the KFD Cross Memory Attach feature, where data needs
> to be copied between BOs belonging to different processes.
>
> v2: Add struct amdgpu_copy_mem and change amdgpu_copy_ttm_mem_to_mem()
> function parameters to use the struct
>
> Change-Id: I848d541a84a1c2d12827d9dcf6d9054d854b4159
> Signed-off-by: Harish Kasiviswanathan <Harish.Kasiviswanathan at amd.com>

Looks like a nice cleanup to me. With the notes below fixed, the patch is
Reviewed-by: Christian König <christian.koenig at amd.com>.

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 169 +++++++++++++++++++++++---------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h |  12 +++
>  2 files changed, 132 insertions(+), 49 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> index 1086f03..dda4f08 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> @@ -289,97 +289,168 @@ static uint64_t amdgpu_mm_node_addr(struct ttm_buffer_object *bo,
>          return addr;
>  }
>
> -static int amdgpu_move_blit(struct ttm_buffer_object *bo,
> -                            bool evict, bool no_wait_gpu,
> -                            struct ttm_mem_reg *new_mem,
> -                            struct ttm_mem_reg *old_mem)
> +/**
> + * amdgpu_copy_ttm_mem_to_mem - Helper function for copy
> + *
> + * The function copies @size bytes from {src->mem + src->offset} to
> + * {dst->mem + dst->offset}. src->bo and dst->bo could be same BO for a
> + * move and different for a BO to BO copy.
> + *
> + * @f: Returns the last fence if multiple jobs are submitted.
> + */
> +int amdgpu_copy_ttm_mem_to_mem(struct amdgpu_device *adev,

Please name that amdgpu_ttm_copy_... to note that the function is part of
the TTM subsystem in amdgpu.
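E.g. something along these lines (only a sketch; the _mem_to_mem suffix here
is just an example, any name with the amdgpu_ttm_ prefix works for me):

    int amdgpu_ttm_copy_mem_to_mem(struct amdgpu_device *adev,
                                   struct amdgpu_copy_mem *src,
                                   struct amdgpu_copy_mem *dst,
                                   uint64_t size,
                                   struct reservation_object *resv,
                                   struct dma_fence **f);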
> +                               struct amdgpu_copy_mem *src,
> +                               struct amdgpu_copy_mem *dst,
> +                               uint64_t size,
> +                               struct reservation_object *resv,
> +                               struct dma_fence **f)
>  {
> -        struct amdgpu_device *adev = amdgpu_ttm_adev(bo->bdev);
>          struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
> -
> -        struct drm_mm_node *old_mm, *new_mm;
> -        uint64_t old_start, old_size, new_start, new_size;
> -        unsigned long num_pages;
> +        struct drm_mm_node *src_mm, *dst_mm;
> +        uint64_t src_node_start, dst_node_start, src_node_size,
> +                 dst_node_size, src_page_offset, dst_page_offset;
>          struct dma_fence *fence = NULL;
> -        int r;
> -
> -        BUILD_BUG_ON((PAGE_SIZE % AMDGPU_GPU_PAGE_SIZE) != 0);
> +        int r = 0;
> +        const uint64_t GTT_MAX_BYTES = (AMDGPU_GTT_MAX_TRANSFER_SIZE *
> +                                        AMDGPU_GPU_PAGE_SIZE);
>
>          if (!ring->ready) {
>                  DRM_ERROR("Trying to move memory with ring turned off.\n");
>                  return -EINVAL;
>          }
>
> -        old_mm = old_mem->mm_node;
> -        old_size = old_mm->size;
> -        old_start = amdgpu_mm_node_addr(bo, old_mm, old_mem);
> +        src_mm = src->mem->mm_node;
> +        while (src->offset >= (src_mm->size << PAGE_SHIFT)) {
> +                src->offset -= (src_mm->size << PAGE_SHIFT);
> +                ++src_mm;
> +        }
> +        src_node_start = amdgpu_mm_node_addr(src->bo, src_mm, src->mem) +
> +                         src->offset;
> +        src_node_size = (src_mm->size << PAGE_SHIFT) - src->offset;
> +        src_page_offset = src_node_start & (PAGE_SIZE - 1);
>
> -        new_mm = new_mem->mm_node;
> -        new_size = new_mm->size;
> -        new_start = amdgpu_mm_node_addr(bo, new_mm, new_mem);
> +        dst_mm = dst->mem->mm_node;
> +        while (dst->offset >= (dst_mm->size << PAGE_SHIFT)) {
> +                dst->offset -= (dst_mm->size << PAGE_SHIFT);
> +                ++dst_mm;
> +        }
> +        dst_node_start = amdgpu_mm_node_addr(dst->bo, dst_mm, dst->mem) +
> +                         dst->offset;
> +        dst_node_size = (dst_mm->size << PAGE_SHIFT) - dst->offset;
> +        dst_page_offset = dst_node_start & (PAGE_SIZE - 1);
>
> -        num_pages = new_mem->num_pages;
>          mutex_lock(&adev->mman.gtt_window_lock);
> -        while (num_pages) {
> -                unsigned long cur_pages = min(min(old_size, new_size),
> -                                              (u64)AMDGPU_GTT_MAX_TRANSFER_SIZE);
> -                uint64_t from = old_start, to = new_start;
> +
> +        while (size) {
> +                unsigned long cur_size;
> +                uint64_t from = src_node_start, to = dst_node_start;
>                  struct dma_fence *next;
>
> -                if (old_mem->mem_type == TTM_PL_TT &&
> -                    !amdgpu_gtt_mgr_is_allocated(old_mem)) {
> -                        r = amdgpu_map_buffer(bo, old_mem, cur_pages,
> -                                              old_start, 0, ring, &from);
> +                /* Copy size cannot exceed GTT_MAX_BYTES. So if src or dst
> +                 * begins at an offset, then adjust the size accordingly
> +                 */
> +                cur_size = min3(min(src_node_size, dst_node_size), size,
> +                                GTT_MAX_BYTES);
> +                if (cur_size + src_page_offset > GTT_MAX_BYTES ||
> +                    cur_size + dst_page_offset > GTT_MAX_BYTES)
> +                        cur_size -= max(src_page_offset, dst_page_offset);
> +
> +                /* Map only what needs to be accessed. Map src to window 0 and
> +                 * dst to window 1
> +                 */
> +                if (src->mem->mem_type == TTM_PL_TT &&
> +                    !amdgpu_gtt_mgr_is_allocated(src->mem)) {
> +                        r = amdgpu_map_buffer(src->bo, src->mem,
> +                                        PFN_UP(cur_size + src_page_offset),
> +                                        src_node_start, 0, ring,
> +                                        &from);
>                          if (r)
>                                  goto error;
> +                        /* Adjust the offset because amdgpu_map_buffer returns
> +                         * start of mapped page
> +                         */
> +                        from += src_page_offset;
>                  }
>
> -                if (new_mem->mem_type == TTM_PL_TT &&
> -                    !amdgpu_gtt_mgr_is_allocated(new_mem)) {
> -                        r = amdgpu_map_buffer(bo, new_mem, cur_pages,
> -                                              new_start, 1, ring, &to);
> +                if (dst->mem->mem_type == TTM_PL_TT &&
> +                    !amdgpu_gtt_mgr_is_allocated(dst->mem)) {
> +                        r = amdgpu_map_buffer(dst->bo, dst->mem,
> +                                        PFN_UP(cur_size + dst_page_offset),
> +                                        dst_node_start, 1, ring,
> +                                        &to);
>                          if (r)
>                                  goto error;
> +                        to += dst_page_offset;
>                  }
>
> -                r = amdgpu_copy_buffer(ring, from, to,
> -                                       cur_pages * PAGE_SIZE,
> -                                       bo->resv, &next, false, true);
> +                r = amdgpu_copy_buffer(ring, from, to, cur_size,
> +                                       resv, &next, false, true);
>                  if (r)
>                          goto error;
>
>                  dma_fence_put(fence);
>                  fence = next;
>
> -                num_pages -= cur_pages;
> -                if (!num_pages)
> +                size -= cur_size;
> +                if (!size)
>                          break;
>
> -                old_size -= cur_pages;
> -                if (!old_size) {
> -                        old_start = amdgpu_mm_node_addr(bo, ++old_mm, old_mem);
> -                        old_size = old_mm->size;
> +                src_node_size -= cur_size;
> +                if (!src_node_size) {
> +                        src_node_start = amdgpu_mm_node_addr(src->bo, ++src_mm,
> +                                                             src->mem);
> +                        src_node_size = (src_mm->size << PAGE_SHIFT);
>                  } else {
> -                        old_start += cur_pages * PAGE_SIZE;
> +                        src_node_start += cur_size;
> +                        src_page_offset = src_node_start & (PAGE_SIZE - 1);
>                  }
> -
> -                new_size -= cur_pages;
> -                if (!new_size) {
> -                        new_start = amdgpu_mm_node_addr(bo, ++new_mm, new_mem);
> -                        new_size = new_mm->size;
> +                dst_node_size -= cur_size;
> +                if (!dst_node_size) {
> +                        dst_node_start = amdgpu_mm_node_addr(dst->bo, ++dst_mm,
> +                                                             dst->mem);
> +                        dst_node_size = (dst_mm->size << PAGE_SHIFT);
>                  } else {
> -                        new_start += cur_pages * PAGE_SIZE;
> +                        dst_node_start += cur_size;
> +                        dst_page_offset = dst_node_start & (PAGE_SIZE - 1);
>                  }
>          }
> +error:
>          mutex_unlock(&adev->mman.gtt_window_lock);
> +        if (f)
> +                *f = dma_fence_get(fence);
> +        dma_fence_put(fence);
> +        return r;
> +}
> +
> +
> +static int amdgpu_move_blit(struct ttm_buffer_object *bo,
> +                            bool evict, bool no_wait_gpu,
> +                            struct ttm_mem_reg *new_mem,
> +                            struct ttm_mem_reg *old_mem)
> +{

I think I would rather prefer to change the callers of that function, but
I'm not 100% sure. How much change would that be? Not a blocking issue,
though.
> +        struct amdgpu_device *adev = amdgpu_ttm_adev(bo->bdev);
> +        struct amdgpu_copy_mem src, dst;
> +        struct dma_fence *fence = NULL;
> +        int r;
> +
> +        src.bo = bo;
> +        dst.bo = bo;
> +        src.mem = old_mem;
> +        dst.mem = new_mem;
> +        src.offset = 0;
> +        dst.offset = 0;
> +
> +        r = amdgpu_copy_ttm_mem_to_mem(adev, &src, &dst,
> +                                       new_mem->num_pages << PAGE_SHIFT,
> +                                       bo->resv, &fence);
> +        if (r)
> +                goto error;
>
>          r = ttm_bo_pipeline_move(bo, fence, evict, new_mem);
>          dma_fence_put(fence);
>          return r;
>
>  error:
> -        mutex_unlock(&adev->mman.gtt_window_lock);
> -
>          if (fence)
>                  dma_fence_wait(fence, false);
>          dma_fence_put(fence);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> index 7abae68..2188034 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> @@ -58,6 +58,12 @@ struct amdgpu_mman {
>          struct amd_sched_entity entity;
>  };
>
> +struct amdgpu_copy_mem {
> +        struct ttm_buffer_object *bo;
> +        struct ttm_mem_reg *mem;
> +        unsigned long offset;
> +};
> +
>  extern const struct ttm_mem_type_manager_func amdgpu_gtt_mgr_func;
>  extern const struct ttm_mem_type_manager_func amdgpu_vram_mgr_func;
>
> @@ -72,6 +78,12 @@ int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t src_offset,
>                         struct reservation_object *resv,
>                         struct dma_fence **fence, bool direct_submit,
>                         bool vm_needs_flush);
> +int amdgpu_copy_ttm_mem_to_mem(struct amdgpu_device *adev,
> +                               struct amdgpu_copy_mem *src,
> +                               struct amdgpu_copy_mem *dst,
> +                               uint64_t size,
> +                               struct reservation_object *resv,
> +                               struct dma_fence **f);
>  int amdgpu_fill_buffer(struct amdgpu_bo *bo,
>                         uint64_t src_data,
>                         struct reservation_object *resv,
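As a side note, not a request for this patch: here is roughly how I would
expect a cross memory attach style BO-to-BO copy to use the new interface.
This is just an untested sketch to illustrate the kerneldoc above; src_tbo,
dst_tbo, the offsets and size are all placeholders:

    struct amdgpu_copy_mem src = {
            .bo = src_tbo,          /* ttm_buffer_object of the source BO */
            .mem = &src_tbo->mem,
            .offset = src_offset,   /* byte offset into the source BO */
    };
    struct amdgpu_copy_mem dst = {
            .bo = dst_tbo,
            .mem = &dst_tbo->mem,
            .offset = dst_offset,
    };
    struct dma_fence *fence = NULL;
    int r;

    /* Copies 'size' bytes; only the fence of the last submitted job is
     * returned in 'fence'.
     */
    r = amdgpu_copy_ttm_mem_to_mem(adev, &src, &dst, size,
                                   dst_tbo->resv, &fence);
    if (!r && fence) {
            dma_fence_wait(fence, false);
            dma_fence_put(fence);
    }

Since only the last fence is returned when the copy is split into multiple
jobs, waiting on that single fence should be enough for the caller (the copy
jobs all go to the same ring, so they complete in submission order).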