From: Roger He <Hongbo.He@xxxxxxx>

This can improve performance in some cases.

v2 (chk): handle all sizes, simplify the patch quite a bit

Signed-off-by: Roger He <Hongbo.He at amd.com>
Signed-off-by: Christian König <christian.koenig at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 70 +++++++++++++++++++++++-----------
 1 file changed, 47 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 4cdfb70..04815a5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -1415,8 +1415,6 @@ static int amdgpu_vm_frag_ptes(struct amdgpu_pte_update_params *params,
 				uint64_t start, uint64_t end,
 				uint64_t dst, uint64_t flags)
 {
-	int r;
-
 	/**
 	 * The MC L1 TLB supports variable sized pages, based on a fragment
 	 * field in the PTE. When this field is set to a non-zero value, page
@@ -1435,39 +1433,65 @@ static int amdgpu_vm_frag_ptes(struct amdgpu_pte_update_params *params,
 	 * Userspace can support this by aligning virtual base address and
 	 * allocation size to the fragment size.
 	 */
-	unsigned pages_per_frag = params->adev->vm_manager.fragment_size;
-	uint64_t frag_flags = AMDGPU_PTE_FRAG(pages_per_frag);
-	uint64_t frag_align = 1 << pages_per_frag;
+	unsigned max_frag = params->adev->vm_manager.fragment_size;
+	uint64_t frag_flags, frag_end;
+	unsigned frag;
 
-	uint64_t frag_start = ALIGN(start, frag_align);
-	uint64_t frag_end = end & ~(frag_align - 1);
+	int r;
 
 	/* system pages are non continuously */
-	if (params->src || !(flags & AMDGPU_PTE_VALID) ||
-	    (frag_start >= frag_end))
+	if (params->src || !(flags & AMDGPU_PTE_VALID))
 		return amdgpu_vm_update_ptes(params, start, end, dst, flags);
 
-	/* handle the 4K area at the beginning */
-	if (start != frag_start) {
-		r = amdgpu_vm_update_ptes(params, start, frag_start,
-					  dst, flags);
+	/* Handle the fragments at the beginning */
+	while (start != end) {
+		/* This intentionally wraps around if no bit is set */
+		frag = min(ffs(start), fls64(end - start)) - 1;
+		if (frag >= max_frag)
+			break;
+
+		frag_flags = AMDGPU_PTE_FRAG(frag);
+		frag_end = start + (1 << frag);
+
+		r = amdgpu_vm_update_ptes(params, start, frag_end, dst,
+					  flags | frag_flags);
 		if (r)
 			return r;
-		dst += (frag_start - start) * AMDGPU_GPU_PAGE_SIZE;
+
+		dst += (frag_end - start) * AMDGPU_GPU_PAGE_SIZE;
+		start = frag_end;
 	}
 
 	/* handle the area in the middle */
-	r = amdgpu_vm_update_ptes(params, frag_start, frag_end, dst,
-				  flags | frag_flags);
-	if (r)
-		return r;
+	if (start != end) {
+		frag_flags = AMDGPU_PTE_FRAG(max_frag);
+		frag_end = end & ~((1 << max_frag) - 1);
 
-	/* handle the 4K area at the end */
-	if (frag_end != end) {
-		dst += (frag_end - frag_start) * AMDGPU_GPU_PAGE_SIZE;
-		r = amdgpu_vm_update_ptes(params, frag_end, end, dst, flags);
+		r = amdgpu_vm_update_ptes(params, start, frag_end, dst,
+					  flags | frag_flags);
+		if (r)
+			return r;
+
+		dst += (frag_end - start) * AMDGPU_GPU_PAGE_SIZE;
+		start = frag_end;
 	}
-	return r;
+
+	/* Handle the fragments at the end */
+	while (start != end) {
+		frag = fls64(end - start) - 1;
+		frag_flags = AMDGPU_PTE_FRAG(frag);
+		frag_end = start + (1 << frag);
+
+		r = amdgpu_vm_update_ptes(params, start, frag_end,
+					  dst, flags | frag_flags);
+		if (r)
+			return r;
+
+		dst += (frag_end - start) * AMDGPU_GPU_PAGE_SIZE;
+		start = frag_end;
+	}
+
+	return 0;
 }
 
 /**
-- 
2.7.4
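
[Editor's note] To see what the new three-phase walk actually emits, here is a
small userspace sketch of the same algorithm. It is illustrative only:
MAX_FRAG, update_range() and frag_walk() are made-up stand-ins for
vm_manager.fragment_size and amdgpu_vm_update_ptes(), the GCC/Clang builtins
replace the kernel's ffs()/fls64(), and start == 0 is treated as fully aligned
rather than relying on the kernel's intentional ffs(0) wraparound.

#include <stdio.h>
#include <stdint.h>

/* Stand-in for params->adev->vm_manager.fragment_size; 9 here would mean
 * 2MB fragments of 4K pages. The value is chosen only for the demo. */
#define MAX_FRAG 9

/* Stand-in for amdgpu_vm_update_ptes(): report the range and the fragment
 * field that would be OR'ed into the PTE flags. */
static void update_range(uint64_t start, uint64_t end, unsigned frag)
{
	printf("update [%6llu, %6llu) %5llu pages, frag %u\n",
	       (unsigned long long)start, (unsigned long long)end,
	       (unsigned long long)(end - start), frag);
}

/* Userspace equivalents of the kernel's ffs()/fls64(), 1-based. */
static unsigned ffs64(uint64_t v) { return v ? __builtin_ctzll(v) + 1 : 0; }
static unsigned fls64(uint64_t v) { return v ? 64 - __builtin_clzll(v) : 0; }

static void frag_walk(uint64_t start, uint64_t end)
{
	uint64_t frag_end;
	unsigned frag, lo, hi;

	/* Fragments at the beginning: growing power-of-two ranges, limited
	 * by the alignment of start and the pages left, until start reaches
	 * a max_frag boundary. */
	while (start != end) {
		lo = start ? ffs64(start) : 64; /* 0 is aligned to anything */
		hi = fls64(end - start);
		frag = (lo < hi ? lo : hi) - 1;
		if (frag >= MAX_FRAG)
			break;
		frag_end = start + (1ull << frag);
		update_range(start, frag_end, frag);
		start = frag_end;
	}

	/* The aligned area in the middle, all at the maximum fragment size. */
	if (start != end) {
		frag_end = end & ~((1ull << MAX_FRAG) - 1);
		update_range(start, frag_end, MAX_FRAG);
		start = frag_end;
	}

	/* Fragments at the end: peel off descending powers of two, which
	 * keeps start aligned to each fragment it emits. */
	while (start != end) {
		frag = fls64(end - start) - 1;
		frag_end = start + (1ull << frag);
		update_range(start, frag_end, frag);
		start = frag_end;
	}
}

int main(void)
{
	frag_walk(3, 1500); /* pages [3, 1500): ragged head and tail */
	return 0;
}

Running this shows the point of the patch: the ragged head is covered by
growing fragments (1, 4, 8, ... 256 pages) instead of individual 4K PTEs, the
middle gets one 512-page update, and the tail shrinks back down (256, 128,
64, 16, 8, 4), so every update stays aligned to its own fragment size.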