Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>

This looks very elegant now. :)

On 2017-08-31 05:44 AM, Christian König wrote:
> From: Roger He <Hongbo.He at amd.com>
>
> This can improve performance for some cases.
>
> v2 (chk): handle all sizes, simplify the patch quite a bit
> v3 (chk): adjust dw estimation as well
> v4 (chk): use single loop, make end mask 64bit
>
> Signed-off-by: Roger He <Hongbo.He at amd.com>
> Signed-off-by: Christian König <christian.koenig at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 55 ++++++++++++++++------------------
>  1 file changed, 26 insertions(+), 29 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 0379af1..4c09338 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -1415,8 +1415,6 @@ static int amdgpu_vm_frag_ptes(struct amdgpu_pte_update_params *params,
>  			uint64_t start, uint64_t end,
>  			uint64_t dst, uint64_t flags)
>  {
> -	int r;
> -
>  	/**
>  	 * The MC L1 TLB supports variable sized pages, based on a fragment
>  	 * field in the PTE. When this field is set to a non-zero value, page
> @@ -1435,39 +1433,38 @@ static int amdgpu_vm_frag_ptes(struct amdgpu_pte_update_params *params,
>  	 * Userspace can support this by aligning virtual base address and
>  	 * allocation size to the fragment size.
>  	 */
> -	unsigned pages_per_frag = params->adev->vm_manager.fragment_size;
> -	uint64_t frag_flags = AMDGPU_PTE_FRAG(pages_per_frag);
> -	uint64_t frag_align = 1 << pages_per_frag;
> -
> -	uint64_t frag_start = ALIGN(start, frag_align);
> -	uint64_t frag_end = end & ~(frag_align - 1);
> +	unsigned max_frag = params->adev->vm_manager.fragment_size;
> +	int r;
>
>  	/* system pages are non continuously */
> -	if (params->src || !(flags & AMDGPU_PTE_VALID) ||
> -	    (frag_start >= frag_end))
> +	if (params->src || !(flags & AMDGPU_PTE_VALID))
>  		return amdgpu_vm_update_ptes(params, start, end, dst, flags);
>
> -	/* handle the 4K area at the beginning */
> -	if (start != frag_start) {
> -		r = amdgpu_vm_update_ptes(params, start, frag_start,
> -					  dst, flags);
> +	while (start != end) {
> +		uint64_t frag_flags, frag_end;
> +		unsigned frag;
> +
> +		/* This intentionally wraps around if no bit is set */
> +		frag = min((unsigned)ffs(start) - 1,
> +			   (unsigned)fls64(end - start) - 1);
> +		if (frag >= max_frag) {
> +			frag_flags = AMDGPU_PTE_FRAG(max_frag);
> +			frag_end = end & ~((1ULL << max_frag) - 1);
> +		} else {
> +			frag_flags = AMDGPU_PTE_FRAG(frag);
> +			frag_end = start + (1 << frag);
> +		}
> +
> +		r = amdgpu_vm_update_ptes(params, start, frag_end, dst,
> +					  flags | frag_flags);
>  		if (r)
>  			return r;
> -		dst += (frag_start - start) * AMDGPU_GPU_PAGE_SIZE;
> -	}
> -
> -	/* handle the area in the middle */
> -	r = amdgpu_vm_update_ptes(params, frag_start, frag_end, dst,
> -				  flags | frag_flags);
> -	if (r)
> -		return r;
>
> -	/* handle the 4K area at the end */
> -	if (frag_end != end) {
> -		dst += (frag_end - frag_start) * AMDGPU_GPU_PAGE_SIZE;
> -		r = amdgpu_vm_update_ptes(params, frag_end, end, dst, flags);
> +		dst += (frag_end - start) * AMDGPU_GPU_PAGE_SIZE;
> +		start = frag_end;
>  	}
> -	return r;
> +
> +	return 0;
>  }
>
>  /**
> @@ -1557,8 +1554,8 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
>  		/* set page commands needed */
>  		ndw += ncmds * 10;
>
> -		/* two extra commands for begin/end of fragment */
> -		ndw += 2 * 10;
> +		/* extra commands for begin/end fragments */
> +		ndw += 2 * 10 * adev->vm_manager.fragment_size;
>
>  		params.func = amdgpu_vm_do_set_ptes;
>  	}
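
A quick note for the archives: to convince myself that the new loop terminates and covers the whole range, I traced it in a standalone program. This is just my own sketch, not code from the patch; ffs()/fls64() are stood in by GCC builtins with kernel semantics (1-based bit index, 0 when no bit is set), and max_frag, start and end are arbitrary test values.

#include <stdio.h>
#include <stdint.h>

/* userspace stand-ins for the kernel's ffs()/fls64() */
static unsigned ffs64(uint64_t v)
{
	return __builtin_ffsll((long long)v);
}

static unsigned fls64(uint64_t v)
{
	return v ? 64 - __builtin_clzll(v) : 0;
}

int main(void)
{
	const unsigned max_frag = 9;	/* assumed fragment_size; 9 -> 2MB with 4K pages */
	uint64_t start = 3, end = 1029;	/* arbitrary page range for the trace */

	while (start != end) {
		/* alignment of start vs. size of the remaining range;
		 * ffs(0) - 1 wraps to UINT_MAX, so start == 0 counts as
		 * maximally aligned, matching the comment in the patch */
		unsigned align = ffs64(start) - 1;
		unsigned size = fls64(end - start) - 1;
		unsigned frag = align < size ? align : size;
		uint64_t frag_end;

		if (frag >= max_frag) {
			frag = max_frag;
			frag_end = end & ~((1ULL << max_frag) - 1);
		} else {
			frag_end = start + (1ULL << frag);
		}

		printf("update [%llu, %llu) with fragment %u\n",
		       (unsigned long long)start,
		       (unsigned long long)frag_end, frag);
		start = frag_end;
	}
	return 0;
}

The fragment grows by one power of two per step at the head of the range and shrinks the same way at the tail, so as I read it each fragment size below max_frag can show up at most twice. That is also why the dw estimate now reserves 2 * 10 * fragment_size instead of the old flat 2 * 10.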