One comment inline. With that fixed, this patch is
Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>

On 2017-08-30 09:48 AM, Christian König wrote:
> From: Roger He <Hongbo.He at amd.com>
>
> This can improve performance for some cases.
>
> v2 (chk): handle all sizes, simplify the patch quite a bit
> v3 (chk): adjust dw estimation as well
>
> Signed-off-by: Roger He <Hongbo.He at amd.com>
> Signed-off-by: Christian König <christian.koenig at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 74 ++++++++++++++++++++++------------
>  1 file changed, 49 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index b08f031..1575657 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -1415,8 +1415,6 @@ static int amdgpu_vm_frag_ptes(struct amdgpu_pte_update_params *params,
>  				uint64_t start, uint64_t end,
>  				uint64_t dst, uint64_t flags)
>  {
> -	int r;
> -
>  	/**
>  	 * The MC L1 TLB supports variable sized pages, based on a fragment
>  	 * field in the PTE. When this field is set to a non-zero value, page
> @@ -1435,39 +1433,65 @@ static int amdgpu_vm_frag_ptes(struct amdgpu_pte_update_params *params,
>  	 * Userspace can support this by aligning virtual base address and
>  	 * allocation size to the fragment size.
>  	 */
> -	unsigned pages_per_frag = params->adev->vm_manager.fragment_size;
> -	uint64_t frag_flags = AMDGPU_PTE_FRAG(pages_per_frag);
> -	uint64_t frag_align = 1 << pages_per_frag;
> +	unsigned max_frag = params->adev->vm_manager.fragment_size;
> +	uint64_t frag_flags, frag_end;
> +	unsigned frag;
>
> -	uint64_t frag_start = ALIGN(start, frag_align);
> -	uint64_t frag_end = end & ~(frag_align - 1);
> +	int r;
>
>  	/* system pages are non continuously */
> -	if (params->src || !(flags & AMDGPU_PTE_VALID) ||
> -	    (frag_start >= frag_end))
> +	if (params->src || !(flags & AMDGPU_PTE_VALID))
>  		return amdgpu_vm_update_ptes(params, start, end, dst, flags);
>
> -	/* handle the 4K area at the beginning */
> -	if (start != frag_start) {
> -		r = amdgpu_vm_update_ptes(params, start, frag_start,
> -					  dst, flags);
> +	/* Handle the fragments at the beginning */
> +	while (start != end) {
> +		/* This intentionally wraps around if no bit is set */
> +		frag = min(ffs(start), fls64(end - start)) - 1;
> +		if (frag >= max_frag)
> +			break;
> +
> +		frag_flags = AMDGPU_PTE_FRAG(frag);
> +		frag_end = start + (1 << frag);
> +
> +		r = amdgpu_vm_update_ptes(params, start, frag_end, dst,
> +					  flags | frag_flags);
>  		if (r)
>  			return r;
> -		dst += (frag_start - start) * AMDGPU_GPU_PAGE_SIZE;
> +
> +		dst += (frag_end - start) * AMDGPU_GPU_PAGE_SIZE;
> +		start = frag_end;
>  	}
>
>  	/* handle the area in the middle */
> -	r = amdgpu_vm_update_ptes(params, frag_start, frag_end, dst,
> -				  flags | frag_flags);
> -	if (r)
> -		return r;
> +	if (start != end) {
> +		frag_flags = AMDGPU_PTE_FRAG(max_frag);
> +		frag_end = end & ~((1 << max_frag) - 1);

You need a cast to uint64_t to make sure your mask is big enough and
doesn't wipe out the high address bits:

    frag_end = end & ~(uint64_t)((1 << max_frag) - 1)

>
> -	/* handle the 4K area at the end */
> -	if (frag_end != end) {
> -		dst += (frag_end - frag_start) * AMDGPU_GPU_PAGE_SIZE;
> -		r = amdgpu_vm_update_ptes(params, frag_end, end, dst, flags);
> +		r = amdgpu_vm_update_ptes(params, start, frag_end, dst,
> +					  flags | frag_flags);
> +		if (r)
> +			return r;
> +
> +		dst += (frag_end - start) * AMDGPU_GPU_PAGE_SIZE;
> +		start = frag_end;
>  	}
> -	return r;
> +
> +	/* Handle the fragments at the end */
> +	while (start != end) {
> +		frag = fls64(end - start) - 1;
> +		frag_flags = AMDGPU_PTE_FRAG(frag);
> +		frag_end = start + (1 << frag);
> +
> +		r = amdgpu_vm_update_ptes(params, start, frag_end,
> +					  dst, flags | frag_flags);
> +		if (r)
> +			return r;
> +
> +		dst += (frag_end - start) * AMDGPU_GPU_PAGE_SIZE;
> +		start = frag_end;
> +	}
> +
> +	return 0;
>  }
>
>  /**
> @@ -1557,8 +1581,8 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
>  		/* set page commands needed */
>  		ndw += ncmds * 10;
>
> -		/* two extra commands for begin/end of fragment */
> -		ndw += 2 * 10;
> +		/* extra commands for begin/end fragments */
> +		ndw += 2 * 10 * adev->vm_manager.fragment_size;
>
>  		params.func = amdgpu_vm_do_set_ptes;
>  	}
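
As a reference for other reviewers, here is a quick user-space model of
how the new begin/middle/end logic splits a range into fragments (my own
sketch, not part of the patch; ffs64()/fls64() below are local stand-ins
for the kernel helpers, and max_frag stands in for
vm_manager.fragment_size):

    /* fragment_model.c: each step takes the largest power-of-two block
     * that is both aligned to start and still fits in [start, end),
     * capped at max_frag. The kernel additionally covers the whole
     * aligned middle part with a single amdgpu_vm_update_ptes() call
     * instead of one call per max_frag block.
     */
    #include <stdio.h>
    #include <stdint.h>

    static unsigned ffs64(uint64_t v) { return __builtin_ffsll(v); }
    static unsigned fls64(uint64_t v) { return v ? 64 - __builtin_clzll(v) : 0; }

    int main(void)
    {
            uint64_t start = 0x3, end = 0x25; /* page frame numbers, arbitrary */
            unsigned max_frag = 4;

            while (start != end) {
                    /* alignment of start (start == 0 means "unlimited") */
                    unsigned align = start ? ffs64(start) - 1 : 64;
                    /* largest 2^n that still fits into the remaining range */
                    unsigned fit = fls64(end - start) - 1;
                    unsigned frag = align < fit ? align : fit;

                    if (frag > max_frag)
                            frag = max_frag;

                    printf("PTEs %#llx..%#llx get AMDGPU_PTE_FRAG(%u)\n",
                           (unsigned long long)start,
                           (unsigned long long)(start + (1ull << frag)), frag);

                    start += 1ull << frag;
            }
            return 0;
    }

That is also how I read the new dw estimate: instead of the old fixed two
extra commands, it now reserves room for up to fragment_size begin/end
commands on each side of the range.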