This optimizes the generating of PTEs by walking the hierarchy only once for a range and making changes as necessary. It allows for both huge (2MB) as well giant (1GB) pages to be used on Vega and Raven. Signed-off-by: Christian König <christian.koenig at amd.com> Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com> Acked-by: Junwei Zhang <Jerry.Zhang at amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 267 ++++++++++++++++++--------------- 1 file changed, 147 insertions(+), 120 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 747b6ff6fea7..328325324a1d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -1488,46 +1488,76 @@ int amdgpu_vm_update_directories(struct amdgpu_device *adev, } /** - * amdgpu_vm_handle_huge_pages - handle updating the PD with huge pages + * amdgpu_vm_update_huge - figure out parameters for PTE updates * - * @p: see amdgpu_pte_update_params definition - * @entry: vm_pt entry to check - * @parent: parent entry - * @nptes: number of PTEs updated with this operation - * @dst: destination address where the PTEs should point to - * @flags: access flags fro the PTEs - * - * Check if we can update the PD with a huge page. + * Make sure to set the right flags for the PTEs at the desired level. */ -static void amdgpu_vm_handle_huge_pages(struct amdgpu_pte_update_params *p, - struct amdgpu_vm_pt *entry, - struct amdgpu_vm_pt *parent, - unsigned nptes, uint64_t dst, - uint64_t flags) -{ - uint64_t pde; +static void amdgpu_vm_update_huge(struct amdgpu_pte_update_params *params, + struct amdgpu_bo *bo, unsigned level, + uint64_t pe, uint64_t addr, + unsigned count, uint32_t incr, + uint64_t flags) - /* In the case of a mixed PT the PDE must point to it*/ - if (p->adev->asic_type >= CHIP_VEGA10 && !p->src && - nptes == AMDGPU_VM_PTE_COUNT(p->adev)) { - /* Set the huge page flag to stop scanning at this PDE */ +{ + if (level != AMDGPU_VM_PTB) { flags |= AMDGPU_PDE_PTE; + amdgpu_gmc_get_vm_pde(params->adev, level, &addr, &flags); } - if (!(flags & AMDGPU_PDE_PTE)) { - if (entry->huge) { - /* Add the entry to the relocated list to update it. */ - entry->huge = false; - amdgpu_vm_bo_relocated(&entry->base); - } + amdgpu_vm_update_func(params, bo, pe, addr, count, incr, flags); +} + +/** + * amdgpu_vm_fragment - get fragment for PTEs + * + * @params: see amdgpu_pte_update_params definition + * @start: first PTE to handle + * @end: last PTE to handle + * @flags: hw mapping flags + * @frag: resulting fragment size + * @frag_end: end of this fragment + * + * Returns the first possible fragment for the start and end address. + */ +static void amdgpu_vm_fragment(struct amdgpu_pte_update_params *params, + uint64_t start, uint64_t end, uint64_t flags, + unsigned int *frag, uint64_t *frag_end) +{ + /** + * The MC L1 TLB supports variable sized pages, based on a fragment + * field in the PTE. When this field is set to a non-zero value, page + * granularity is increased from 4KB to (1 << (12 + frag)). The PTE + * flags are considered valid for all PTEs within the fragment range + * and corresponding mappings are assumed to be physically contiguous. + * + * The L1 TLB can store a single PTE for the whole fragment, + * significantly increasing the space available for translation + * caching. This leads to large improvements in throughput when the + * TLB is under pressure. + * + * The L2 TLB distributes small and large fragments into two + * asymmetric partitions. The large fragment cache is significantly + * larger. Thus, we try to use large fragments wherever possible. + * Userspace can support this by aligning virtual base address and + * allocation size to the fragment size. + */ + unsigned max_frag = params->adev->vm_manager.fragment_size; + + /* system pages are non continuously */ + if (params->src || !(flags & AMDGPU_PTE_VALID)) { + *frag = 0; + *frag_end = end; return; } - entry->huge = true; - amdgpu_gmc_get_vm_pde(p->adev, AMDGPU_VM_PDB0, &dst, &flags); - - pde = (entry - parent->entries) * 8; - amdgpu_vm_update_func(p, parent->base.bo, pde, dst, 1, 0, flags); + /* This intentionally wraps around if no bit is set */ + *frag = min((unsigned)ffs(start) - 1, (unsigned)fls64(end - start) - 1); + if (*frag >= max_frag) { + *frag = max_frag; + *frag_end = end & ~((1ULL << max_frag) - 1); + } else { + *frag_end = start + (1 << *frag); + } } /** @@ -1545,108 +1575,105 @@ static void amdgpu_vm_handle_huge_pages(struct amdgpu_pte_update_params *p, * 0 for success, -EINVAL for failure. */ static int amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params, - uint64_t start, uint64_t end, - uint64_t dst, uint64_t flags) + uint64_t start, uint64_t end, + uint64_t dst, uint64_t flags) { struct amdgpu_device *adev = params->adev; - const uint64_t mask = AMDGPU_VM_PTE_COUNT(adev) - 1; struct amdgpu_vm_pt_cursor cursor; + uint64_t frag_start = start, frag_end; + unsigned int frag; - /* walk over the address space and update the page tables */ - for_each_amdgpu_vm_pt_leaf(adev, params->vm, start, end - 1, cursor) { + /* figure out the initial fragment */ + amdgpu_vm_fragment(params, frag_start, end, flags, &frag, &frag_end); + + /* walk over the address space and update the PTs */ + amdgpu_vm_pt_start(adev, params->vm, start, &cursor); + while (cursor.pfn < end) { struct amdgpu_bo *pt = cursor.entry->base.bo; - uint64_t pe_start; - unsigned nptes; + unsigned shift, parent_shift, num_entries; + uint64_t incr, entry_end, pe_start; - if (!pt || cursor.level != AMDGPU_VM_PTB) + if (!pt) return -ENOENT; - if ((cursor.pfn & ~mask) == (end & ~mask)) - nptes = end - cursor.pfn; - else - nptes = AMDGPU_VM_PTE_COUNT(adev) - (cursor.pfn & mask); - - amdgpu_vm_handle_huge_pages(params, cursor.entry, cursor.parent, - nptes, dst, flags); - /* We don't need to update PTEs for huge pages */ - if (cursor.entry->huge) { - dst += nptes * AMDGPU_GPU_PAGE_SIZE; + /* The root level can't be a huge page */ + if (cursor.level == adev->vm_manager.root_level) { + if (!amdgpu_vm_pt_descendant(adev, &cursor)) + return -ENOENT; continue; } - pe_start = (cursor.pfn & mask) * 8; - amdgpu_vm_update_func(params, pt, pe_start, dst, nptes, - AMDGPU_GPU_PAGE_SIZE, flags); - dst += nptes * AMDGPU_GPU_PAGE_SIZE; - } - - return 0; -} + /* First check if the entry is already handled */ + if (cursor.pfn < frag_start) { + cursor.entry->huge = true; + amdgpu_vm_pt_next(adev, &cursor); + continue; + } -/* - * amdgpu_vm_frag_ptes - add fragment information to PTEs - * - * @params: see amdgpu_pte_update_params definition - * @vm: requested vm - * @start: first PTE to handle - * @end: last PTE to handle - * @dst: addr those PTEs should point to - * @flags: hw mapping flags - * - * Returns: - * 0 for success, -EINVAL for failure. - */ -static int amdgpu_vm_frag_ptes(struct amdgpu_pte_update_params *params, - uint64_t start, uint64_t end, - uint64_t dst, uint64_t flags) -{ - /** - * The MC L1 TLB supports variable sized pages, based on a fragment - * field in the PTE. When this field is set to a non-zero value, page - * granularity is increased from 4KB to (1 << (12 + frag)). The PTE - * flags are considered valid for all PTEs within the fragment range - * and corresponding mappings are assumed to be physically contiguous. - * - * The L1 TLB can store a single PTE for the whole fragment, - * significantly increasing the space available for translation - * caching. This leads to large improvements in throughput when the - * TLB is under pressure. - * - * The L2 TLB distributes small and large fragments into two - * asymmetric partitions. The large fragment cache is significantly - * larger. Thus, we try to use large fragments wherever possible. - * Userspace can support this by aligning virtual base address and - * allocation size to the fragment size. - */ - unsigned max_frag = params->adev->vm_manager.fragment_size; - int r; + /* If it isn't already handled it can't be a huge page */ + if (cursor.entry->huge) { + /* Add the entry to the relocated list to update it. */ + cursor.entry->huge = false; + amdgpu_vm_bo_relocated(&cursor.entry->base); + } - /* system pages are non continuously */ - if (params->src || !(flags & AMDGPU_PTE_VALID)) - return amdgpu_vm_update_ptes(params, start, end, dst, flags); - - while (start != end) { - uint64_t frag_flags, frag_end; - unsigned frag; - - /* This intentionally wraps around if no bit is set */ - frag = min((unsigned)ffs(start) - 1, - (unsigned)fls64(end - start) - 1); - if (frag >= max_frag) { - frag_flags = AMDGPU_PTE_FRAG(max_frag); - frag_end = end & ~((1ULL << max_frag) - 1); - } else { - frag_flags = AMDGPU_PTE_FRAG(frag); - frag_end = start + (1 << frag); + shift = amdgpu_vm_level_shift(adev, cursor.level); + parent_shift = amdgpu_vm_level_shift(adev, cursor.level - 1); + if (adev->asic_type < CHIP_VEGA10) { + /* No huge page support before GMC v9 */ + if (cursor.level != AMDGPU_VM_PTB) { + if (!amdgpu_vm_pt_descendant(adev, &cursor)) + return -ENOENT; + continue; + } + } else if (frag < shift) { + /* We can't use this level when the fragment size is + * smaller than the address shift. Go to the next + * child entry and try again. + */ + if (!amdgpu_vm_pt_descendant(adev, &cursor)) + return -ENOENT; + continue; + } else if (frag >= parent_shift) { + /* If the fragment size is even larger than the parent + * shift we should go up one level and check it again. + */ + if (!amdgpu_vm_pt_ancestor(&cursor)) + return -ENOENT; + continue; } - r = amdgpu_vm_update_ptes(params, start, frag_end, dst, - flags | frag_flags); - if (r) - return r; + /* Looks good so far, calculate parameters for the update */ + incr = AMDGPU_GPU_PAGE_SIZE << shift; + num_entries = amdgpu_vm_num_entries(adev, cursor.level); + pe_start = ((cursor.pfn >> shift) & (num_entries - 1)) * 8; + entry_end = num_entries << shift; + entry_end += cursor.pfn & ~(entry_end - 1); + entry_end = min(entry_end, end); + + do { + uint64_t upd_end = min(entry_end, frag_end); + unsigned nptes = (upd_end - frag_start) >> shift; + + amdgpu_vm_update_huge(params, pt, cursor.level, + pe_start, dst, nptes, incr, + flags | AMDGPU_PTE_FRAG(frag)); + + pe_start += nptes * 8; + dst += nptes * AMDGPU_GPU_PAGE_SIZE << shift; + + frag_start = upd_end; + if (frag_start >= frag_end) { + /* figure out the next fragment */ + amdgpu_vm_fragment(params, frag_start, end, + flags, &frag, &frag_end); + if (frag < shift) + break; + } + } while (frag_start < entry_end); - dst += (frag_end - start) * AMDGPU_GPU_PAGE_SIZE; - start = frag_end; + if (frag >= shift) + amdgpu_vm_pt_next(adev, &cursor); } return 0; @@ -1708,8 +1735,8 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev, params.func = amdgpu_vm_cpu_set_ptes; params.pages_addr = pages_addr; - return amdgpu_vm_frag_ptes(¶ms, start, last + 1, - addr, flags); + return amdgpu_vm_update_ptes(¶ms, start, last + 1, + addr, flags); } ring = container_of(vm->entity.rq->sched, struct amdgpu_ring, sched); @@ -1788,7 +1815,7 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev, if (r) goto error_free; - r = amdgpu_vm_frag_ptes(¶ms, start, last + 1, addr, flags); + r = amdgpu_vm_update_ptes(¶ms, start, last + 1, addr, flags); if (r) goto error_free; -- 2.14.1