It turns out that on production DG2/ATS HW we should have support for PS64. This feature allows to provide a 64K TLB hint at the PTE level, which is a lot more flexible than the current method of enabling 64K GTT pages for the entire page-table, since that leads to all kinds of annoying restrictions, as documented in: commit caa574ffc4aaf4f29b890223878c63e2e7772f62 Author: Matthew Auld <matthew.auld@xxxxxxxxx> Date: Sat Feb 19 00:17:49 2022 +0530 drm/i915/uapi: document behaviour for DG2 64K support On discrete platforms like DG2, we need to support a minimum page size of 64K when dealing with device local-memory. This is quite tricky for various reasons, so try to document the new implicit uapi for this. With PS64, we can now drop the 2M GTT alignment restriction, and instead only require 64K or larger when dealing with lmem. We still use the compact-pt layout when possible, but only when we are certain that this doesn't interfere with userspace. Note that this is a change in uAPI behaviour, but hopefully shouldn't be a concern (IGT is at least able to autodetect the alignment), since we are only making the GTT alignment constraint less restrictive. Based on a patch from CQ Tang. Reported-by: Michal Mrozek <michal.mrozek@xxxxxxxxx> Signed-off-by: Matthew Auld <matthew.auld@xxxxxxxxx> Cc: Lionel Landwerlin <lionel.g.landwerlin@xxxxxxxxx> Cc: Thomas Hellström <thomas.hellstrom@xxxxxxxxxxxxxxx> Cc: Stuart Summers <stuart.summers@xxxxxxxxx> Cc: Jordan Justen <jordan.l.justen@xxxxxxxxx> Cc: Yang A Shi <yang.a.shi@xxxxxxxxx> Cc: Nirmoy Das <nirmoy.das@xxxxxxxxx> --- .../gpu/drm/i915/gem/selftests/huge_pages.c | 159 +++++++++++++++++- drivers/gpu/drm/i915/gt/gen8_ppgtt.c | 61 ++++--- drivers/gpu/drm/i915/gt/intel_gtt.c | 21 +-- drivers/gpu/drm/i915/gt/intel_gtt.h | 1 + drivers/gpu/drm/i915/i915_drv.h | 7 - drivers/gpu/drm/i915/i915_pci.c | 2 - drivers/gpu/drm/i915/i915_vma.c | 9 +- drivers/gpu/drm/i915/intel_device_info.h | 1 - drivers/gpu/drm/i915/selftests/i915_gem_gtt.c | 9 +- include/uapi/drm/i915_drm.h | 36 ++-- 10 files changed, 216 insertions(+), 90 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/selftests/huge_pages.c b/drivers/gpu/drm/i915/gem/selftests/huge_pages.c index c570cf780079..cc26c1293208 100644 --- a/drivers/gpu/drm/i915/gem/selftests/huge_pages.c +++ b/drivers/gpu/drm/i915/gem/selftests/huge_pages.c @@ -1161,7 +1161,8 @@ static int igt_write_huge(struct drm_i915_private *i915, GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj)); size = obj->base.size; - if (obj->mm.page_sizes.sg & I915_GTT_PAGE_SIZE_64K) + if (obj->mm.page_sizes.sg & I915_GTT_PAGE_SIZE_64K && + !HAS_64K_PAGES(i915)) size = round_up(size, I915_GTT_PAGE_SIZE_2M); n = 0; @@ -1214,6 +1215,10 @@ static int igt_write_huge(struct drm_i915_private *i915, * size and ensure the vma offset is at the start of the pt * boundary, however to improve coverage we opt for testing both * aligned and unaligned offsets. + * + * With PS64 this is no longer the case, but to ensure we + * sometimes get the compact layout for smaller objects, apply + * the round_up anyway. */ if (obj->mm.page_sizes.sg & I915_GTT_PAGE_SIZE_64K) offset_low = round_down(offset_low, @@ -1411,6 +1416,7 @@ static int igt_ppgtt_sanity_check(void *arg) { SZ_2M + SZ_4K, SZ_64K | SZ_4K }, { SZ_2M + SZ_4K, SZ_2M | SZ_4K }, { SZ_2M + SZ_64K, SZ_2M | SZ_64K }, + { SZ_2M + SZ_64K, SZ_64K }, }; int i, j; int err; @@ -1540,6 +1546,156 @@ static int igt_ppgtt_compact(void *arg) return err; } +static int igt_ppgtt_mixed(void *arg) +{ + struct drm_i915_private *i915 = arg; + const unsigned long flags = PIN_OFFSET_FIXED | PIN_USER; + struct drm_i915_gem_object *obj, *on; + struct i915_gem_engines *engines; + struct i915_gem_engines_iter it; + struct i915_address_space *vm; + struct i915_gem_context *ctx; + struct intel_context *ce; + struct file *file; + I915_RND_STATE(prng); + LIST_HEAD(objects); + struct intel_memory_region *mr; + struct i915_vma *vma; + unsigned int count; + u32 i, rem, addr; + int *order; + int n, err; + + /* + * Sanity check mixing 4K and 64K pages within the same page-table via + * the new PS64 TLB hint. + */ + + if (!HAS_64K_PAGES(i915)) { + pr_info("device lacks PS64, skipping\n"); + return 0; + } + + file = mock_file(i915); + if (IS_ERR(file)) + return PTR_ERR(file); + + ctx = hugepage_ctx(i915, file); + if (IS_ERR(ctx)) { + err = PTR_ERR(ctx); + goto out; + } + vm = i915_gem_context_get_eb_vm(ctx); + + i = 0; + rem = SZ_8M; + addr = 0; + do { + u32 sz; + + sz = i915_prandom_u32_max_state(min_t(u32, rem, SZ_4M), &prng); + sz = min_t(u32, rem, SZ_4K); + + mr = i915->mm.regions[INTEL_REGION_LMEM_0]; + if (i & 1) + mr = i915->mm.regions[INTEL_REGION_SMEM]; + + obj = i915_gem_object_create_region(mr, sz, 0, 0); + if (IS_ERR(obj)) { + err = PTR_ERR(obj); + goto out_vm; + } + + list_add_tail(&obj->st_link, &objects); + + vma = i915_vma_instance(obj, vm, NULL); + if (IS_ERR(vma)) { + err = PTR_ERR(vma); + goto err_put; + } + + addr = round_up(addr, mr->min_page_size); + err = i915_vma_pin(vma, 0, 0, addr | flags); + if (err) + goto err_put; + + if (mr->type == INTEL_MEMORY_LOCAL && + (vma->resource->page_sizes_gtt & I915_GTT_PAGE_SIZE_4K)) { + err = -EINVAL; + goto err_put; + } + + addr += obj->base.size; + rem -= sz; + i++; + } while (rem); + + n = 0; + count = 0; + for_each_gem_engine(ce, i915_gem_context_lock_engines(ctx), it) { + count++; + if (!intel_engine_can_store_dword(ce->engine)) + continue; + + n++; + } + i915_gem_context_unlock_engines(ctx); + if (!n) + goto err_put; + + order = i915_random_order(count * count, &prng); + if (!order) { + err = -ENOMEM; + goto err_put; + } + + i = 0; + addr = 0; + engines = i915_gem_context_lock_engines(ctx); + list_for_each_entry(obj, &objects, st_link) { + u32 rnd = i915_prandom_u32_max_state(UINT_MAX, &prng); + + addr = round_up(addr, obj->mm.region->min_page_size); + + ce = engines->engines[order[i] % engines->num_engines]; + i = (i + 1) % (count * count); + if (!ce || !intel_engine_can_store_dword(ce->engine)) + continue; + + err = __igt_write_huge(ce, obj, obj->base.size, addr, 0, rnd); + if (err) + break; + + err = __igt_write_huge(ce, obj, obj->base.size, addr, + offset_in_page(rnd) / sizeof(u32), rnd + 1); + if (err) + break; + + err = __igt_write_huge(ce, obj, obj->base.size, addr, + (PAGE_SIZE / sizeof(u32)) - 1, + rnd + 2); + if (err) + break; + + addr += obj->base.size; + + cond_resched(); + } + + i915_gem_context_unlock_engines(ctx); + kfree(order); +err_put: + list_for_each_entry_safe(obj, on, &objects, st_link) { + list_del(&obj->st_link); + i915_gem_object_put(obj); + } +out_vm: + i915_vm_put(vm); +out: + fput(file); + return err; +} + static int igt_tmpfs_fallback(void *arg) { struct drm_i915_private *i915 = arg; @@ -1803,6 +1959,7 @@ int i915_gem_huge_page_live_selftests(struct drm_i915_private *i915) SUBTEST(igt_ppgtt_smoke_huge), SUBTEST(igt_ppgtt_sanity_check), SUBTEST(igt_ppgtt_compact), + SUBTEST(igt_ppgtt_mixed), }; if (!HAS_PPGTT(i915)) { diff --git a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c index c7bd5d71b03e..3b0cc1e21506 100644 --- a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c +++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c @@ -473,6 +473,7 @@ xehpsdv_ppgtt_insert_huge(struct i915_address_space *vm, const gen8_pte_t pte_encode = vm->pte_encode(0, cache_level, flags); unsigned int rem = sg_dma_len(iter->sg); u64 start = vma_res->start; + u64 end = start + vma_res->vma_size; GEM_BUG_ON(!i915_vm_is_4lvl(vm)); @@ -486,9 +487,10 @@ xehpsdv_ppgtt_insert_huge(struct i915_address_space *vm, gen8_pte_t encode = pte_encode; unsigned int page_size; gen8_pte_t *vaddr; - u16 index, max; + u16 index, max, nent, i; max = I915_PDES; + nent = 1; if (vma_res->bi.page_sizes.sg & I915_GTT_PAGE_SIZE_2M && IS_ALIGNED(iter->dma, I915_GTT_PAGE_SIZE_2M) && @@ -500,25 +502,37 @@ xehpsdv_ppgtt_insert_huge(struct i915_address_space *vm, vaddr = px_vaddr(pd); } else { - if (encode & GEN12_PPGTT_PTE_LM) { - GEM_BUG_ON(__gen8_pte_index(start, 0) % 16); - GEM_BUG_ON(rem < I915_GTT_PAGE_SIZE_64K); - GEM_BUG_ON(!IS_ALIGNED(iter->dma, - I915_GTT_PAGE_SIZE_64K)); - - index = __gen8_pte_index(start, 0) / 16; - page_size = I915_GTT_PAGE_SIZE_64K; - - max /= 16; - - vaddr = px_vaddr(pd); - vaddr[__gen8_pte_index(start, 1)] |= GEN12_PDE_64K; + index = __gen8_pte_index(start, 0); + page_size = I915_GTT_PAGE_SIZE; - pt->is_compact = true; - } else { - GEM_BUG_ON(pt->is_compact); - index = __gen8_pte_index(start, 0); - page_size = I915_GTT_PAGE_SIZE; + if (vma_res->bi.page_sizes.sg & I915_GTT_PAGE_SIZE_64K) { + /* + * Device local-memory on these platforms should + * always use 64K pages or larger (including GTT + * alignment), therefore if we know the whole + * page-table needs to be filled we can always + * safely use the compact-layout. Otherwise fall + * back to the TLB hint with PS64. If this is + * system memory we only bother with PS64. + */ + if ((encode & GEN12_PPGTT_PTE_LM) && + end - start >= SZ_2M && !index) { + index = __gen8_pte_index(start, 0) / 16; + page_size = I915_GTT_PAGE_SIZE_64K; + + max /= 16; + + vaddr = px_vaddr(pd); + vaddr[__gen8_pte_index(start, 1)] |= GEN12_PDE_64K; + + pt->is_compact = true; + } else if (IS_ALIGNED(iter->dma, I915_GTT_PAGE_SIZE_64K) && + rem >= I915_GTT_PAGE_SIZE_64K && + !(index % 16)) { + encode |= GEN12_PTE_PS64; + page_size = I915_GTT_PAGE_SIZE_64K; + nent = 16; + } } vaddr = px_vaddr(pt); @@ -526,7 +540,12 @@ xehpsdv_ppgtt_insert_huge(struct i915_address_space *vm, do { GEM_BUG_ON(rem < page_size); - vaddr[index++] = encode | iter->dma; + + for (i = 0; i < nent; i++) { + vaddr[index++] = + encode | (iter->dma + i * + I915_GTT_PAGE_SIZE); + } start += page_size; iter->dma += page_size; @@ -742,6 +761,8 @@ static void __xehpsdv_ppgtt_insert_entry_lm(struct i915_address_space *vm, GEM_BUG_ON(!IS_ALIGNED(addr, SZ_64K)); GEM_BUG_ON(!IS_ALIGNED(offset, SZ_64K)); + /* XXX: we don't strictly need to use this layout */ + if (!pt->is_compact) { vaddr = px_vaddr(pd); vaddr[gen8_pd_index(idx, 1)] |= GEN12_PDE_64K; diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c b/drivers/gpu/drm/i915/gt/intel_gtt.c index b67831833c9a..2e3a2380c23b 100644 --- a/drivers/gpu/drm/i915/gt/intel_gtt.c +++ b/drivers/gpu/drm/i915/gt/intel_gtt.c @@ -269,11 +269,7 @@ void i915_address_space_init(struct i915_address_space *vm, int subclass) memset64(vm->min_alignment, I915_GTT_MIN_ALIGNMENT, ARRAY_SIZE(vm->min_alignment)); - if (HAS_64K_PAGES(vm->i915) && NEEDS_COMPACT_PT(vm->i915) && - subclass == VM_CLASS_PPGTT) { - vm->min_alignment[INTEL_MEMORY_LOCAL] = I915_GTT_PAGE_SIZE_2M; - vm->min_alignment[INTEL_MEMORY_STOLEN_LOCAL] = I915_GTT_PAGE_SIZE_2M; - } else if (HAS_64K_PAGES(vm->i915)) { + if (HAS_64K_PAGES(vm->i915)) { vm->min_alignment[INTEL_MEMORY_LOCAL] = I915_GTT_PAGE_SIZE_64K; vm->min_alignment[INTEL_MEMORY_STOLEN_LOCAL] = I915_GTT_PAGE_SIZE_64K; } @@ -343,7 +339,8 @@ int setup_scratch_page(struct i915_address_space *vm) */ size = I915_GTT_PAGE_SIZE_4K; if (i915_vm_is_4lvl(vm) && - HAS_PAGE_SIZES(vm->i915, I915_GTT_PAGE_SIZE_64K)) + HAS_PAGE_SIZES(vm->i915, I915_GTT_PAGE_SIZE_64K) && + !HAS_64K_PAGES(vm->i915)) size = I915_GTT_PAGE_SIZE_64K; do { @@ -385,18 +382,6 @@ int setup_scratch_page(struct i915_address_space *vm) if (size == I915_GTT_PAGE_SIZE_4K) return -ENOMEM; - /* - * If we need 64K minimum GTT pages for device local-memory, - * like on XEHPSDV, then we need to fail the allocation here, - * otherwise we can't safely support the insertion of - * local-memory pages for this vm, since the HW expects the - * correct physical alignment and size when the page-table is - * operating in 64K GTT mode, which includes any scratch PTEs, - * since userspace can still touch them. - */ - if (HAS_64K_PAGES(vm->i915)) - return -ENOMEM; - size = I915_GTT_PAGE_SIZE_4K; } while (1); } diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.h b/drivers/gpu/drm/i915/gt/intel_gtt.h index c0ca53cba9f0..062b78333fb2 100644 --- a/drivers/gpu/drm/i915/gt/intel_gtt.h +++ b/drivers/gpu/drm/i915/gt/intel_gtt.h @@ -93,6 +93,7 @@ typedef u64 gen8_pte_t; #define GEN12_GGTT_PTE_LM BIT_ULL(1) #define GEN12_PDE_64K BIT(6) +#define GEN12_PTE_PS64 BIT(8) /* * Cacheability Control is a 4-bit value. The low three bits are stored in bits diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 84a2f6b16f57..eb841daa0d63 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -902,13 +902,6 @@ IS_SUBPLATFORM(const struct drm_i915_private *i915, */ #define HAS_64K_PAGES(dev_priv) (INTEL_INFO(dev_priv)->has_64k_pages) -/* - * Set this flag when platform doesn't allow both 64k pages and 4k pages in - * the same PT. this flag means we need to support compact PT layout for the - * ppGTT when using the 64K GTT pages. - */ -#define NEEDS_COMPACT_PT(dev_priv) (INTEL_INFO(dev_priv)->needs_compact_pt) - #define HAS_IPC(dev_priv) (INTEL_INFO(dev_priv)->display.has_ipc) #define HAS_REGION(i915, i) (RUNTIME_INFO(i915)->memory_regions & (i)) diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c index cace897e1db1..1e880a2adc52 100644 --- a/drivers/gpu/drm/i915/i915_pci.c +++ b/drivers/gpu/drm/i915/i915_pci.c @@ -1041,7 +1041,6 @@ static const struct intel_device_info xehpsdv_info = { PLATFORM(INTEL_XEHPSDV), .display = { }, .has_64k_pages = 1, - .needs_compact_pt = 1, .has_media_ratio_mode = 1, .__runtime.platform_engine_mask = BIT(RCS0) | BIT(BCS0) | @@ -1063,7 +1062,6 @@ static const struct intel_device_info xehpsdv_info = { .has_64k_pages = 1, \ .has_guc_deprivilege = 1, \ .has_heci_pxp = 1, \ - .needs_compact_pt = 1, \ .has_media_ratio_mode = 1, \ .__runtime.platform_engine_mask = \ BIT(RCS0) | BIT(BCS0) | \ diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index f17c09ead7d7..c39488eb9eeb 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -776,12 +776,6 @@ i915_vma_insert(struct i915_vma *vma, struct i915_gem_ww_ctx *ww, GEM_BUG_ON(!IS_ALIGNED(end, I915_GTT_PAGE_SIZE)); alignment = max(alignment, i915_vm_obj_min_alignment(vma->vm, vma->obj)); - /* - * for compact-pt we round up the reservation to prevent - * any smaller pages being used within the same PDE - */ - if (NEEDS_COMPACT_PT(vma->vm->i915)) - size = round_up(size, alignment); /* If binding the object/GGTT view requires more space than the entire * aperture has, reject it early before evicting everything in a vain @@ -820,7 +814,8 @@ i915_vma_insert(struct i915_vma *vma, struct i915_gem_ww_ctx *ww, * forseeable future. See also i915_ggtt_offset(). */ if (upper_32_bits(end - 1) && - vma->page_sizes.sg > I915_GTT_PAGE_SIZE) { + vma->page_sizes.sg > I915_GTT_PAGE_SIZE && + !HAS_64K_PAGES(vma->vm->i915)) { /* * We can't mix 64K and 4K PTEs in the same page-table * (2M block), and so to avoid the ugliness and diff --git a/drivers/gpu/drm/i915/intel_device_info.h b/drivers/gpu/drm/i915/intel_device_info.h index bc87d3156b14..58539660a1ec 100644 --- a/drivers/gpu/drm/i915/intel_device_info.h +++ b/drivers/gpu/drm/i915/intel_device_info.h @@ -146,7 +146,6 @@ enum intel_ppgtt_type { /* Keep has_* in alphabetical order */ \ func(has_64bit_reloc); \ func(has_64k_pages); \ - func(needs_compact_pt); \ func(gpu_reset_clobbers_display); \ func(has_reset_engine); \ func(has_3d_pipeline); \ diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c index ea2cf1080979..27c733b00976 100644 --- a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c @@ -1114,15 +1114,8 @@ static int misaligned_case(struct i915_address_space *vm, struct intel_memory_re expected_node_size = expected_vma_size; if (HAS_64K_PAGES(vm->i915) && i915_gem_object_is_lmem(obj)) { - /* - * The compact-pt should expand lmem node to 2MB for the ppGTT, - * for all other cases we should only expect 64K. - */ expected_vma_size = round_up(size, I915_GTT_PAGE_SIZE_64K); - if (NEEDS_COMPACT_PT(vm->i915) && !i915_is_ggtt(vm)) - expected_node_size = round_up(size, I915_GTT_PAGE_SIZE_2M); - else - expected_node_size = round_up(size, I915_GTT_PAGE_SIZE_64K); + expected_node_size = round_up(size, I915_GTT_PAGE_SIZE_64K); } if (vma->size != expected_vma_size || vma->node.size != expected_node_size) { diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 520ad2691a99..791b4a8b6f97 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -3493,27 +3493,13 @@ struct drm_i915_gem_create_ext { * * The (page-aligned) allocated size for the object will be returned. * - * DG2 64K min page size implications: + * On platforms like DG2/ATS the kernel will always use 64K or larger + * pages for I915_MEMORY_CLASS_DEVICE. The kernel also requires a + * minimum of 64K GTT alignment for such objects. * - * On discrete platforms, starting from DG2, we have to contend with GTT - * page size restrictions when dealing with I915_MEMORY_CLASS_DEVICE - * objects. Specifically the hardware only supports 64K or larger GTT - * page sizes for such memory. The kernel will already ensure that all - * I915_MEMORY_CLASS_DEVICE memory is allocated using 64K or larger page - * sizes underneath. - * - * Note that the returned size here will always reflect any required - * rounding up done by the kernel, i.e 4K will now become 64K on devices - * such as DG2. The kernel will always select the largest minimum - * page-size for the set of possible placements as the value to use when - * rounding up the @size. - * - * Special DG2 GTT address alignment requirement: - * - * The GTT alignment will also need to be at least 2M for such objects. - * - * Note that due to how the hardware implements 64K GTT page support, we - * have some further complications: + * NOTE: Previously the ABI here required a minimum GTT alignment of 2M + * due to how to how the hardware implemented 64K GTT page support, + * where we had the following complications: * * 1) The entire PDE (which covers a 2MB virtual address range), must * contain only 64K PTEs, i.e mixing 4K and 64K PTEs in the same @@ -3522,12 +3508,10 @@ struct drm_i915_gem_create_ext { * 2) We still need to support 4K PTEs for I915_MEMORY_CLASS_SYSTEM * objects. * - * To keep things simple for userland, we mandate that any GTT mappings - * must be aligned to and rounded up to 2MB. The kernel will internally - * pad them out to the next 2MB boundary. As this only wastes virtual - * address space and avoids userland having to copy any needlessly - * complicated PDE sharing scheme (coloring) and only affects DG2, this - * is deemed to be a good compromise. + * However on actual production HW this was completely changed to now + * allow setting a TLB hint at the PTE level (see PS64), which is a lot + * more flexible than the above. With this the 2M restriction was + * dropped where we now only require 64K. */ __u64 size; -- 2.37.3