On Tue, 5 Apr 2022 at 13:00, Thomas Hellström <thomas.hellstrom@xxxxxxxxxxxxxxx> wrote: > > When DMAR / VT-d is enabled, the display engine uses overfetching, > presumably to deal with the increased latency. To avoid display engine > errors and DMAR faults, as a workaround the GGTT is populated with scatch > PTEs when VT-d is enabled. However starting with gen10, Write-combined > writing of scratch PTES is no longer possible and as a result, populating > the full GGTT with scratch PTEs like on resume becomes very slow as > uncached access is needed. > > Therefore, on integrated GPUs utilize the fact that the PTEs are stored in > stolen memory which retain content across S3 suspend. Don't clear the PTEs > on suspend and resume. This improves on resume time with around 100 ms. > While 100+ms might appear like a short time it's 10% to 20% of total resume > time and important in some applications. > > One notable exception is Intel Rapid Start Technology which may cause > stolen memory to be lost across what the OS percieves as S3 suspend. > If IRST is enabled or if we can't detect whether IRST is enabled, retain > the old workaround, clearing and re-instating PTEs. > > As an additional measure, if we detect that the last ggtt pte was lost > during suspend, print a warning and re-populate the GGTT ptes > > On discrete GPUs, the display engine scans out from LMEM which isn't > subject to DMAR, and presumably the workaround is therefore not needed, > but that needs to be verified and disabling the workaround for dGPU, > if possible, will be deferred to a follow-up patch. > > v2: > - Rely on retained ptes to also speed up suspend and resume re-binding. > - Re-build GGTT ptes if Intel rst is enabled. > v3: > - Re-build GGTT ptes also if we can't detect whether Intel rst is enabled, > and if the guard page PTE and end of GGTT was lost. > > Signed-off-by: Thomas Hellström <thomas.hellstrom@xxxxxxxxxxxxxxx> It's not great, but I think it's better than building another complication into our allocator just to handle this all. Acked-by: Daniel Vetter <daniel.vetter@xxxxxxxx> > --- > drivers/gpu/drm/i915/gt/intel_ggtt.c | 56 +++++++++++++++++++++++++--- > drivers/gpu/drm/i915/gt/intel_gtt.h | 20 ++++++++++ > drivers/gpu/drm/i915/i915_driver.c | 16 ++++++++ > 3 files changed, 86 insertions(+), 6 deletions(-) > > diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c b/drivers/gpu/drm/i915/gt/intel_ggtt.c > index 04191fe2ee34..98441b1c1b75 100644 > --- a/drivers/gpu/drm/i915/gt/intel_ggtt.c > +++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c > @@ -23,6 +23,13 @@ > #include "intel_gtt.h" > #include "gen8_ppgtt.h" > > +static inline bool suspend_retains_ptes(struct i915_address_space *vm) > +{ > + return GRAPHICS_VER(vm->i915) >= 8 && > + !HAS_LMEM(vm->i915) && > + vm->is_ggtt; > +} > + > static void i915_ggtt_color_adjust(const struct drm_mm_node *node, > unsigned long color, > u64 *start, > @@ -116,6 +123,23 @@ static bool needs_idle_maps(struct drm_i915_private *i915) > return false; > } > > +/* > + * Return the value of the last GGTT pte cast to an u64, if > + * the system is supposed to retain ptes across resume. 0 otherwise. > + */ > +static u64 read_last_pte(struct i915_address_space *vm) > +{ > + struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm); > + gen8_pte_t __iomem *ptep; > + > + if (!suspend_retains_ptes(vm)) > + return 0; > + > + GEM_BUG_ON(GRAPHICS_VER(vm->i915) < 8); > + ptep = (typeof(ptep))ggtt->gsm + (ggtt_total_entries(ggtt) - 1); > + return readq(ptep); > +} > + > /** > * i915_ggtt_suspend_vm - Suspend the memory mappings for a GGTT or DPT VM > * @vm: The VM to suspend the mappings for > @@ -179,7 +203,10 @@ void i915_ggtt_suspend_vm(struct i915_address_space *vm) > i915_gem_object_unlock(obj); > } > > - vm->clear_range(vm, 0, vm->total); > + if (!suspend_retains_ptes(vm)) > + vm->clear_range(vm, 0, vm->total); > + else > + i915_vm_to_ggtt(vm)->probed_pte = read_last_pte(vm); > > vm->skip_pte_rewrite = save_skip_rewrite; > > @@ -578,6 +605,8 @@ static int init_ggtt(struct i915_ggtt *ggtt) > struct drm_mm_node *entry; > int ret; > > + ggtt->pte_lost = true; > + > /* > * GuC requires all resources that we're sharing with it to be placed in > * non-WOPCM memory. If GuC is not present or not in use we still need a > @@ -1309,11 +1338,20 @@ bool i915_ggtt_resume_vm(struct i915_address_space *vm) > { > struct i915_vma *vma; > bool write_domain_objs = false; > + bool retained_ptes; > > drm_WARN_ON(&vm->i915->drm, !vm->is_ggtt && !vm->is_dpt); > > - /* First fill our portion of the GTT with scratch pages */ > - vm->clear_range(vm, 0, vm->total); > + /* > + * First fill our portion of the GTT with scratch pages if > + * they were not retained across suspend. > + */ > + retained_ptes = suspend_retains_ptes(vm) && > + !i915_vm_to_ggtt(vm)->pte_lost && > + !GEM_WARN_ON(i915_vm_to_ggtt(vm)->probed_pte != read_last_pte(vm)); > + > + if (!retained_ptes) > + vm->clear_range(vm, 0, vm->total); > > /* clflush objects bound into the GGTT and rebind them. */ > list_for_each_entry(vma, &vm->bound_list, vm_link) { > @@ -1322,9 +1360,10 @@ bool i915_ggtt_resume_vm(struct i915_address_space *vm) > atomic_read(&vma->flags) & I915_VMA_BIND_MASK; > > GEM_BUG_ON(!was_bound); > - vma->ops->bind_vma(vm, NULL, vma->resource, > - obj ? obj->cache_level : 0, > - was_bound); > + if (!retained_ptes) > + vma->ops->bind_vma(vm, NULL, vma->resource, > + obj ? obj->cache_level : 0, > + was_bound); > if (obj) { /* only used during resume => exclusive access */ > write_domain_objs |= fetch_and_zero(&obj->write_domain); > obj->read_domains |= I915_GEM_DOMAIN_GTT; > @@ -1352,3 +1391,8 @@ void i915_ggtt_resume(struct i915_ggtt *ggtt) > > intel_ggtt_restore_fences(ggtt); > } > + > +void i915_ggtt_mark_pte_lost(struct drm_i915_private *i915, bool val) > +{ > + to_gt(i915)->ggtt->pte_lost = val; > +} > diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.h b/drivers/gpu/drm/i915/gt/intel_gtt.h > index 4529b5e9f6e6..7561672c4f17 100644 > --- a/drivers/gpu/drm/i915/gt/intel_gtt.h > +++ b/drivers/gpu/drm/i915/gt/intel_gtt.h > @@ -345,6 +345,15 @@ struct i915_ggtt { > > bool do_idle_maps; > > + /** > + * Whether the system was recently restored from hibernate and > + * thus may have lost pte content. > + */ > + bool pte_lost; > + > + /** Probed pte value on suspend. Re-checked on resume. */ > + u64 probed_pte; > + > int mtrr; > > /** Bit 6 swizzling required for X tiling */ > @@ -571,6 +580,17 @@ bool i915_ggtt_resume_vm(struct i915_address_space *vm); > void i915_ggtt_suspend(struct i915_ggtt *gtt); > void i915_ggtt_resume(struct i915_ggtt *ggtt); > > +/** > + * i915_ggtt_mark_pte_lost - Mark ggtt ptes as lost or clear such a marking > + * @i915 The device private. > + * @val whether the ptes should be marked as lost. > + * > + * In some cases pte content is retained across suspend, but typically lost > + * across hibernate. Typically they should be marked as lost on > + * hibernation restore and such marking cleared on suspend. > + */ > +void i915_ggtt_mark_pte_lost(struct drm_i915_private *i915, bool val); > + > void > fill_page_dma(struct drm_i915_gem_object *p, const u64 val, unsigned int count); > > diff --git a/drivers/gpu/drm/i915/i915_driver.c b/drivers/gpu/drm/i915/i915_driver.c > index 64e6f76861f9..f50256e4c2d2 100644 > --- a/drivers/gpu/drm/i915/i915_driver.c > +++ b/drivers/gpu/drm/i915/i915_driver.c > @@ -98,6 +98,9 @@ > #include "intel_region_ttm.h" > #include "vlv_suspend.h" > > +/* Intel Rapid Start Technology ACPI device name */ > +static const char irst_name[] = "INT3392"; > + > static const struct drm_driver i915_drm_driver; > > static int i915_get_bridge_dev(struct drm_i915_private *dev_priv) > @@ -1425,6 +1428,8 @@ static int i915_pm_suspend(struct device *kdev) > return -ENODEV; > } > > + i915_ggtt_mark_pte_lost(i915, false); > + > if (i915->drm.switch_power_state == DRM_SWITCH_POWER_OFF) > return 0; > > @@ -1477,6 +1482,14 @@ static int i915_pm_resume(struct device *kdev) > if (i915->drm.switch_power_state == DRM_SWITCH_POWER_OFF) > return 0; > > + /* > + * If IRST is enabled, or if we can't detect whether it's enabled, > + * then we must assume we lost the GGTT page table entries, since > + * they are not retained if IRST decided to enter S4. > + */ > + if (!IS_ENABLED(CONFIG_ACPI) || acpi_dev_present(irst_name, NULL, -1)) > + i915_ggtt_mark_pte_lost(i915, true); > + > return i915_drm_resume(&i915->drm); > } > > @@ -1536,6 +1549,9 @@ static int i915_pm_restore_early(struct device *kdev) > > static int i915_pm_restore(struct device *kdev) > { > + struct drm_i915_private *i915 = kdev_to_i915(kdev); > + > + i915_ggtt_mark_pte_lost(i915, true); > return i915_pm_resume(kdev); > } > > -- > 2.34.1 > -- Daniel Vetter Software Engineer, Intel Corporation http://blog.ffwll.ch