On Wed, Feb 10, 2021 at 11:39:46PM +0000, Chris Wilson wrote: > VT-d may cause overfetch of the scanout PTE, both before and after the > vma (depending on the scanout orientation). bspec recommends that we > provide a tile-row in either directions, and suggests using 160 PTE, > warning that the accesses will wrap around the ends of the GGTT. > Currently, we fill the entire GGTT with scratch pages when using VT-d to > always ensure there are valid entries around every vma, including > scanout. However, writing every PTE is slow as on recent devices we > perform 8MiB of uncached writes, incurring an extra 100ms during resume. > > If instead we focus on only putting guard pages around scanout, we can > avoid touching the whole GGTT. To avoid having to introduce extra nodes > around each scanout vma, we adjust the scanout drm_mm_node to be smaller > than the allocated space, and fixup the extra PTE during dma binding. > > Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> > Cc: Ville Syrjälä <ville.syrjala@xxxxxxxxxxxxxxx> > Cc: Matthew Auld <matthew.auld@xxxxxxxxx> > --- > drivers/gpu/drm/i915/gem/i915_gem_domain.c | 3 ++ > drivers/gpu/drm/i915/gt/intel_ggtt.c | 37 ++++++++-------------- > drivers/gpu/drm/i915/i915_gem_gtt.h | 1 + > drivers/gpu/drm/i915/i915_vma.c | 23 ++++++++++++++ > drivers/gpu/drm/i915/i915_vma_types.h | 1 + > 5 files changed, 41 insertions(+), 24 deletions(-) > > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_domain.c b/drivers/gpu/drm/i915/gem/i915_gem_domain.c > index 0478b069c202..9f2ccc255ca1 100644 > --- a/drivers/gpu/drm/i915/gem/i915_gem_domain.c > +++ b/drivers/gpu/drm/i915/gem/i915_gem_domain.c > @@ -345,6 +345,9 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj, > if (ret) > goto err; > > + if (intel_scanout_needs_vtd_wa(i915)) > + flags |= PIN_VTD; > + > /* > * As the user may map the buffer once pinned in the display plane > * (e.g. libkms for the bootup splash), we have to ensure that we > diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c b/drivers/gpu/drm/i915/gt/intel_ggtt.c > index b0b8ded834f0..416f77f48561 100644 > --- a/drivers/gpu/drm/i915/gt/intel_ggtt.c > +++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c > @@ -238,6 +238,11 @@ static void gen8_ggtt_insert_entries(struct i915_address_space *vm, > > gte = (gen8_pte_t __iomem *)ggtt->gsm; > gte += vma->node.start / I915_GTT_PAGE_SIZE; > + > + end = gte - vma->guard / I915_GTT_PAGE_SIZE; > + while (end < gte) > + gen8_set_pte(end++, vm->scratch[0]->encode); > + > end = gte + vma->node.size / I915_GTT_PAGE_SIZE; > > for_each_sgt_daddr(addr, iter, vma->pages) > @@ -245,6 +250,7 @@ static void gen8_ggtt_insert_entries(struct i915_address_space *vm, > GEM_BUG_ON(gte > end); > > /* Fill the allocated but "unused" space beyond the end of the buffer */ > + end += vma->guard / I915_GTT_PAGE_SIZE; > while (gte < end) > gen8_set_pte(gte++, vm->scratch[0]->encode); > > @@ -289,6 +295,11 @@ static void gen6_ggtt_insert_entries(struct i915_address_space *vm, > > gte = (gen6_pte_t __iomem *)ggtt->gsm; > gte += vma->node.start / I915_GTT_PAGE_SIZE; > + > + end = gte - vma->guard / I915_GTT_PAGE_SIZE; > + while (end < gte) > + gen8_set_pte(end++, vm->scratch[0]->encode); > + > end = gte + vma->node.size / I915_GTT_PAGE_SIZE; > > for_each_sgt_daddr(addr, iter, vma->pages) > @@ -296,6 +307,7 @@ static void gen6_ggtt_insert_entries(struct i915_address_space *vm, > GEM_BUG_ON(gte > end); > > /* Fill the allocated but "unused" space beyond the end of the buffer */ > + end += vma->guard / I915_GTT_PAGE_SIZE; > while (gte < end) > iowrite32(vm->scratch[0]->encode, gte++); > > @@ -311,27 +323,6 @@ static void nop_clear_range(struct i915_address_space *vm, > { > } > > -static void gen8_ggtt_clear_range(struct i915_address_space *vm, > - u64 start, u64 length) > -{ > - struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm); > - unsigned int first_entry = start / I915_GTT_PAGE_SIZE; > - unsigned int num_entries = length / I915_GTT_PAGE_SIZE; > - const gen8_pte_t scratch_pte = vm->scratch[0]->encode; > - gen8_pte_t __iomem *gtt_base = > - (gen8_pte_t __iomem *)ggtt->gsm + first_entry; > - const int max_entries = ggtt_total_entries(ggtt) - first_entry; > - int i; > - > - if (WARN(num_entries > max_entries, > - "First entry = %d; Num entries = %d (max=%d)\n", > - first_entry, num_entries, max_entries)) > - num_entries = max_entries; > - > - for (i = 0; i < num_entries; i++) > - gen8_set_pte(>t_base[i], scratch_pte); > -} > - > static void bxt_vtd_ggtt_wa(struct i915_address_space *vm) > { > /* > @@ -898,8 +889,6 @@ static int gen8_gmch_probe(struct i915_ggtt *ggtt) > ggtt->vm.cleanup = gen6_gmch_remove; > ggtt->vm.insert_page = gen8_ggtt_insert_page; > ggtt->vm.clear_range = nop_clear_range; > - if (intel_scanout_needs_vtd_wa(i915)) > - ggtt->vm.clear_range = gen8_ggtt_clear_range; > > ggtt->vm.insert_entries = gen8_ggtt_insert_entries; > > @@ -1045,7 +1034,7 @@ static int gen6_gmch_probe(struct i915_ggtt *ggtt) > ggtt->vm.alloc_pt_dma = alloc_pt_dma; > > ggtt->vm.clear_range = nop_clear_range; > - if (!HAS_FULL_PPGTT(i915) || intel_scanout_needs_vtd_wa(i915)) > + if (!HAS_FULL_PPGTT(i915)) > ggtt->vm.clear_range = gen6_ggtt_clear_range; > ggtt->vm.insert_page = gen6_ggtt_insert_page; > ggtt->vm.insert_entries = gen6_ggtt_insert_entries; > diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h > index c9b0ee5e1d23..8a2dfc7144cf 100644 > --- a/drivers/gpu/drm/i915/i915_gem_gtt.h > +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h > @@ -41,6 +41,7 @@ int i915_gem_gtt_insert(struct i915_address_space *vm, > #define PIN_HIGH BIT_ULL(5) > #define PIN_OFFSET_BIAS BIT_ULL(6) > #define PIN_OFFSET_FIXED BIT_ULL(7) > +#define PIN_VTD BIT_ULL(8) > > #define PIN_GLOBAL BIT_ULL(10) /* I915_VMA_GLOBAL_BIND */ > #define PIN_USER BIT_ULL(11) /* I915_VMA_LOCAL_BIND */ > diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c > index caa9b041616b..dccd36ff1a6d 100644 > --- a/drivers/gpu/drm/i915/i915_vma.c > +++ b/drivers/gpu/drm/i915/i915_vma.c > @@ -38,6 +38,8 @@ > #include "i915_trace.h" > #include "i915_vma.h" > > +#define VTD_GUARD roundup_pow_of_two(160 * SZ_4K) /* 160 PTE padding */ > + > static struct i915_global_vma { > struct i915_global base; > struct kmem_cache *slab_vmas; > @@ -552,6 +554,9 @@ bool i915_vma_misplaced(const struct i915_vma *vma, > vma->node.start != (flags & PIN_OFFSET_MASK)) > return true; > > + if (flags & PIN_VTD && vma->guard < VTD_GUARD) > + return true; > + > return false; > } > > @@ -637,6 +642,13 @@ i915_vma_insert(struct i915_vma *vma, u64 size, u64 alignment, u64 flags) > alignment, vma->fence_alignment); > } > > + /* VT-d requires padding before/after the vma */ > + if (flags & PIN_VTD) { > + alignment = max_t(typeof(alignment), alignment, VTD_GUARD); > + vma->guard = alignment; > + size += 2 * vma->guard; > + } > + > GEM_BUG_ON(!IS_ALIGNED(size, I915_GTT_PAGE_SIZE)); > GEM_BUG_ON(!IS_ALIGNED(alignment, I915_GTT_MIN_ALIGNMENT)); > GEM_BUG_ON(!is_power_of_2(alignment)); someh> @@ -725,6 +737,11 @@ i915_vma_insert(struct i915_vma *vma, u64 size, u64 alignment, u64 flags) > > list_add_tail(&vma->vm_link, &vma->vm->bound_list); > > + if (flags & PIN_VTD) { > + vma->node.start += vma->guard; Was a bit worried for a second that this might give the display a potentially misaligned vma start. But looks like you did consider all that: VTD_GUARD==POT, alignment + guard both get bumped to the max(). So AFAICS should guarantee everyone is happy. I guess we're now wasting a lot more ggtt address space though? Not sure if anyone has ever been at risk of running out though. And DPT should help with this on new platforms. > + vma->node.size -= 2 * vma->guard; > + } > + > return 0; > } > > @@ -734,6 +751,12 @@ i915_vma_detach(struct i915_vma *vma) > GEM_BUG_ON(!drm_mm_node_allocated(&vma->node)); > GEM_BUG_ON(i915_vma_is_bound(vma, I915_VMA_GLOBAL_BIND | I915_VMA_LOCAL_BIND)); > > + if (vma->guard) { > + vma->node.start -= vma->guard; > + vma->node.size += 2 * vma->guard; > + vma->guard = 0; > + } > + > /* > * And finally now the object is completely decoupled from this > * vma, we can drop its hold on the backing storage and allow > diff --git a/drivers/gpu/drm/i915/i915_vma_types.h b/drivers/gpu/drm/i915/i915_vma_types.h > index f5cb848b7a7e..bafec4e0b042 100644 > --- a/drivers/gpu/drm/i915/i915_vma_types.h > +++ b/drivers/gpu/drm/i915/i915_vma_types.h > @@ -190,6 +190,7 @@ struct i915_vma { > > u32 fence_size; > u32 fence_alignment; > + u32 guard; > > /** > * Count of the number of times this vma has been opened by different > -- > 2.20.1 -- Ville Syrjälä Intel _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx