On Tue, 2021-06-08 at 17:18 +0100, Matthew Auld wrote:
> On Tue, 8 Jun 2021 at 10:29, Thomas Hellström
> <thomas.hellstrom@xxxxxxxxxxxxxxx> wrote:
> >
> > From: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
> >
> > If we pipeline the PTE updates and then do the copy of those pages
> > within a single unpreemptible command packet, we can submit the copies
> > and leave them to be scheduled without having to synchronously wait
> > under a global lock. In order to manage migration, we need to
> > preallocate the page tables (and keep them pinned and available for use
> > at any time), causing a bottleneck for migrations as all clients must
> > contend on the limited resources. By inlining the ppGTT updates and
> > performing the blit atomically, each client only owns the PTE while in
> > use, and so we can reschedule individual operations however we see fit.
> > And most importantly, we do not need to take a global lock on the
> > shared vm, and wait until the operation is complete before releasing
> > the lock for others to claim the PTE for themselves.
> >
> > Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
> > Co-developed-by: Thomas Hellström <thomas.hellstrom@xxxxxxxxxxxxxxx>
> > Signed-off-by: Thomas Hellström <thomas.hellstrom@xxxxxxxxxxxxxxx>
> > ---
> >  drivers/gpu/drm/i915/Makefile                 |   1 +
> >  drivers/gpu/drm/i915/gt/intel_engine.h        |   1 +
> >  drivers/gpu/drm/i915/gt/intel_gpu_commands.h  |   2 +
> >  drivers/gpu/drm/i915/gt/intel_migrate.c       | 543 ++++++++++++++++++
> >  drivers/gpu/drm/i915/gt/intel_migrate.h       |  45 ++
> >  drivers/gpu/drm/i915/gt/intel_migrate_types.h |  15 +
> >  drivers/gpu/drm/i915/gt/intel_ring.h          |   1 +
> >  drivers/gpu/drm/i915/gt/selftest_migrate.c    | 291 ++++++++++
> >  .../drm/i915/selftests/i915_live_selftests.h  |   1 +
> >  9 files changed, 900 insertions(+)
> >  create mode 100644 drivers/gpu/drm/i915/gt/intel_migrate.c
> >  create mode 100644 drivers/gpu/drm/i915/gt/intel_migrate.h
> >  create mode 100644 drivers/gpu/drm/i915/gt/intel_migrate_types.h
> >  create mode 100644 drivers/gpu/drm/i915/gt/selftest_migrate.c
> >
> > diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> > index ea8ee4b3e018..9f18902be626 100644
> > --- a/drivers/gpu/drm/i915/Makefile
> > +++ b/drivers/gpu/drm/i915/Makefile
> > @@ -109,6 +109,7 @@ gt-y += \
> >         gt/intel_gtt.o \
> >         gt/intel_llc.o \
> >         gt/intel_lrc.o \
> > +       gt/intel_migrate.o \
> >         gt/intel_mocs.o \
> >         gt/intel_ppgtt.o \
> >         gt/intel_rc6.o \
> > diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h
> > index 0862c42b4cac..949965680c37 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_engine.h
> > +++ b/drivers/gpu/drm/i915/gt/intel_engine.h
> > @@ -188,6 +188,7 @@ intel_write_status_page(struct intel_engine_cs *engine, int reg, u32 value)
> >  #define I915_GEM_HWS_PREEMPT_ADDR      (I915_GEM_HWS_PREEMPT * sizeof(u32))
> >  #define I915_GEM_HWS_SEQNO             0x40
> >  #define I915_GEM_HWS_SEQNO_ADDR        (I915_GEM_HWS_SEQNO * sizeof(u32))
> > +#define I915_GEM_HWS_MIGRATE           (0x42 * sizeof(u32))
> >  #define I915_GEM_HWS_SCRATCH           0x80
> >
> >  #define I915_HWS_CSB_BUF0_INDEX        0x10
> > diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
> > index 2694dbb9967e..1c3af0fc0456 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
> > +++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
> > @@ -123,8 +123,10 @@
> >  #define MI_SEMAPHORE_SAD_NEQ_SDD       (5 << 12)
> >  #define MI_SEMAPHORE_TOKEN_MASK        REG_GENMASK(9, 5)
> >  #define MI_SEMAPHORE_TOKEN_SHIFT       5
> > +#define MI_STORE_DATA_IMM              MI_INSTR(0x20, 0)
> >  #define MI_STORE_DWORD_IMM             MI_INSTR(0x20, 1)
> >  #define MI_STORE_DWORD_IMM_GEN4        MI_INSTR(0x20, 2)
> > +#define MI_STORE_QWORD_IMM_GEN8        (MI_INSTR(0x20, 3) | REG_BIT(21))
> >  #define MI_MEM_VIRTUAL                 (1 << 22) /* 945,g33,965 */
> >  #define MI_USE_GGTT                    (1 << 22) /* g4x+ */
> >  #define MI_STORE_DWORD_INDEX           MI_INSTR(0x21, 1)
> > diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c b/drivers/gpu/drm/i915/gt/intel_migrate.c
> > new file mode 100644
> > index 000000000000..1f60f8ee36f8
> > --- /dev/null
> > +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
> > @@ -0,0 +1,543 @@
> > +// SPDX-License-Identifier: MIT
> > +/*
> > + * Copyright © 2020 Intel Corporation
> > + */
> > +
> > +#include "i915_drv.h"
> > +#include "intel_context.h"
> > +#include "intel_gpu_commands.h"
> > +#include "intel_gt.h"
> > +#include "intel_gtt.h"
> > +#include "intel_migrate.h"
> > +#include "intel_ring.h"
> > +
> > +struct insert_pte_data {
> > +       u64 offset;
> > +       bool is_lmem;
> > +};
> > +
> > +#define CHUNK_SZ SZ_8M /* ~1ms at 8GiB/s preemption delay */
> > +
> > +static bool engine_supports_migration(struct intel_engine_cs *engine)
> > +{
> > +       if (!engine)
> > +               return false;
> > +
> > +       /*
> > +        * We need the ability to prevent arbitration (MI_ARB_ON_OFF),
> > +        * the ability to write PTE using inline data (MI_STORE_DATA)
> > +        * and of course the ability to do the block transfer (blits).
> > +        */
> > +       GEM_BUG_ON(engine->class != COPY_ENGINE_CLASS);
> > +
> > +       return true;
> > +}
> > +
> > +static void insert_pte(struct i915_address_space *vm,
> > +                      struct i915_page_table *pt,
> > +                      void *data)
> > +{
> > +       struct insert_pte_data *d = data;
> > +
> > +       vm->insert_page(vm, px_dma(pt), d->offset, I915_CACHE_NONE,
> > +                       d->is_lmem ? PTE_LM : 0);
> > +       d->offset += PAGE_SIZE;
> > +}
> > +
> > +static struct i915_address_space *migrate_vm(struct intel_gt *gt)
> > +{
> > +       struct i915_vm_pt_stash stash = {};
> > +       struct i915_ppgtt *vm;
> > +       int err;
> > +       int i;
> > +
> > +       /*
> > +        * We construct a very special VM for use by all migration contexts,
> > +        * it is kept pinned so that it can be used at any time. As we need
> > +        * to pre-allocate the page directories for the migration VM, this
> > +        * limits us to only using a small number of prepared vma.
> > +        *
> > +        * To be able to pipeline and reschedule migration operations while
> > +        * avoiding unnecessary contention on the vm itself, the PTE updates
> > +        * are inline with the blits. All the blits use the same fixed
> > +        * addresses, with the backing store redirection being updated on the
> > +        * fly. Only 2 implicit vma are used for all migration operations.
> > +        *
> > +        * We lay the ppGTT out as:
> > +        *
> > +        *      [0, CHUNK_SZ) -> first object
> > +        *      [CHUNK_SZ, 2 * CHUNK_SZ) -> second object
> > +        *      [2 * CHUNK_SZ, 2 * CHUNK_SZ + 2 * CHUNK_SZ >> 9] -> PTE
> > +        *
> > +        * By exposing the dma addresses of the page directories themselves
> > +        * within the ppGTT, we are then able to rewrite the PTE prior to use.
> > +        * But the PTE update and subsequent migration operation must be
> > +        * atomic, i.e. within the same non-preemptible window so that we do
> > +        * not switch to another migration context that overwrites the PTE.
> > +        */
> > +
> > +       vm = i915_ppgtt_create(gt);
> > +       if (IS_ERR(vm))
> > +               return ERR_CAST(vm);
> > +
> > +       if (!vm->vm.allocate_va_range || !vm->vm.foreach) {
> > +               err = -ENODEV;
> > +               goto err_vm;
> > +       }
> > +
> > +       /*
> > +        * Each engine instance is assigned its own chunk in the VM, so
> > +        * that we can run multiple instances concurrently
> > +        */
> > +       for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) {
> > +               struct intel_engine_cs *engine;
> > +               u64 base = (u64)i << 32;
> > +               struct insert_pte_data d = {};
> > +               struct i915_gem_ww_ctx ww;
> > +               u64 sz;
> > +
> > +               engine = gt->engine_class[COPY_ENGINE_CLASS][i];
> > +               if (!engine_supports_migration(engine))
> > +                       continue;
> > +
> > +               /*
> > +                * We copy in 8MiB chunks. Each PDE covers 2MiB, so we need
> > +                * 4x2 page directories for source/destination.
> > +                */
> > +               sz = 2 * CHUNK_SZ;
> > +               d.offset = base + sz;
> > +
> > +               /*
> > +                * We need another page directory setup so that we can write
> > +                * the 8x512 PTE in each chunk.
> > +                */
> > +               sz += (sz >> 12) * sizeof(u64);
> > +
> > +               err = i915_vm_alloc_pt_stash(&vm->vm, &stash, sz);
> > +               if (err)
> > +                       goto err_vm;
> > +
> > +               for_i915_gem_ww(&ww, err, true) {
> > +                       err = i915_vm_lock_objects(&vm->vm, &ww);
> > +                       if (err)
> > +                               continue;
> > +                       err = i915_vm_map_pt_stash(&vm->vm, &stash);
> > +                       if (err)
> > +                               continue;
> > +
> > +                       vm->vm.allocate_va_range(&vm->vm, &stash, base, base + sz);
> > +               }
> > +               i915_vm_free_pt_stash(&vm->vm, &stash);
> > +               if (err)
> > +                       goto err_vm;
> > +
> > +               /* Now allow the GPU to rewrite the PTE via its own ppGTT */
> > +               d.is_lmem = i915_gem_object_is_lmem(vm->vm.scratch[0]);
> > +               vm->vm.foreach(&vm->vm, base, base + sz, insert_pte, &d);
>
> Will this play nice with Xe HP where we have min page size
> restrictions for the GTT + lmem? The page-table allocations sidestep
> such restrictions since they were previously never inserted into the
> GTT. Maybe we need the flat-ppGTT at some point? Perhaps add a TODO or
> something?

I've previously discussed that with Chris, and apparently it should be
as easy as adjusting the sz >> 12 above. But yeah, Chris also brought
up the flat-ppGTT idea as something we should look into. Meanwhile, we
should add a TODO.

/Thomas
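For readers following the sizing discussion: the arithmetic Thomas refers to
is the per-engine window setup in migrate_vm() above, where sz = 2 * CHUNK_SZ
reserves one CHUNK_SZ window for the source and one for the destination, and
sz += (sz >> 12) * sizeof(u64) then reserves room for the PTEs that map those
two windows, with the >> 12 hard-coding a 4KiB GTT page. Below is a minimal
sketch, not part of the patch, of how that reservation could be parameterised
on a minimum page size; the min_page_shift parameter and the helper name are
hypothetical.

```c
/*
 * Illustrative sketch only -- not from the patch above. It restates the
 * window sizing from migrate_vm() with a hypothetical min_page_shift
 * standing in for the hard-coded 4KiB (1 << 12) assumption that the
 * review comment suggests adjusting.
 */
#include <linux/sizes.h>
#include <linux/types.h>

#define CHUNK_SZ SZ_8M /* per-object copy window, as in intel_migrate.c */

static u64 migrate_va_window(unsigned int min_page_shift)
{
	/* One CHUNK_SZ window for the source and one for the destination. */
	u64 sz = 2 * CHUNK_SZ;

	/*
	 * Reserve space for the PTEs that map those two windows: one
	 * sizeof(u64) entry per (1 << min_page_shift) bytes of window.
	 * With min_page_shift == 12 this is exactly the sz >> 12 term in
	 * the patch; a 64KiB minimum page size would use sz >> 16 instead.
	 */
	sz += (sz >> min_page_shift) * sizeof(u64);

	return sz;
}
```

This only covers the VA-window arithmetic; whether the page-table pages
themselves satisfy the lmem minimum-page-size rules once they are mapped into
the GTT is the separate concern behind the flat-ppGTT suggestion in the thread.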