Re: [RFC 4/4] drm/i915/gt: Pipelined page migration

Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxxxxxxxx> · Mon, 30 Nov 2020 13:12:55 +0000

On 28/11/2020 18:40, Chris Wilson wrote:
If we pipeline the PTE updates and then do the copy of those pages
within a single unpreemptible command packet, we can submit the copies
and leave them to be scheduled without having to synchronously wait
under a global lock.

Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
---
  drivers/gpu/drm/i915/Makefile                 |   1 +
  drivers/gpu/drm/i915/gt/intel_engine.h        |   1 +
  drivers/gpu/drm/i915/gt/intel_migrate.c       | 370 ++++++++++++++++++
  drivers/gpu/drm/i915/gt/intel_migrate.h       |  33 ++
  drivers/gpu/drm/i915/gt/selftest_migrate.c    | 105 +++++
  .../drm/i915/selftests/i915_live_selftests.h  |   1 +
  6 files changed, 511 insertions(+)
  create mode 100644 drivers/gpu/drm/i915/gt/intel_migrate.c
  create mode 100644 drivers/gpu/drm/i915/gt/intel_migrate.h
  create mode 100644 drivers/gpu/drm/i915/gt/selftest_migrate.c

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index e5574e506a5c..0b2e12c87f9d 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -103,6 +103,7 @@ gt-y += \
  	gt/intel_gtt.o \
  	gt/intel_llc.o \
  	gt/intel_lrc.o \
+	gt/intel_migrate.o \
  	gt/intel_mocs.o \
  	gt/intel_ppgtt.o \
  	gt/intel_rc6.o \
diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h
index ac58fcda4927..079d26b47a97 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine.h
@@ -188,6 +188,7 @@ intel_write_status_page(struct intel_engine_cs *engine, int reg, u32 value)
  #define I915_GEM_HWS_PREEMPT_ADDR	(I915_GEM_HWS_PREEMPT * sizeof(u32))
  #define I915_GEM_HWS_SEQNO		0x40
  #define I915_GEM_HWS_SEQNO_ADDR		(I915_GEM_HWS_SEQNO * sizeof(u32))
+#define I915_GEM_HWS_MIGRATE		(0x42 * sizeof(u32))
  #define I915_GEM_HWS_SCRATCH		0x80
  
  #define I915_HWS_CSB_BUF0_INDEX		0x10
diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c b/drivers/gpu/drm/i915/gt/intel_migrate.c
new file mode 100644
index 000000000000..4d7bd32eb8d4
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -0,0 +1,370 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2020 Intel Corporation
+ */
+
+#include "i915_drv.h"
+#include "intel_context.h"
+#include "intel_gt.h"
+#include "intel_gtt.h"
+#include "intel_lrc.h" /* virtual engine */
+#include "intel_migrate.h"
+#include "intel_ring.h"
+
+#define CHUNK_SZ SZ_8M /* ~1ms at 8GiB/s preemption delay */
+
+static void insert_pte(struct i915_address_space *vm,
+		       struct i915_page_table *pt,
+		       void *data)
+{
+	u64 *offset = data;
+
+	vm->insert_page(vm, px_dma(pt), *offset, I915_CACHE_NONE, 0);
+	*offset += PAGE_SIZE;
+}
+
+static struct i915_address_space *migrate_vm(struct intel_gt *gt)
+{
+	struct i915_vm_pt_stash stash = {};
+	struct i915_ppgtt *vm;
+	u64 offset, sz;
+	int err;
+
+	vm = i915_ppgtt_create(gt);
+	if (IS_ERR(vm))
+		return ERR_CAST(vm);
+
+	if (!vm->vm.allocate_va_range || !vm->vm.foreach) {
+		err = -ENODEV;
+		goto err_vm;
+	}
+
+	/*
+	 * We copy in 8MiB chunks. Each PDE covers 2MiB, so we need
+	 * 4x2 page directories for source/destination.
+	 */
+	sz = 2 * CHUNK_SZ;
+	offset = sz;
+
+	/*
+	 * We need another page directory setup so that we can write
+	 * the 8x512 PTE in each chunk.
+	 */
+	sz += (sz >> 12) * sizeof(u64);
+
+	err = i915_vm_alloc_pt_stash(&vm->vm, &stash, sz);
+	if (err)
+		goto err_vm;
+
+	err = i915_vm_pin_pt_stash(&vm->vm, &stash);
+	if (err) {
+		i915_vm_free_pt_stash(&vm->vm, &stash);
+		goto err_vm;
+	}
+
+	vm->vm.allocate_va_range(&vm->vm, &stash, 0, sz);
+	i915_vm_free_pt_stash(&vm->vm, &stash);
+
+	/* Now allow the GPU to rewrite the PTE via its own ppGTT */
+	vm->vm.foreach(&vm->vm, 0, sz, insert_pte, &offset);

This is just making the [0 - sz) gva point to the allocated sz bytes of 
backing store?

+
+	return &vm->vm;
+
+err_vm:
+	i915_vm_put(&vm->vm);
+	return ERR_PTR(err);
+}
+
+static struct intel_engine_cs *first_copy_engine(struct intel_gt *gt)
+{
+	struct intel_engine_cs *engine;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) {
+		engine = gt->engine_class[COPY_ENGINE_CLASS][i];
+		if (engine)
+			return engine;
+	}
+
+	return NULL;
+}
+
+static struct intel_context *pinned_context(struct intel_gt *gt)
+{
+	static struct lock_class_key key;
+	struct intel_engine_cs *engine;
+	struct i915_address_space *vm;
+	struct intel_context *ce;
+	int err;
+
+	engine = first_copy_engine(gt);
+	if (!engine)
+		return ERR_PTR(-ENODEV);
+
+	ce = intel_engine_create_pinned_context(engine, SZ_512K,
+						I915_GEM_HWS_MIGRATE,
+						&key, "migrate");
+	if (IS_ERR(ce))
+		return ce;
+
+	vm = migrate_vm(gt);
+	if (IS_ERR(vm)) {
+		err = PTR_ERR(vm);
+		goto err_ce;
+	}
+	i915_vm_put(ce->vm);
+	ce->vm = vm;
+
+	return ce;
+
+err_ce:
+	intel_context_put(ce);
+	return ERR_PTR(err);
+}
+
+int intel_migrate_init(struct intel_migrate *m, struct intel_gt *gt)
+{
+	struct intel_context *ce;
+
+	ce = pinned_context(gt);
+	if (IS_ERR(ce))
+		return PTR_ERR(ce);
+
+	m->ce = ce;
+	return 0;
+}
+
+static struct intel_context *__migrate_engines(struct intel_gt *gt)
+{
+	struct intel_engine_cs *engines[MAX_ENGINE_INSTANCE];
+	struct intel_engine_cs *engine;
+	unsigned int count, i;
+
+	count = 0;
+	for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) {
+		engine = gt->engine_class[COPY_ENGINE_CLASS][i];
+		if (engine)
+			engines[count++] = engine;
+	}
+
+	return intel_execlists_create_virtual(engines, count);
+}
+
+struct intel_context *intel_migrate_create_context(struct intel_migrate *m)
+{
+	struct intel_context *ce;
+
+	ce = __migrate_engines(m->ce->engine->gt);
+	if (IS_ERR(ce))
+		return ce;
+
+	ce->ring = __intel_context_ring_size(SZ_512K);
+
+	i915_vm_put(ce->vm);
+	ce->vm = i915_vm_get(m->ce->vm);
+
+	return ce;
+}
+
+static inline struct sgt_dma sg_sgt(struct scatterlist *sg)
+{
+	dma_addr_t addr = sg_dma_address(sg);
+
+	return (struct sgt_dma){ sg, addr, addr + sg_dma_len(sg) };
+}
+
+static int emit_pte(struct i915_request *rq,
+		    struct sgt_dma *it,
+		    u64 encode,
+		    int offset,
+		    int length)
+{
+	int total = 0;
+
+	offset >>= 12;
+	offset *= sizeof(u64);
+	offset += 2 * CHUNK_SZ;
+
+	do {
+		u32 *cs;
+
+		cs = intel_ring_begin(rq, 8);
+		if (IS_ERR(cs))
+			return PTR_ERR(cs);
+
+		*cs++ = MI_STORE_DWORD_IMM_GEN4;
+		*cs++ = offset;
+		*cs++ = 0;
+		*cs++ = lower_32_bits(encode | it->dma);
+		*cs++ = MI_STORE_DWORD_IMM_GEN4;
+		*cs++ = offset + 4;
+		*cs++ = 0;
+		*cs++ = upper_32_bits(encode | it->dma);
+		intel_ring_advance(rq, cs);
+
+		offset += 8;
+		total += I915_GTT_PAGE_SIZE;
+
+		it->dma += I915_GTT_PAGE_SIZE;
+		if (it->dma >= it->max) {
+			it->sg = __sg_next(it->sg);
+			if (!it->sg || sg_dma_len(it->sg) == 0)
+				break;
+
+			it->dma = sg_dma_address(it->sg);
+			it->max = it->dma + sg_dma_len(it->sg);
+		}
+
+		if (total == length)
+			break;
+	} while (1);
+
+	return total;
+}
+
+static bool wa_1209644611_applies(int gen, u32 size)
+{
+	u32 height = size >> PAGE_SHIFT;
+
+	if (gen != 11)
+		return false;
+
+	return height % 4 == 3 && height <= 8;
+}
+
+static int emit_copy(struct i915_request *rq, int size)
+{
+	const int gen = INTEL_GEN(rq->engine->i915);
+	u32 *cs;
+
+	cs = intel_ring_begin(rq, gen >= 8 ? 10 : 6);
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
+
+	if (gen >= 9 && !wa_1209644611_applies(gen, size)) {migrate_create_context(m);
+	if (IS_ERR(ce))
+		ce = intel_context_get(m->ce);
+	GEM_BUG_ON(IS_ERR(ce));
+
+	rq = intel_context_migrate_pages(ce, src, dst);
+
+		*cs++ = GEN9_XY_FAST_COPY_BLT_CMD | (10 - 2);
+		*cs++ = BLT_DEPTH_32 | PAGE_SIZE;
+		*cs++ = 0;
+		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
+		*cs++ = CHUNK_SZ; /* dst offset */
+		*cs++ = 0;
+		*cs++ = 0;
+		*cs++ = PAGE_SIZE;
+		*cs++ = 0; /* src offset */
+		*cs++ = 0;
+	} else if (gen >= 8) {
+		*cs++ = XY_SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (10 - 2);
+		*cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | PAGE_SIZE;
+		*cs++ = 0;
+		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
+		*cs++ = CHUNK_SZ; /* dst offset */
+		*cs++ = 0;
+		*cs++ = 0;
+		*cs++ = PAGE_SIZE;
+		*cs++ = 0; /* src offset */
+		*cs++ = 0;
+	} else {
+		*cs++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
+		*cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | PAGE_SIZE;
+		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE;
+		*cs++ = CHUNK_SZ; /* dst offset */
+		*cs++ = PAGE_SIZE;
+		*cs++ = 0; /* src offset */
+	}
+
+	intel_ring_advance(rq, cs);
+	return 0;
+}
+
+struct i915_request *
+intel_context_migrate_pages(struct intel_context *ce,
+			    struct scatterlist *src,
+			    struct scatterlist *dst)
+{
+	struct sgt_dma it_s = sg_sgt(src), it_d = sg_sgt(dst);
+	u64 encode = ce->vm->pte_encode(0, I915_CACHE_LLC, 0); /* flags */
+	struct i915_request *rq;
+	int len;
+	int err;
+
+	/* GEM_BUG_ON(ce->vm != migrate_vm); */
+
+	err = intel_context_pin(ce);
+	if (err)
+		return ERR_PTR(err);
+
+	GEM_BUG_ON(ce->ring->size < SZ_64K);
+
+	do {
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq)) {
+			err = PTR_ERR(rq);
+			goto out_ce;
+		}
+
+		len = emit_pte(rq, &it_s, encode, 0, CHUNK_SZ);
+		if (len <= 0) {
+			err = len;
+			goto out_rq;
+		}
+
+		if (emit_pte(rq, &it_d, encode, CHUNK_SZ, len) < len) {
+			err = -EINVAL;
+			goto out_rq;
+		}

Source and destination PTEs into the reserved [0, sz * 2) area?

+
+		err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
+		if (err)
+			goto out_rq;
+
+		err = emit_copy(rq, len);

Right so copy can use fixed offsets.

+		if (err)
+			goto out_rq;
+
+		if (!it_s.sg)
+			i915_request_get(rq);
+out_rq:
+		i915_request_add(rq);
+		if (it_s.sg)
+			cond_resched();

From what context does this run? No preemptible?

+	} while (err == 0 && it_s.sg);
+
+out_ce:
+	intel_context_unpin(ce);
+	return err ? ERR_PTR(err) : rq;
+}
+
+struct i915_request *
+intel_migrate_pages(struct intel_migrate *m,
+		    struct scatterlist *src,
+		    struct scatterlist *dst)
+{
+	struct intel_context *ce;
+	struct i915_request *rq;
+
+	if (!m->ce)
+		return ERR_PTR(-ENODEV);
+
+	ce = intel_migrate_create_context(m);
+	if (IS_ERR(ce))
+		ce = intel_context_get(m->ce);

If virtual cannot be create use a common pre-created context?

+	GEM_BUG_ON(IS_ERR(ce));
+
+	rq = intel_context_migrate_pages(ce, src, dst);
+
+	intel_context_put(ce);

Context is single use for some concrete reason? But it has fallback to a 
single context so not sure. Plan to allow using users context, or to 
inherit their priority so because of that?

...

So I guess overall this is an alternative to fixed vma windows but can 
be pipelined.

I did not get the foreach part. Why do you have to iterate existing 
entries to add entries? Can't you just populate the reserved range from 
the stashed bo dma addresses?

Regards,

Tvrtko

+	return rq;
+}
+
+void intel_migrate_fini(struct intel_migrate *m)
+{
+	if (!m->ce)
+		return;
+
+	intel_context_unpin(m->ce);
+	intel_context_put(m->ce);
+}
+
+#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
+#include "selftest_migrate.c" 0, CHUNK_SZ)
+#endif
diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.h b/drivers/gpu/drm/i915/gt/intel_migrate.h
new file mode 100644
index 000000000000..8c3c446fbd33
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2020 Intel Corporation
+ */
+
+#ifndef __INTEL_MIGRATE__
+#define __INTEL_MIGRATE__
+
+struct i915_request;
+struct intel_context;
+struct intel_gt;
+struct scatterlist;
+
+struct intel_migrate {
+	struct intel_context *ce;
+};
+
+int intel_migrate_init(struct intel_migrate *m, struct intel_gt *gt);
+
+struct i915_request *
+intel_migrate_pages(struct intel_migrate *m,
+		    struct scatterlist *src,
+		    struct scatterlist *dst);
+
+struct intel_context *intel_migrate_create_context(struct intel_migrate *m);
+struct i915_request *
+intel_context_migrate_pages(struct intel_context *ce,
+			    struct scatterlist *src,
+			    struct scatterlist *dst);
+
+void intel_migrate_fini(struct intel_migrate *m);
+
+#endif /* __INTEL_MIGRATE__ */
diff --git a/drivers/gpu/drm/i915/gt/selftest_migrate.c b/drivers/gpu/drm/i915/gt/selftest_migrate.c
new file mode 100644
index 000000000000..d5102058fe3b
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/selftest_migrate.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2020 Intel Corporation
+ */
+
+static int live_migrate(void *arg)
+{
+	struct intel_migrate *m = arg;
+	struct drm_i915_private *i915 = m->ce->engine->i915;
+	const unsigned int sizes[] = {
+		SZ_4K,
+		SZ_64K,
+		SZ_2M,
+		SZ_64M,
+		//SZ_2G,
+	};
+	int err = 0;
+	int i, j;
+
+	for (i = 0; i < ARRAY_SIZE(sizes); i++) {
+		struct drm_i915_gem_object *src, *dst;
+		struct i915_request *rq;
+		u32 *vaddr;
+
+		src = i915_gem_object_create_internal(i915, sizes[i]);
+		if (IS_ERR(src))
+			break;
+
+		vaddr = i915_gem_object_pin_map(src, I915_MAP_WC);
+		if (IS_ERR(vaddr)) {
+			i915_gem_object_put(src);
+			break;
+		}
+
+		for (j = 0; j < sizes[i] / sizeof(u32); j++)
+			vaddr[j] = j;
+		i915_gem_object_flush_map(src);
+
+		dst = i915_gem_object_create_internal(i915, sizes[i]);
+		if (IS_ERR(dst)) {
+			i915_gem_object_put(dst);
+			break;
+		}
+
+		vaddr = i915_gem_object_pin_map(dst, I915_MAP_WC);
+		if (IS_ERR(vaddr)) {
+			i915_gem_object_put(dst);
+			i915_gem_object_put(src);
+			break;
+		}
+
+		for (j = 0; j < sizes[i] / sizeof(u32); j++)
+			vaddr[j] = ~j;
+		i915_gem_object_flush_map(dst);
+
+		rq = intel_migrate_pages(m,
+					 src->mm.pages->sgl,
+					 dst->mm.pages->sgl);
+		if (IS_ERR(rq)) {
+			pr_err("Migration failed, size: %u\n", sizes[i]);
+			err = PTR_ERR(rq);
+		}
+
+		if (i915_request_wait(rq, 0, HZ) < 0) {
+			pr_err("Migration timed out, size: %u\n", sizes[i]);
+			err = -ETIME;
+		}
+		i915_request_put(rq);
+
+		for (j = 0; j < sizes[i] / sizeof(u32); j++) {
+			if (vaddr[j] != j) {
+				pr_err("Copy failed, size: %u, offset: %zu\n",
+				       sizes[i], j * sizeof(u32));
+				igt_hexdump(vaddr + round_down(j, 1024), 4096);
+				err = -EINVAL;
+				break;
+			}
+		}
+
+		i915_gem_object_put(dst);
+		i915_gem_object_put(src);
+		i915_gem_drain_freed_objects(i915);
+		if (err)
+			break;
+	}
+
+	return err;
+}
+
+int intel_migrate_live_selftests(struct drm_i915_private *i915)
+{
+	static const struct i915_subtest tests[] = {
+		SUBTEST(live_migrate),
+	};
+	struct intel_migrate m;
+	int err;
+
+	if (intel_migrate_init(&m, &i915->gt))
+		return 0;
+
+	err = i915_subtests(tests, &m);
+	intel_migrate_fini(&m);
+
+	return err;
+}
diff --git a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
index a92c0e9b7e6b..be5e0191eaea 100644
--- a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
+++ b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
@@ -26,6 +26,7 @@ selftest(gt_mocs, intel_mocs_live_selftests)
  selftest(gt_pm, intel_gt_pm_live_selftests)
  selftest(gt_heartbeat, intel_heartbeat_live_selftests)
  selftest(requests, i915_request_live_selftests)
+selftest(migrate, intel_migrate_live_selftests)
  selftest(active, i915_active_live_selftests)
  selftest(objects, i915_gem_object_live_selftests)
  selftest(mman, i915_gem_mman_live_selftests)

_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx