On 30/11/2020 13:39, Chris Wilson wrote:
Quoting Tvrtko Ursulin (2020-11-30 13:12:55)
On 28/11/2020 18:40, Chris Wilson wrote:
If we pipeline the PTE updates and then do the copy of those pages
within a single unpreemptible command packet, we can submit the copies
and leave them to be scheduled without having to synchronously wait
under a global lock.
Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
---
drivers/gpu/drm/i915/Makefile | 1 +
drivers/gpu/drm/i915/gt/intel_engine.h | 1 +
drivers/gpu/drm/i915/gt/intel_migrate.c | 370 ++++++++++++++++++
drivers/gpu/drm/i915/gt/intel_migrate.h | 33 ++
drivers/gpu/drm/i915/gt/selftest_migrate.c | 105 +++++
.../drm/i915/selftests/i915_live_selftests.h | 1 +
6 files changed, 511 insertions(+)
create mode 100644 drivers/gpu/drm/i915/gt/intel_migrate.c
create mode 100644 drivers/gpu/drm/i915/gt/intel_migrate.h
create mode 100644 drivers/gpu/drm/i915/gt/selftest_migrate.c
diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index e5574e506a5c..0b2e12c87f9d 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -103,6 +103,7 @@ gt-y += \
gt/intel_gtt.o \
gt/intel_llc.o \
gt/intel_lrc.o \
+ gt/intel_migrate.o \
gt/intel_mocs.o \
gt/intel_ppgtt.o \
gt/intel_rc6.o \
diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h
index ac58fcda4927..079d26b47a97 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine.h
@@ -188,6 +188,7 @@ intel_write_status_page(struct intel_engine_cs *engine, int reg, u32 value)
#define I915_GEM_HWS_PREEMPT_ADDR (I915_GEM_HWS_PREEMPT * sizeof(u32))
#define I915_GEM_HWS_SEQNO 0x40
#define I915_GEM_HWS_SEQNO_ADDR (I915_GEM_HWS_SEQNO * sizeof(u32))
+#define I915_GEM_HWS_MIGRATE (0x42 * sizeof(u32))
#define I915_GEM_HWS_SCRATCH 0x80
#define I915_HWS_CSB_BUF0_INDEX 0x10
diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c b/drivers/gpu/drm/i915/gt/intel_migrate.c
new file mode 100644
index 000000000000..4d7bd32eb8d4
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -0,0 +1,370 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2020 Intel Corporation
+ */
+
+#include "i915_drv.h"
+#include "intel_context.h"
+#include "intel_gt.h"
+#include "intel_gtt.h"
+#include "intel_lrc.h" /* virtual engine */
+#include "intel_migrate.h"
+#include "intel_ring.h"
+
+#define CHUNK_SZ SZ_8M /* ~1ms at 8GiB/s preemption delay */
+
+static void insert_pte(struct i915_address_space *vm,
+ struct i915_page_table *pt,
+ void *data)
+{
+ u64 *offset = data;
+
+ vm->insert_page(vm, px_dma(pt), *offset, I915_CACHE_NONE, 0);
+ *offset += PAGE_SIZE;
+}
+
+static struct i915_address_space *migrate_vm(struct intel_gt *gt)
+{
+ struct i915_vm_pt_stash stash = {};
+ struct i915_ppgtt *vm;
+ u64 offset, sz;
+ int err;
+
+ vm = i915_ppgtt_create(gt);
+ if (IS_ERR(vm))
+ return ERR_CAST(vm);
+
+ if (!vm->vm.allocate_va_range || !vm->vm.foreach) {
+ err = -ENODEV;
+ goto err_vm;
+ }
+
+ /*
+ * We copy in 8MiB chunks. Each PDE covers 2MiB, so we need
+ * 4x2 page directories for source/destination.
+ */
+ sz = 2 * CHUNK_SZ;
+ offset = sz;
+
+ /*
+ * We need another page directory setup so that we can write
+ * the 8x512 PTE in each chunk.
+ */
+ sz += (sz >> 12) * sizeof(u64);
+
+ err = i915_vm_alloc_pt_stash(&vm->vm, &stash, sz);
+ if (err)
+ goto err_vm;
+
+ err = i915_vm_pin_pt_stash(&vm->vm, &stash);
+ if (err) {
+ i915_vm_free_pt_stash(&vm->vm, &stash);
+ goto err_vm;
+ }
+
+ vm->vm.allocate_va_range(&vm->vm, &stash, 0, sz);
+ i915_vm_free_pt_stash(&vm->vm, &stash);
+
+ /* Now allow the GPU to rewrite the PTE via its own ppGTT */
+ vm->vm.foreach(&vm->vm, 0, sz, insert_pte, &offset);
This is just making the [0 - sz) gva point to the allocated sz bytes of
backing store?
Not quite, we are pointing [offset, sz) to the page directories
themselves. When we write into that range, we are modifying the PTE of
this ppGTT. (Breaking the fourth wall, allowing the non-privileged
context to update its own page tables.)
Confusing, so the first step here is making the [0, sz) gva PTEs point to the PTE backing store.

Which means emit_pte, when called from intel_context_migrate_pages, is emitting store-dwords which write into the [0, sz) gva range, effectively placing the src and dest object content at those locations.

Then the blit copies the data.

What happens if the context is re-used? The gvas no longer point to the PTEs, so how does that work? Is a step to re-initialize after use missing? More store-dwords after the blit copy to make them point back to the PTEs?
This vm could be engine->vm, but I don't think we actually save anything
by keeping them independent, and we gain the safety of not futzing with
the kernel_context.
Remind me that kernel_vm can share their scratches, even when read-only
is not supported.
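To make that layout concrete, here is a small standalone sketch of the arithmetic only (plain userspace C, illustrative names, not the patch's code). It assumes vm.foreach() visits the leaf page tables for [0, sz) in ascending order, so that insert_pte() lays them out contiguously starting at offset; under that assumption, the PTE for any gva in the two copy windows sits at a fixed, computable slot that a store-dword can target:

/*
 * Standalone model of the layout discussed above -- not the patch's
 * code, and the names are made up.
 */
#include <stdint.h>
#include <stdio.h>

#define CHUNK_SZ	(8ull << 20)	/* SZ_8M, ~1ms of copying at 8GiB/s */
#define PAGE_SZ		4096ull
#define PTE_SZ		8ull		/* sizeof(gen8 PTE) */

int main(void)
{
	/* [0, CHUNK_SZ) is the source window, [CHUNK_SZ, 2 * CHUNK_SZ) the destination. */
	uint64_t sz = 2 * CHUNK_SZ;

	/* The page tables backing [0, sz) become writable through this window. */
	uint64_t window_start = sz;
	uint64_t window_size = (sz >> 12) * PTE_SZ;	/* matches the stash sizing above */

	/* Where a store must land to rewrite the PTE for a given gva. */
	uint64_t gva = CHUNK_SZ + PAGE_SZ;	/* e.g. second page of the dst window */
	uint64_t pte_slot = window_start + (gva >> 12) * PTE_SZ;

	printf("PTE window: [%#llx, %#llx)\n",
	       (unsigned long long)window_start,
	       (unsigned long long)(window_start + window_size));
	printf("PTE for gva %#llx is written at gva %#llx\n",
	       (unsigned long long)gva, (unsigned long long)pte_slot);

	return 0;
}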
+
+ return &vm->vm;
+
+err_vm:
+ i915_vm_put(&vm->vm);
+ return ERR_PTR(err);
+}
+
+struct i915_request *
+intel_context_migrate_pages(struct intel_context *ce,
+ struct scatterlist *src,
+ struct scatterlist *dst)
+{
+ struct sgt_dma it_s = sg_sgt(src), it_d = sg_sgt(dst);
+ u64 encode = ce->vm->pte_encode(0, I915_CACHE_LLC, 0); /* flags */
+ struct i915_request *rq;
+ int len;
+ int err;
+
+ /* GEM_BUG_ON(ce->vm != migrate_vm); */
+
+ err = intel_context_pin(ce);
+ if (err)
+ return ERR_PTR(err);
+
+ GEM_BUG_ON(ce->ring->size < SZ_64K);
+
+ do {
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq)) {
+ err = PTR_ERR(rq);
+ goto out_ce;
+ }
+
+ len = emit_pte(rq, &it_s, encode, 0, CHUNK_SZ);
+ if (len <= 0) {
+ err = len;
+ goto out_rq;
+ }
+
+ if (emit_pte(rq, &it_d, encode, CHUNK_SZ, len) < len) {
+ err = -EINVAL;
+ goto out_rq;
+ }
Source and destination PTEs into the reserved [0, sz * 2) area?
Yes.
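For completeness, since emit_pte() itself is not quoted in this mail, here is a rough userspace model of what the two passes amount to (illustrative names only): the first pass covers up to CHUNK_SZ of the source and returns how much it actually covered, the second covers the same length of the destination starting at CHUNK_SZ, which is why the copy can then run over fixed windows:

/*
 * Rough model of the two emit_pte() passes -- it only mirrors what the
 * call sites imply, filling a plain array that stands in for the
 * mapped page tables.
 */
#include <stdint.h>

#define PAGE_SZ		4096ull
#define CHUNK_SZ	(8ull << 20)

struct fake_sg {
	uint64_t dma;		/* start of the remaining dma run */
	uint64_t length;	/* bytes left in the scatterlist */
};

/* Cover up to 'limit' bytes of 'it' at gva 'offset'; return bytes covered. */
static uint64_t model_emit_pte(uint64_t *ptes, struct fake_sg *it,
			       uint64_t encode, uint64_t offset, uint64_t limit)
{
	uint64_t total = 0;

	while (it->length && total < limit) {
		ptes[(offset + total) / PAGE_SZ] = encode | it->dma;
		it->dma += PAGE_SZ;
		it->length -= PAGE_SZ;
		total += PAGE_SZ;
	}

	return total;
}

int main(void)
{
	static uint64_t ptes[2 * CHUNK_SZ / PAGE_SZ];
	struct fake_sg src = { .dma = 0x100000000ull, .length = 3 * PAGE_SZ };
	struct fake_sg dst = { .dma = 0x200000000ull, .length = 3 * PAGE_SZ };
	uint64_t encode = 0x3;	/* stand-in for vm->pte_encode() flags */
	uint64_t len;

	/* First pass: source pages become visible at gva [0, len) ... */
	len = model_emit_pte(ptes, &src, encode, 0, CHUNK_SZ);

	/* ... second pass: destination pages at gva [CHUNK_SZ, CHUNK_SZ + len). */
	model_emit_pte(ptes, &dst, encode, CHUNK_SZ, len);

	/* emit_copy(rq, len) then always copies gva 0 -> gva CHUNK_SZ. */
	return len ? 0 : 1;
}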
+
+ err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
+ if (err)
+ goto out_rq;
+
+ err = emit_copy(rq, len);
Right, so the copy can use fixed offsets.
+ if (err)
+ goto out_rq;
+
+ if (!it_s.sg)
+ i915_request_get(rq);
+out_rq:
+ i915_request_add(rq);
+ if (it_s.sg)
+ cond_resched();
From what context does this run? Non-preemptible?
Has to be process context; numerous allocations, implicit waits (that we
want to avoid in practice), and the timeline (per-context) mutex to
guard access to the ringbuffer.
I guess on a non-preemptible, or not fully preemptible, kernel it is useful.
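And just to spell out the intended use from process context, a hypothetical synchronous caller might look like the sketch below. Only intel_context_migrate_pages() is taken from the patch; how the migrate context is obtained is not shown here, and the helper name is made up.

/* Hypothetical synchronous caller -- a sketch, not part of the patch. */
#include "i915_request.h"
#include "gt/intel_migrate.h"

static int copy_pages_sync(struct intel_context *ce,
			   struct scatterlist *src,
			   struct scatterlist *dst)
{
	struct i915_request *rq;
	long timeout;

	rq = intel_context_migrate_pages(ce, src, dst);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	/* The copies are pipelined; waiting here is the caller's choice. */
	timeout = i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
	i915_request_put(rq);	/* drop the reference returned for the last chunk */

	return timeout < 0 ? timeout : 0;
}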
Regards,
Tvrtko