Re: [PATCH v9 23/23] drm/i915/vm_bind: Support capture of persistent mappings

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, Dec 13, 2022 at 12:03:07PM +0000, Matthew Auld wrote:
On 12/12/2022 23:15, Niranjana Vishwanathapura wrote:
Support dump capture of persistent mappings upon user request.

Capture of a mapping is requested with the VM_BIND ioctl and
processed during the GPU error handling, thus not adding any
additional latency to the submission path.

A list of persistent vmas requiring capture is maintained
instead of a list of vma resources. This allows for no
additional handling around eviction.

v2: enable with CONFIG_DRM_I915_CAPTURE_ERROR, remove gfp
    overwrite, add kernel-doc and expand commit message

Signed-off-by: Brian Welty <brian.welty@xxxxxxxxx>
Signed-off-by: Niranjana Vishwanathapura <niranjana.vishwanathapura@xxxxxxxxx>
---
 .../gpu/drm/i915/gem/i915_gem_vm_bind_object.c | 13 +++++++++++++
 drivers/gpu/drm/i915/gt/intel_gtt.c            |  5 +++++
 drivers/gpu/drm/i915/gt/intel_gtt.h            |  7 +++++++
 drivers/gpu/drm/i915/i915_gpu_error.c          | 18 +++++++++++++++++-
 drivers/gpu/drm/i915/i915_vma.c                |  4 ++++
 drivers/gpu/drm/i915/i915_vma_types.h          |  4 ++++
 include/uapi/drm/i915_drm.h                    |  9 +++++++--
 7 files changed, 57 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_vm_bind_object.c b/drivers/gpu/drm/i915/gem/i915_gem_vm_bind_object.c
index 78e7c0642c5f..562a67a988f2 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_vm_bind_object.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_vm_bind_object.c
@@ -88,6 +88,12 @@ static void i915_gem_vm_bind_remove(struct i915_vma *vma, bool release_obj)
 {
 	lockdep_assert_held(&vma->vm->vm_bind_lock);
+#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
+	mutex_lock(&vma->vm->vm_capture_lock);
+	if (!list_empty(&vma->vm_capture_link))
+		list_del_init(&vma->vm_capture_link);
+	mutex_unlock(&vma->vm->vm_capture_lock);
+#endif
 	spin_lock(&vma->vm->vm_rebind_lock);
 	if (!list_empty(&vma->vm_rebind_link))
 		list_del_init(&vma->vm_rebind_link);
@@ -357,6 +363,13 @@ static int i915_gem_vm_bind_obj(struct i915_address_space *vm,
 				continue;
 		}
+#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
+		if (va->flags & I915_GEM_VM_BIND_CAPTURE) {
+			mutex_lock(&vm->vm_capture_lock);
+			list_add_tail(&vma->vm_capture_link, &vm->vm_capture_list);
+			mutex_unlock(&vm->vm_capture_lock);
+		}
+#endif
 		list_add_tail(&vma->vm_bind_link, &vm->vm_bound_list);
 		i915_vm_bind_it_insert(vma, &vm->va);
 		if (!obj->priv_root)
diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c b/drivers/gpu/drm/i915/gt/intel_gtt.c
index 2e4c9fabf3b8..103ca55222be 100644
--- a/drivers/gpu/drm/i915/gt/intel_gtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gtt.c
@@ -297,6 +297,11 @@ void i915_address_space_init(struct i915_address_space *vm, int subclass)
 	spin_lock_init(&vm->vm_rebind_lock);
 	spin_lock_init(&vm->userptr_invalidated_lock);
 	INIT_LIST_HEAD(&vm->userptr_invalidated_list);
+
+#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
+	INIT_LIST_HEAD(&vm->vm_capture_list);
+	mutex_init(&vm->vm_capture_lock);
+#endif
 }
 void *__px_vaddr(struct drm_i915_gem_object *p)
diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.h b/drivers/gpu/drm/i915/gt/intel_gtt.h
index 620b4e020a9f..7f69e1d4fb5e 100644
--- a/drivers/gpu/drm/i915/gt/intel_gtt.h
+++ b/drivers/gpu/drm/i915/gt/intel_gtt.h
@@ -281,6 +281,13 @@ struct i915_address_space {
 	/** @root_obj: root object for dma-resv sharing by private objects */
 	struct drm_i915_gem_object *root_obj;
+#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
+	/* @vm_capture_list: list of vm captures */
+	struct list_head vm_capture_list;
+	/* @vm_capture_lock: protects vm_capture_list */
+	struct mutex vm_capture_lock;
+#endif
+
 	/* Global GTT */
 	bool is_ggtt:1;
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 9d5d5a397b64..76b2834ce958 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -1460,6 +1460,22 @@ capture_vma(struct intel_engine_capture_vma *next,
 	return next;
 }
+static struct intel_engine_capture_vma *
+capture_user_vm(struct intel_engine_capture_vma *capture,
+		struct i915_address_space *vm, gfp_t gfp)
+{
+	struct i915_vma *vma;
+
+	mutex_lock(&vm->vm_capture_lock);
+	/* vma->resource must be valid here as persistent vmas are bound */
+	list_for_each_entry(vma, &vm->vm_capture_list, vm_capture_link)
+		capture = capture_vma_snapshot(capture, vma->resource,

Thinking some more on this, I don't think we can actually do this. The vma->resource at this stage could be complete nonsense (could even be NULL?), if you consider pipelined migrations. For example, if we async evict something, the object state can be freely updated (maybe even more than once), even though the dma-resv is still active with fences. This is allowed since the actual move(s) will be pipelined and should respect those fences. In eb2 this is solved by holding the object lock and taking a snapshot of the vma at submit time, which should ensure we are capturing the correct vma->resource and sg_table. Maybe I'm missing something.


Thanks Matt.
I discussed it offline with Thomas. He suggested that we could probably
do a synchronous unbind for mappings with a capture request, so that the
vma->resource won't be removed under the hood while a request is active.
I will update the series with the fix.

Thanks,
Niranjana

+					       gfp, "user");
+	mutex_unlock(&vm->vm_capture_lock);
+
+	return capture;
+}
+
 static struct intel_engine_capture_vma *
 capture_user(struct intel_engine_capture_vma *capture,
 	     const struct i915_request *rq,
@@ -1471,7 +1487,7 @@ capture_user(struct intel_engine_capture_vma *capture,
 		capture = capture_vma_snapshot(capture, c->vma_res, gfp,
 					       "user");
-	return capture;
+	return capture_user_vm(capture, rq->context->vm, gfp);
 }
 static void add_vma(struct intel_engine_coredump *ee,
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index d092a86123ae..9be8aa448874 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -248,6 +248,10 @@ vma_create(struct drm_i915_gem_object *obj,
 	INIT_LIST_HEAD(&vma->non_priv_vm_bind_link);
 	INIT_LIST_HEAD(&vma->vm_rebind_link);
 	INIT_LIST_HEAD(&vma->userptr_invalidated_link);
+
+#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
+	INIT_LIST_HEAD(&vma->vm_capture_link);
+#endif
 	return vma;
 err_unlock:
diff --git a/drivers/gpu/drm/i915/i915_vma_types.h b/drivers/gpu/drm/i915/i915_vma_types.h
index 89f9854a6f69..c4fd61d51ce6 100644
--- a/drivers/gpu/drm/i915/i915_vma_types.h
+++ b/drivers/gpu/drm/i915/i915_vma_types.h
@@ -310,6 +310,10 @@ struct i915_vma {
 	struct list_head vm_rebind_link; /* Link in vm_rebind_list */
 	/** @userptr_invalidated_link: link to the vm->userptr_invalidated_list */
 	struct list_head userptr_invalidated_link;
+#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
+	/* @vm_capture_link: link to the captureable VMA list */
+	struct list_head vm_capture_link;
+#endif
 	/** Timeline fence for vm_bind completion notification */
 	struct {
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index b9167f950327..5fde6020e339 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -3925,12 +3925,17 @@ struct drm_i915_gem_vm_bind {
 	__u64 length;
 	/**
-	 * @flags: Currently reserved, MBZ.
+	 * @flags: Supported flags are:
+	 *
+	 * I915_GEM_VM_BIND_CAPTURE:
+	 * Capture this mapping in the dump upon GPU error.
+	 * CONFIG_DRM_I915_CAPTURE_ERROR should be enabled for valid capture.
 	 *
 	 * Note that @fence carries its own flags.
 	 */
 	__u64 flags;
-#define __I915_GEM_VM_BIND_UNKNOWN_FLAGS (~0ull)
+#define I915_GEM_VM_BIND_CAPTURE           (1ull << 0)
+#define __I915_GEM_VM_BIND_UNKNOWN_FLAGS   (-(I915_GEM_VM_BIND_CAPTURE << 1))
 	/** @rsvd: Reserved, MBZ */
 	__u64 rsvd[2];



[Index of Archives]     [Linux DRI Users]     [Linux Intel Graphics]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]     [XFree86]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Linux Kernel]     [Linux SCSI]     [XFree86]
  Powered by Linux