[PATCH 07/15] drm/i915: Defer default hardware context initialisation until first open

Dave Gordon <david.s.gordon@xxxxxxxxx> · Mon, 15 Jun 2015 19:36:25 +0100

In order to fully initialise the default contexts, we have to execute
batchbuffer commands on the GPU engines. But in the case of GuC-based
batch submission, we can't do that until any required firmware has
been loaded, which may not be possible during driver load, because the
filesystem(s) containing the firmware may not be mounted until later.

Therefore, we now allow the first call to the firmware-loading code to
return -EAGAIN to indicate that it's not yet ready, and that it should
be retried when the device is first opened from user code, by which
time we expect that all required filesystems will have been mounted.
The late-retry code will then re-attempt to load the firmware if the
early attempt failed.

If the late retry fails, the current open-in-progress will fail, but
the recovery code will disable GuC submission and reset the GPU and
driver. The next open will therefore be in non-GuC mode, and will be
allowed to complete even if the GuC cannot be loaded or used.

Issue: VIZ-4884
Signed-off-by: Dave Gordon <david.s.gordon@xxxxxxxxx>
Signed-off-by: Alex Dai <yu.dai@xxxxxxxxx>
---
 drivers/gpu/drm/i915/i915_drv.h         |    2 ++
 drivers/gpu/drm/i915/i915_gem.c         |    9 +++++-
 drivers/gpu/drm/i915/i915_gem_context.c |   52 ++++++++++++++++++++++++++++---
 drivers/gpu/drm/i915/i915_irq.c         |   48 ++++++++++++++++++++++++++++
 4 files changed, 105 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index f47cde7..a1fc278 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1837,6 +1837,7 @@ struct drm_i915_private {
 	/* hda/i915 audio component */
 	bool audio_component_registered;
 
+	bool contexts_ready;
 	uint32_t hw_context_size;
 	struct list_head context_list;
 
@@ -2614,6 +2615,7 @@ void i915_queue_hangcheck(struct drm_device *dev);
 __printf(3, 4)
 void i915_handle_error(struct drm_device *dev, bool wedged,
 		       const char *fmt, ...);
+void i915_handle_guc_error(struct drm_device *dev, int err);
 
 extern void intel_irq_init(struct drm_i915_private *dev_priv);
 extern void intel_hpd_init(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index cd4a865..d1a8862 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -5025,8 +5025,15 @@ i915_gem_init_hw(struct drm_device *dev)
 
 	/* We can't enable contexts until all firmware is loaded */
 	ret = intel_guc_ucode_load(dev, false);
+	if (ret == -EAGAIN) {
+		ret = 0;
+		goto out;		/* too early */
+	}
+
 	ret = i915_gem_context_enable(dev_priv);
-	if (ret && ret != -EIO) {
+	if (ret == 0) {
+		dev_priv->contexts_ready = true;
+	} else if (ret && ret != -EIO) {
 		DRM_ERROR("Context enable failed %d\n", ret);
 		i915_gem_cleanup_ringbuffer(dev);
 
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 133afcf..debbfc9 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -447,23 +447,65 @@ static int context_idr_cleanup(int id, void *p, void *data)
 	return 0;
 }
 
+/* Complete any late initialisation here */
+static int i915_gem_context_first_open(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	int ret;
+
+	/*
+	 * We can't enable contexts until all firmware is loaded. This
+	 * call shouldn't return -EAGAIN because we pass wait=true, but
+	 * it can still fail with code -EIO if the GuC doesn't respond,
+	 * or -ENOEXEC if the GuC firmware image is invalid.
+	 */
+	ret = intel_guc_ucode_load(dev, true);
+	WARN_ON(ret == -EAGAIN);
+
+	/*
+	 * If an error occurred and GuC submission has been requested, we can
+	 * attempt recovery by disabling GuC submission and reinitialising
+	 * the GPU and driver. We then fail this open() anyway, but the next
+	 * attempt will find that GuC submission is already disabled, and so
+	 * proceed to complete context initialisation in non-GuC mode instead.
+	 */
+	if (ret && i915.enable_guc_submission) {
+		i915_handle_guc_error(dev, ret);
+		return ret;
+	}
+
+	ret = i915_gem_context_enable(dev_priv);
+	if (ret == 0)
+		dev_priv->contexts_ready = true;
+	return ret;
+}
+
 int i915_gem_context_open(struct drm_device *dev, struct drm_file *file)
 {
+	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct drm_i915_file_private *file_priv = file->driver_priv;
 	struct intel_context *ctx;
+	int ret = 0;
 
 	idr_init(&file_priv->context_idr);
 
 	mutex_lock(&dev->struct_mutex);
-	ctx = i915_gem_create_context(dev, file_priv);
+
+	if (!dev_priv->contexts_ready)
+		ret = i915_gem_context_first_open(dev);
+
+	if (ret == 0) {
+		ctx = i915_gem_create_context(dev, file_priv);
+		if (IS_ERR(ctx))
+			ret = PTR_ERR(ctx);
+	}
+
 	mutex_unlock(&dev->struct_mutex);
 
-	if (IS_ERR(ctx)) {
+	if (ret)
 		idr_destroy(&file_priv->context_idr);
-		return PTR_ERR(ctx);
-	}
 
-	return 0;
+	return ret;
 }
 
 void i915_gem_context_close(struct drm_device *dev, struct drm_file *file)
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 56db9e74..f7dcf8d 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2665,6 +2665,54 @@ void i915_handle_error(struct drm_device *dev, bool wedged,
 	i915_reset_and_wakeup(dev);
 }
 
+/**
+ * i915_handle_guc_error - handle a GuC error
+ * @dev: drm device
+ *
+ * If the GuC can't be (re-)initialised, disable GuC submission and
+ * then reset and reinitialise the rest of the GPU, so that we can
+ * fall back to operating in ELSP mode. Don't bother capturing error
+ * state, because it probably isn't relevant here.
+ *
+ * Unlike i915_handle_error() above, this is called with the global
+ * struct_mutex held, so we need to release it after setting the
+ * reset-in-progress bit so that other threads can make progress,
+ * and reacquire it after the reset is complete.
+ */
+void i915_handle_guc_error(struct drm_device *dev, int err)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+
+	DRM_ERROR("GuC failure %d, disabling GuC submission\n", err);
+	i915.enable_guc_submission = false;
+
+	i915_report_and_clear_eir(dev);	/* unlikely? */
+
+	atomic_set_mask(I915_RESET_IN_PROGRESS_FLAG,
+			&dev_priv->gpu_error.reset_counter);
+
+	mutex_unlock(&dev->struct_mutex);
+
+	/*
+	 * Wakeup waiting processes so that the reset function
+	 * i915_reset_and_wakeup doesn't deadlock trying to grab
+	 * various locks. By bumping the reset counter first, the woken
+	 * processes will see a reset in progress and back off,
+	 * release their locks and then wait for the reset to complete.
+	 * We must do this for _all_ gpu waiters that might hold locks
+	 * that the reset work needs to acquire.
+	 *
+	 * Note: The wake_up serves as the required memory barrier to
+	 * ensure that the waiters see the updated value of the reset
+	 * counter atomic_t.
+	 */
+	i915_error_wake_up(dev_priv, false);
+
+	i915_reset_and_wakeup(dev);
+
+	mutex_lock(&dev->struct_mutex);
+}
+
 /* Called from drm generic code, passed 'crtc' which
  * we use as a pipe index
  */
-- 
1.7.9.5

_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
http://lists.freedesktop.org/mailman/listinfo/intel-gfx