Re: [PATCH 2/3] drm/i915: Add control flags to i915_handle_error()

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 16/03/18 14:50, Chris Wilson wrote:
Not all callers want the GPU error to handled in the same way, so expose
a control parameter. In the first instance, some callers do not want the
heavyweight error capture so add a bit to request the state to be
captured and saved.

Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
Cc: Jeff McGee <jeff.mcgee@xxxxxxxxx>
Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxx>
---
  drivers/gpu/drm/i915/i915_debugfs.c              |  4 ++--
  drivers/gpu/drm/i915/i915_drv.h                  |  4 +++-
  drivers/gpu/drm/i915/i915_irq.c                  | 22 ++++++++++++++--------
  drivers/gpu/drm/i915/intel_hangcheck.c           |  6 +++---
  drivers/gpu/drm/i915/selftests/intel_hangcheck.c |  2 +-
  5 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 5378863e3238..03b74a92caed 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -3952,8 +3952,8 @@ i915_wedged_set(void *data, u64 val)
  		engine->hangcheck.stalled = true;
  	}
- i915_handle_error(i915, val, "Manually set wedged engine mask = %llx",
-			  val);
+	i915_handle_error(i915, val, I915_ERROR_CAPTURE,
+			  "Manually set wedged engine mask = %llx", val);
wait_on_bit(&i915->gpu_error.flags,
  		    I915_RESET_HANDOFF,
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index e27ba8fb64e6..53009ba50640 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2751,10 +2751,12 @@ static inline void i915_queue_hangcheck(struct drm_i915_private *dev_priv)
  			   &dev_priv->gpu_error.hangcheck_work, delay);
  }
-__printf(3, 4)
+__printf(4, 5)
  void i915_handle_error(struct drm_i915_private *dev_priv,
  		       u32 engine_mask,
+		       unsigned long flags,
  		       const char *fmt, ...);
+#define I915_ERROR_CAPTURE BIT(0)
Should this be in the new i915_gpu_error.h?

  extern void intel_irq_init(struct drm_i915_private *dev_priv);
  extern void intel_irq_fini(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 44eef355e12c..b3a4dc7cb26c 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2955,6 +2955,7 @@ static void i915_clear_error_registers(struct drm_i915_private *dev_priv)
   * i915_handle_error - handle a gpu error
   * @dev_priv: i915 device private
   * @engine_mask: mask representing engines that are hung
+ * @flags: control flags
   * @fmt: Error message format string
   *
   * Do some basic checking of register state at error time and
@@ -2965,16 +2966,11 @@ static void i915_clear_error_registers(struct drm_i915_private *dev_priv)
   */
  void i915_handle_error(struct drm_i915_private *dev_priv,
  		       u32 engine_mask,
+		       unsigned long flags,
  		       const char *fmt, ...)
  {
  	struct intel_engine_cs *engine;
  	unsigned int tmp;
-	va_list args;
-	char error_msg[80];
-
-	va_start(args, fmt);
-	vscnprintf(error_msg, sizeof(error_msg), fmt, args);
-	va_end(args);
/*
  	 * In most cases it's guaranteed that we get here with an RPM
@@ -2986,8 +2982,18 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
  	intel_runtime_pm_get(dev_priv);
engine_mask &= INTEL_INFO(dev_priv)->ring_mask;
-	i915_capture_error_state(dev_priv, engine_mask, error_msg);
-	i915_clear_error_registers(dev_priv);
+
+	if (flags & I915_ERROR_CAPTURE) {
+		char error_msg[80];
+		va_list args;
+
+		va_start(args, fmt);
+		vscnprintf(error_msg, sizeof(error_msg), fmt, args);
+		va_end(args);
+
+		i915_capture_error_state(dev_priv, engine_mask, error_msg);
+		i915_clear_error_registers(dev_priv);
+	}
        else
	    DRM_INFO or DRM_NOTE the error_msg  ?

Otherwise the 'kicking wait/semaphore' text from below will never be printed.

/*
  	 * Try engine reset when available. We fall back to full reset if
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index 42e45ae87393..13d1a269c771 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -246,7 +246,7 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
  	 */
  	tmp = I915_READ_CTL(engine);
  	if (tmp & RING_WAIT) {
-		i915_handle_error(dev_priv, 0,
+		i915_handle_error(dev_priv, 0, 0,
  				  "Kicking stuck wait on %s",
  				  engine->name);
  		I915_WRITE_CTL(engine, tmp);
@@ -258,7 +258,7 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
  		default:
  			return ENGINE_DEAD;
  		case 1:
-			i915_handle_error(dev_priv, 0,
+			i915_handle_error(dev_priv, 0, 0,
  					  "Kicking stuck semaphore on %s",
  					  engine->name);
  			I915_WRITE_CTL(engine, tmp);
@@ -392,7 +392,7 @@ static void hangcheck_declare_hang(struct drm_i915_private *i915,
  				 "%s, ", engine->name);
  	msg[len-2] = '\0';
- return i915_handle_error(i915, hung, "%s", msg);
+	return i915_handle_error(i915, hung, I915_ERROR_CAPTURE, "%s", msg);
  }
/*
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index df7898c8edcb..84aad2b4825d 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -1084,7 +1084,7 @@ static int igt_handle_error(void *arg)
  	engine->hangcheck.stalled = true;
  	engine->hangcheck.seqno = intel_engine_get_seqno(engine);
- i915_handle_error(i915, intel_engine_flag(engine), "%s", __func__);
+	i915_handle_error(i915, intel_engine_flag(engine), 0, "%s", __func__);
xchg(&i915->gpu_error.first_error, error);
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]
  Powered by Linux