[PATCH] drm/i915: Convert hangcheck from a timer into a delayed work item

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



When run as a timer, i915_hangcheck_elapsed() must adhere to all the
rules of running in a softirq context. This is advantageous to us as we
want to minimise the risk that a driver bug will prevent us from
detecting a hung GPU. However, that is irrelevant if the driver bug
prevents us from resetting and recovering. Still it is prudent not to
rely on mutexes inside the checker, but given the coarseness of
dev->struct_mutex doing so is extremely hard.

Give in and run from a work queue, i.e. outside of softirq.

v2:

The conversion does have one significant change, from the use of
mod_timer to schedule_delayed_work, means that the time that we execute
the first hangcheck is fixed and not continually deferred by later work.
This has the advantage of not allowing userspace to fill the ring before
hangcheck can finally run. At the same time, it removes the ability for
the interrupt to defer the hangcheck as well. This is sensible for that
an interrupt is only for a single engine, whereas we perform hangcheck
globally, so whilst one ring may have hung, the other could be running
normally and preventing the hangcheck from firing.

Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
Cc: Jani Nikula <jani.nikula@xxxxxxxxx>
Cc: Daniel Vetter <dnaiel.vetter@xxxxxxxxx>
---
 drivers/gpu/drm/i915/i915_dma.c |  2 +-
 drivers/gpu/drm/i915/i915_drv.c |  2 +-
 drivers/gpu/drm/i915/i915_drv.h |  2 +-
 drivers/gpu/drm/i915/i915_gem.c |  2 +-
 drivers/gpu/drm/i915/i915_irq.c | 16 ++++++++--------
 5 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 6502ae2d7b7d..6a8e71cb2be8 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -1002,7 +1002,7 @@ int i915_driver_unload(struct drm_device *dev)
 	}
 
 	/* Free error state after interrupts are fully disabled. */
-	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
+	cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
 	cancel_work_sync(&dev_priv->gpu_error.work);
 	i915_destroy_error_state(dev);
 
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index f7cc6a9c14fd..ea9224a977c1 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1431,7 +1431,7 @@ static int intel_runtime_suspend(struct device *device)
 		return ret;
 	}
 
-	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
+	cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
 	intel_uncore_forcewake_reset(dev, false);
 	dev_priv->pm.suspended = true;
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index a920ca3789d8..e1f8ffcb2cf3 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1209,7 +1209,7 @@ struct i915_gpu_error {
 	/* Hang gpu twice in this window and your context gets banned */
 #define DRM_I915_CTX_BAN_PERIOD DIV_ROUND_UP(8*DRM_I915_HANGCHECK_PERIOD, 1000)
 
-	struct timer_list hangcheck_timer;
+	struct delayed_work hangcheck_work;
 
 	/* For reset and error_state handling. */
 	spinlock_t lock;
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index b37177afc3c0..3e80c777bf12 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4530,7 +4530,7 @@ i915_gem_suspend(struct drm_device *dev)
 							     DRIVER_MODESET);
 	mutex_unlock(&dev->struct_mutex);
 
-	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
+	cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
 	cancel_delayed_work_sync(&dev_priv->mm.retire_work);
 	flush_delayed_work(&dev_priv->mm.idle_work);
 
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 2ade9efe078c..d295f546b58d 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1274,7 +1274,6 @@ static void notify_ring(struct drm_device *dev,
 	atomic_inc(&engine->interrupts);
 
 	wake_up_all(&engine->irq_queue);
-	i915_queue_hangcheck(dev);
 }
 
 static void vlv_c0_read(struct drm_i915_private *dev_priv,
@@ -3189,9 +3188,11 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
  * we kick the ring. If we see no progress on three subsequent calls
  * we assume chip is wedged and try to fix it by resetting the chip.
  */
-static void i915_hangcheck_elapsed(unsigned long data)
+static void i915_hangcheck_elapsed(struct work_struct *work)
 {
-	struct drm_i915_private *dev_priv = (struct drm_i915_private *)data;
+	struct drm_i915_private *dev_priv =
+		container_of(work, typeof(*dev_priv),
+			     gpu_error.hangcheck_work.work);
 	struct intel_engine_cs *engine;
 	int i;
 	int busy_count = 0, rings_hung = 0;
@@ -3312,8 +3313,8 @@ void i915_queue_hangcheck(struct drm_device *dev)
 	if (!i915_module.enable_hangcheck)
 		return;
 
-	mod_timer(&to_i915(dev)->gpu_error.hangcheck_timer,
-		  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
+	schedule_delayed_work(&to_i915(dev)->gpu_error.hangcheck_work,
+			      round_jiffies_up_relative(DRM_I915_HANGCHECK_JIFFIES));
 }
 
 static void ibx_irq_reset(struct drm_device *dev)
@@ -4615,9 +4616,8 @@ void intel_irq_init(struct drm_device *dev)
 	else
 		dev_priv->rps.pm_events = GEN6_PM_RPS_EVENTS;
 
-	setup_timer(&dev_priv->gpu_error.hangcheck_timer,
-		    i915_hangcheck_elapsed,
-		    (unsigned long) dev_priv);
+	INIT_DELAYED_WORK(&dev_priv->gpu_error.hangcheck_work,
+			  i915_hangcheck_elapsed);
 	INIT_DELAYED_WORK(&dev_priv->hotplug_reenable_work,
 			  intel_hpd_irq_reenable);
 
-- 
2.1.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
http://lists.freedesktop.org/mailman/listinfo/intel-gfx




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]
  Powered by Linux