Re: [PATCH] drm/i915: Avoid uninterruptible spinning in debugfs

Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxxxxxxxx> · Fri, 1 Feb 2019 17:29:45 +0000

On 01/02/2019 10:52, Chris Wilson wrote:
If we get caught in a kernel bug, we may never idle. Let the user regain
control of their system^Wterminal by responding to SIGINT!

v2: Push the signal checking into the loop around flush_work.

Reported-by: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>
Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
Cc: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>
---
  drivers/gpu/drm/i915/i915_debugfs.c            | 14 ++++++++------
  drivers/gpu/drm/i915/i915_gem.c                | 10 +++++++---
  drivers/gpu/drm/i915/i915_utils.h              | 18 +++++++++++++++---
  .../gpu/drm/i915/selftests/i915_gem_context.c  |  5 ++++-
  4 files changed, 34 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index f3fa31d840f5..44fa34f4ebbc 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -3945,12 +3945,14 @@ i915_drop_caches_set(void *data, u64 val)
  		i915_gem_shrink_all(i915);
  	fs_reclaim_release(GFP_KERNEL);
  
-	if (val & DROP_IDLE) {
-		do {
-			if (READ_ONCE(i915->gt.active_requests))
-				flush_delayed_work(&i915->gt.retire_work);
-			drain_delayed_work(&i915->gt.idle_work);
-		} while (READ_ONCE(i915->gt.awake));
+	if (val & DROP_IDLE && READ_ONCE(i915->gt.awake)) {
+		if (drain_delayed_work_state(&i915->gt.retire_work,
+					     TASK_INTERRUPTIBLE) ||
+		    drain_delayed_work_state(&i915->gt.idle_work,
+					     TASK_INTERRUPTIBLE)) {
+			ret = -EINTR;
+			goto out;
+		}

Shrinker remains only killable so still not Ctrl-C in all cases here. 
Don't know, not hot not cold. Will re-think it next week.

  	}
  
  	if (val & DROP_FREED)
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 4067eeaee78a..e47d53e9b634 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4499,13 +4499,17 @@ int i915_gem_suspend(struct drm_i915_private *i915)
  	mutex_unlock(&i915->drm.struct_mutex);
  	i915_reset_flush(i915);
  
-	drain_delayed_work(&i915->gt.retire_work);
-
  	/*
  	 * As the idle_work is rearming if it detects a race, play safe and
  	 * repeat the flush until it is definitely idle.
  	 */
-	drain_delayed_work(&i915->gt.idle_work);
+	if (drain_delayed_work_state(&i915->gt.retire_work,
+				     TASK_INTERRUPTIBLE) ||
+	    drain_delayed_work_state(&i915->gt.idle_work,
+				     TASK_INTERRUPTIBLE)) {
+		ret = -EINTR;
+		goto err_unlock;
+	}

I'm no suspend expert but it sounds unexpected there would be signals 
involved in the process. Does it have an userspace component? We don't 
bother with interruptible mutex on this path either.

  
  	/*
  	 * Assert that we successfully flushed all the work and
diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h
index fcc751aa1ea8..6866b85e4239 100644
--- a/drivers/gpu/drm/i915/i915_utils.h
+++ b/drivers/gpu/drm/i915/i915_utils.h
@@ -25,6 +25,8 @@
  #ifndef __I915_UTILS_H
  #define __I915_UTILS_H
  
+#include <linux/sched/signal.h>
+
  #undef WARN_ON
  /* Many gcc seem to no see through this and fall over :( */
  #if 0
@@ -148,12 +150,22 @@ static inline void __list_del_many(struct list_head *head,
   * by requeueing itself. Note, that if the worker never cancels itself,
   * we will spin forever.
   */
-static inline void drain_delayed_work(struct delayed_work *dw)
+
+static inline int drain_delayed_work_state(struct delayed_work *dw, int state)
  {
  	do {
-		while (flush_delayed_work(dw))
-			;
+		do {
+			if (signal_pending_state(state, current))
+				return -EINTR;
+		} while (flush_delayed_work(dw));
  	} while (delayed_work_pending(dw));
+
+	return 0;
+}
+
+static inline void drain_delayed_work(struct delayed_work *dw)
+{
+	drain_delayed_work_state(dw, 0);

0 would be TASK_RUNNING. signal_pending_state bails out in this case so 
that's good.

  }
  
  static inline const char *yesno(bool v)
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
index 2b423128002c..a87998b90bf6 100644
--- a/drivers/gpu/drm/i915/selftests/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
@@ -1686,8 +1686,11 @@ static int __igt_switch_to_kernel_context(struct drm_i915_private *i915,
  		/* XXX Bonus points for proving we are the kernel context! */
  
  		mutex_unlock(&i915->drm.struct_mutex);
-		drain_delayed_work(&i915->gt.idle_work);
+		err = drain_delayed_work_state(&i915->gt.idle_work,
+					       TASK_KILLABLE);
  		mutex_lock(&i915->drm.struct_mutex);
+		if (err)
+			return -EINTR;
  	}
  
  	if (igt_flush_test(i915, I915_WAIT_LOCKED))


Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx