On 01/02/2019 10:52, Chris Wilson wrote:
If we get caught in a kernel bug, we may never idle. Let the user regain
control of their system^Wterminal by responding to SIGINT!
v2: Push the signal checking into the loop around flush_work.
Reported-by: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>
Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
Cc: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>
---
drivers/gpu/drm/i915/i915_debugfs.c | 14 ++++++++------
drivers/gpu/drm/i915/i915_gem.c | 10 +++++++---
drivers/gpu/drm/i915/i915_utils.h | 18 +++++++++++++++---
.../gpu/drm/i915/selftests/i915_gem_context.c | 5 ++++-
4 files changed, 34 insertions(+), 13 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index f3fa31d840f5..44fa34f4ebbc 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -3945,12 +3945,14 @@ i915_drop_caches_set(void *data, u64 val)
i915_gem_shrink_all(i915);
fs_reclaim_release(GFP_KERNEL);
- if (val & DROP_IDLE) {
- do {
- if (READ_ONCE(i915->gt.active_requests))
- flush_delayed_work(&i915->gt.retire_work);
- drain_delayed_work(&i915->gt.idle_work);
- } while (READ_ONCE(i915->gt.awake));
+ if (val & DROP_IDLE && READ_ONCE(i915->gt.awake)) {
+ if (drain_delayed_work_state(&i915->gt.retire_work,
+ TASK_INTERRUPTIBLE) ||
+ drain_delayed_work_state(&i915->gt.idle_work,
+ TASK_INTERRUPTIBLE)) {
+ ret = -EINTR;
+ goto out;
+ }
Shrinker remains only killable so still not Ctrl-C in all cases here.
Don't know, not hot not cold. Will re-think it next week.
}
if (val & DROP_FREED)
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 4067eeaee78a..e47d53e9b634 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4499,13 +4499,17 @@ int i915_gem_suspend(struct drm_i915_private *i915)
mutex_unlock(&i915->drm.struct_mutex);
i915_reset_flush(i915);
- drain_delayed_work(&i915->gt.retire_work);
-
/*
* As the idle_work is rearming if it detects a race, play safe and
* repeat the flush until it is definitely idle.
*/
- drain_delayed_work(&i915->gt.idle_work);
+ if (drain_delayed_work_state(&i915->gt.retire_work,
+ TASK_INTERRUPTIBLE) ||
+ drain_delayed_work_state(&i915->gt.idle_work,
+ TASK_INTERRUPTIBLE)) {
+ ret = -EINTR;
+ goto err_unlock;
+ }
I'm no suspend expert but it sounds unexpected there would be signals
involved in the process. Does it have an userspace component? We don't
bother with interruptible mutex on this path either.
/*
* Assert that we successfully flushed all the work and
diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h
index fcc751aa1ea8..6866b85e4239 100644
--- a/drivers/gpu/drm/i915/i915_utils.h
+++ b/drivers/gpu/drm/i915/i915_utils.h
@@ -25,6 +25,8 @@
#ifndef __I915_UTILS_H
#define __I915_UTILS_H
+#include <linux/sched/signal.h>
+
#undef WARN_ON
/* Many gcc seem to no see through this and fall over :( */
#if 0
@@ -148,12 +150,22 @@ static inline void __list_del_many(struct list_head *head,
* by requeueing itself. Note, that if the worker never cancels itself,
* we will spin forever.
*/
-static inline void drain_delayed_work(struct delayed_work *dw)
+
+static inline int drain_delayed_work_state(struct delayed_work *dw, int state)
{
do {
- while (flush_delayed_work(dw))
- ;
+ do {
+ if (signal_pending_state(state, current))
+ return -EINTR;
+ } while (flush_delayed_work(dw));
} while (delayed_work_pending(dw));
+
+ return 0;
+}
+
+static inline void drain_delayed_work(struct delayed_work *dw)
+{
+ drain_delayed_work_state(dw, 0);
0 would be TASK_RUNNING. signal_pending_state bails out in this case so
that's good.
}
static inline const char *yesno(bool v)
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
index 2b423128002c..a87998b90bf6 100644
--- a/drivers/gpu/drm/i915/selftests/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
@@ -1686,8 +1686,11 @@ static int __igt_switch_to_kernel_context(struct drm_i915_private *i915,
/* XXX Bonus points for proving we are the kernel context! */
mutex_unlock(&i915->drm.struct_mutex);
- drain_delayed_work(&i915->gt.idle_work);
+ err = drain_delayed_work_state(&i915->gt.idle_work,
+ TASK_KILLABLE);
mutex_lock(&i915->drm.struct_mutex);
+ if (err)
+ return -EINTR;
}
if (igt_flush_test(i915, I915_WAIT_LOCKED))
Regards,
Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx