Re: [PATCH 05/21] drm/i915: Separate GPU hang waitqueue from advance

Arun Siluvery <arun.siluvery@xxxxxxxxxxxxxxx> · Tue, 7 Jun 2016 17:41:20 +0530

On 06/06/2016 18:30, Tvrtko Ursulin wrote:

On 03/06/16 17:08, Chris Wilson wrote:
Currently __i915_wait_request uses a per-engine wait_queue_t for the dual
purpose of waking after the GPU advances or for waking after an error.
In the future, we may add even more wake sources and require greater
separation, but for now we can conceptually simplify wakeups by
separating
the two sources. In particular, this allows us to use different
wait-queues
(e.g. one on the engine advancement, a global one for errors and one on
each requests) without any hassle.

+ Arun

I think this will conflict with the TDR work where one of the features
is to make reset handling per engine. So I am not sure how beneficial in
general, or painful for the TDR series, this patch might be.

Thanks Tvrtko.
Chris has give some comments on a related tdr patch based on these 
changes. I am looking into how to update my changes based on this.

Tdr code need access to struct_mutex so if a waiter is holding it then 
we should be able to ask it to try again so that we can proceed with 
recovery, similarly when an engine reset is in progress.

regards
Arun


Regards,

Tvrtko

Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
---
  drivers/gpu/drm/i915/i915_drv.h |  6 ++++++
  drivers/gpu/drm/i915/i915_gem.c |  5 +++++
  drivers/gpu/drm/i915/i915_irq.c | 19 ++++---------------
  3 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h
b/drivers/gpu/drm/i915/i915_drv.h
index ceccc6d6b119..e399e97965e0 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1401,6 +1401,12 @@ struct i915_gpu_error {
  #define I915_WEDGED            (1 << 31)

      /**
+     * Waitqueue to signal when a hang is detected. Used to for waiters
+     * to release the struct_mutex for the reset to procede.
+     */
+    wait_queue_head_t wait_queue;
+
+    /**
       * Waitqueue to signal when the reset has completed. Used by
clients
       * that wait for dev_priv->mm.wedged to settle.
       */
diff --git a/drivers/gpu/drm/i915/i915_gem.c
b/drivers/gpu/drm/i915/i915_gem.c
index 03256f096ab6..de4fb39312a4 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1234,6 +1234,7 @@ int __i915_wait_request(struct
drm_i915_gem_request *req,
      const bool irq_test_in_progress =
          ACCESS_ONCE(dev_priv->gpu_error.test_irq_rings) &
intel_engine_flag(engine);
      int state = interruptible ? TASK_INTERRUPTIBLE :
TASK_UNINTERRUPTIBLE;
+    DEFINE_WAIT(reset);
      DEFINE_WAIT(wait);
      unsigned long timeout_expire;
      s64 before = 0; /* Only to silence a compiler warning. */
@@ -1278,6 +1279,7 @@ int __i915_wait_request(struct
drm_i915_gem_request *req,
          goto out;
      }

+    add_wait_queue(&dev_priv->gpu_error.wait_queue, &reset);
      for (;;) {
          struct timer_list timer;

@@ -1329,6 +1331,8 @@ int __i915_wait_request(struct
drm_i915_gem_request *req,
              destroy_timer_on_stack(&timer);
          }
      }
+    remove_wait_queue(&dev_priv->gpu_error.wait_queue, &reset);
+
      if (!irq_test_in_progress)
          engine->irq_put(engine);

@@ -5026,6 +5030,7 @@ i915_gem_load_init(struct drm_device *dev)
                i915_gem_retire_work_handler);
      INIT_DELAYED_WORK(&dev_priv->mm.idle_work,
                i915_gem_idle_work_handler);
+    init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
      init_waitqueue_head(&dev_priv->gpu_error.reset_queue);

      dev_priv->relative_constants_mode =
I915_EXEC_CONSTANTS_REL_GENERAL;
diff --git a/drivers/gpu/drm/i915/i915_irq.c
b/drivers/gpu/drm/i915/i915_irq.c
index 83cab14639b2..30127b94f26e 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2488,11 +2488,8 @@ static irqreturn_t gen8_irq_handler(int irq,
void *arg)
      return ret;
  }

-static void i915_error_wake_up(struct drm_i915_private *dev_priv,
-                   bool reset_completed)
+static void i915_error_wake_up(struct drm_i915_private *dev_priv)
  {
-    struct intel_engine_cs *engine;
-
      /*
       * Notify all waiters for GPU completion events that reset state
has
       * been changed, and that they need to restart their wait after
@@ -2501,18 +2498,10 @@ static void i915_error_wake_up(struct
drm_i915_private *dev_priv,
       */

      /* Wake up __wait_seqno, potentially holding dev->struct_mutex. */
-    for_each_engine(engine, dev_priv)
-        wake_up_all(&engine->irq_queue);
+    wake_up_all(&dev_priv->gpu_error.wait_queue);

      /* Wake up intel_crtc_wait_for_pending_flips, holding
crtc->mutex. */
      wake_up_all(&dev_priv->pending_flip_queue);
-
-    /*
-     * Signal tasks blocked in i915_gem_wait_for_error that the pending
-     * reset state is cleared.
-     */
-    if (reset_completed)
-        wake_up_all(&dev_priv->gpu_error.reset_queue);
  }

  /**
@@ -2577,7 +2566,7 @@ static void i915_reset_and_wakeup(struct
drm_i915_private *dev_priv)
           * Note: The wake_up also serves as a memory barrier so that
           * waiters see the update value of the reset counter atomic_t.
           */
-        i915_error_wake_up(dev_priv, true);
+        wake_up_all(&dev_priv->gpu_error.reset_queue);
      }
  }

@@ -2713,7 +2702,7 @@ void i915_handle_error(struct drm_i915_private
*dev_priv,
           * ensure that the waiters see the updated value of the reset
           * counter atomic_t.
           */
-        i915_error_wake_up(dev_priv, false);
+        i915_error_wake_up(dev_priv);
      }

      i915_reset_and_wakeup(dev_priv);



_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx