Re: [PATCH v3 07/12] drm/sched: Prevent any job recoveries after device is unplugged.

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Am 23.11.20 um 06:37 schrieb Andrey Grodzovsky:

On 11/22/20 6:57 AM, Christian König wrote:
Am 21.11.20 um 06:21 schrieb Andrey Grodzovsky:
There is no point in trying recovery if the device is gone; it's meaningless.

I think that this should go into the device specific recovery function and not in the scheduler.


The timeout timer is rearmed here, so this prevents any new recovery work from restarting from here after drm_dev_unplug was executed from amdgpu_pci_remove. It will not cover other places like job cleanup or starting a new job, but those should stop once the scheduler thread is stopped later.

Yeah, but this is rather unclean. We should probably instead return an error code indicating whether the timer should be rearmed or not.

Christian.


Andrey



Christian.


Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c |  2 +-
  drivers/gpu/drm/etnaviv/etnaviv_sched.c   |  3 ++-
  drivers/gpu/drm/lima/lima_sched.c         |  3 ++-
  drivers/gpu/drm/panfrost/panfrost_job.c   |  2 +-
  drivers/gpu/drm/scheduler/sched_main.c    | 15 ++++++++++++++-
  drivers/gpu/drm/v3d/v3d_sched.c           | 15 ++++++++++-----
  include/drm/gpu_scheduler.h               |  6 +++++-
  7 files changed, 35 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index d56f402..d0b0021 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -487,7 +487,7 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
            r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
                     num_hw_submission, amdgpu_job_hang_limit,
-                   timeout, ring->name);
+                   timeout, ring->name, &adev->ddev);
          if (r) {
              DRM_ERROR("Failed to create scheduler on ring %s.\n",
                    ring->name);
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
index cd46c88..7678287 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
@@ -185,7 +185,8 @@ int etnaviv_sched_init(struct etnaviv_gpu *gpu)
        ret = drm_sched_init(&gpu->sched, &etnaviv_sched_ops,
                   etnaviv_hw_jobs_limit, etnaviv_job_hang_limit,
-                 msecs_to_jiffies(500), dev_name(gpu->dev));
+                 msecs_to_jiffies(500), dev_name(gpu->dev),
+                 gpu->drm);
      if (ret)
          return ret;
  diff --git a/drivers/gpu/drm/lima/lima_sched.c b/drivers/gpu/drm/lima/lima_sched.c
index dc6df9e..8a7e5d7ca 100644
--- a/drivers/gpu/drm/lima/lima_sched.c
+++ b/drivers/gpu/drm/lima/lima_sched.c
@@ -505,7 +505,8 @@ int lima_sched_pipe_init(struct lima_sched_pipe *pipe, const char *name)
        return drm_sched_init(&pipe->base, &lima_sched_ops, 1,
                    lima_job_hang_limit, msecs_to_jiffies(timeout),
-                  name);
+                  name,
+                  pipe->ldev->ddev);
  }
    void lima_sched_pipe_fini(struct lima_sched_pipe *pipe)
diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
index 30e7b71..37b03b01 100644
--- a/drivers/gpu/drm/panfrost/panfrost_job.c
+++ b/drivers/gpu/drm/panfrost/panfrost_job.c
@@ -520,7 +520,7 @@ int panfrost_job_init(struct panfrost_device *pfdev)
          ret = drm_sched_init(&js->queue[j].sched,
                       &panfrost_sched_ops,
                       1, 0, msecs_to_jiffies(500),
-                     "pan_js");
+                     "pan_js", pfdev->ddev);
          if (ret) {
              dev_err(pfdev->dev, "Failed to create scheduler: %d.", ret);
              goto err_sched;
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index c3f0bd0..95db8c6 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -53,6 +53,7 @@
  #include <drm/drm_print.h>
  #include <drm/gpu_scheduler.h>
  #include <drm/spsc_queue.h>
+#include <drm/drm_drv.h>
    #define CREATE_TRACE_POINTS
  #include "gpu_scheduler_trace.h"
@@ -283,8 +284,16 @@ static void drm_sched_job_timedout(struct work_struct *work)
      struct drm_gpu_scheduler *sched;
      struct drm_sched_job *job;
  +    int idx;
+
      sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
  +    if (!drm_dev_enter(sched->ddev, &idx)) {
+        DRM_INFO("%s - device unplugged skipping recovery on scheduler:%s",
+             __func__, sched->name);
+        return;
+    }
+
      /* Protects against concurrent deletion in drm_sched_get_cleanup_job */
      spin_lock(&sched->job_list_lock);
      job = list_first_entry_or_null(&sched->ring_mirror_list,
@@ -316,6 +325,8 @@ static void drm_sched_job_timedout(struct work_struct *work)
      spin_lock(&sched->job_list_lock);
      drm_sched_start_timeout(sched);
      spin_unlock(&sched->job_list_lock);
+
+    drm_dev_exit(idx);
  }
     /**
@@ -845,7 +856,8 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
             unsigned hw_submission,
             unsigned hang_limit,
             long timeout,
-           const char *name)
+           const char *name,
+           struct drm_device *ddev)
  {
      int i, ret;
      sched->ops = ops;
@@ -853,6 +865,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
      sched->name = name;
      sched->timeout = timeout;
      sched->hang_limit = hang_limit;
+    sched->ddev = ddev;
      for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_COUNT; i++)
          drm_sched_rq_init(sched, &sched->sched_rq[i]);
  diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
index 0747614..f5076e5 100644
--- a/drivers/gpu/drm/v3d/v3d_sched.c
+++ b/drivers/gpu/drm/v3d/v3d_sched.c
@@ -401,7 +401,8 @@ v3d_sched_init(struct v3d_dev *v3d)
                   &v3d_bin_sched_ops,
                   hw_jobs_limit, job_hang_limit,
                   msecs_to_jiffies(hang_limit_ms),
-                 "v3d_bin");
+                 "v3d_bin",
+                 &v3d->drm);
      if (ret) {
          dev_err(v3d->drm.dev, "Failed to create bin scheduler: %d.", ret);
          return ret;
@@ -411,7 +412,8 @@ v3d_sched_init(struct v3d_dev *v3d)
                   &v3d_render_sched_ops,
                   hw_jobs_limit, job_hang_limit,
                   msecs_to_jiffies(hang_limit_ms),
-                 "v3d_render");
+                 "v3d_render",
+                 &v3d->drm);
      if (ret) {
          dev_err(v3d->drm.dev, "Failed to create render scheduler: %d.",
              ret);
@@ -423,7 +425,8 @@ v3d_sched_init(struct v3d_dev *v3d)
                   &v3d_tfu_sched_ops,
                   hw_jobs_limit, job_hang_limit,
                   msecs_to_jiffies(hang_limit_ms),
-                 "v3d_tfu");
+                 "v3d_tfu",
+                 &v3d->drm);
      if (ret) {
          dev_err(v3d->drm.dev, "Failed to create TFU scheduler: %d.",
              ret);
@@ -436,7 +439,8 @@ v3d_sched_init(struct v3d_dev *v3d)
                       &v3d_csd_sched_ops,
                       hw_jobs_limit, job_hang_limit,
                       msecs_to_jiffies(hang_limit_ms),
-                     "v3d_csd");
+                     "v3d_csd",
+                     &v3d->drm);
          if (ret) {
              dev_err(v3d->drm.dev, "Failed to create CSD scheduler: %d.",
                  ret);
@@ -448,7 +452,8 @@ v3d_sched_init(struct v3d_dev *v3d)
                       &v3d_cache_clean_sched_ops,
                       hw_jobs_limit, job_hang_limit,
                       msecs_to_jiffies(hang_limit_ms),
-                     "v3d_cache_clean");
+                     "v3d_cache_clean",
+                     &v3d->drm);
          if (ret) {
              dev_err(v3d->drm.dev, "Failed to create CACHE_CLEAN scheduler: %d.",
                  ret);
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 9243655..a980709 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -32,6 +32,7 @@
    struct drm_gpu_scheduler;
  struct drm_sched_rq;
+struct drm_device;
    /* These are often used as an (initial) index
   * to an array, and as such should start at 0.
@@ -267,6 +268,7 @@ struct drm_sched_backend_ops {
   * @score: score to help loadbalancer pick a idle sched
   * @ready: marks if the underlying HW is ready to work
   * @free_guilty: A hit to time out handler to free the guilty job.
+ * @ddev: Pointer to drm device of this scheduler.
   *
   * One scheduler is implemented for each hardware ring.
   */
@@ -288,12 +290,14 @@ struct drm_gpu_scheduler {
      atomic_t                        score;
      bool                ready;
      bool                free_guilty;
+    struct drm_device        *ddev;
  };
    int drm_sched_init(struct drm_gpu_scheduler *sched,
             const struct drm_sched_backend_ops *ops,
             uint32_t hw_submission, unsigned hang_limit, long timeout,
-           const char *name);
+           const char *name,
+           struct drm_device *ddev);
    void drm_sched_fini(struct drm_gpu_scheduler *sched);
  int drm_sched_job_init(struct drm_sched_job *job,

_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
dri-devel mailing list
dri-devel@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/dri-devel




[Index of Archives]     [Linux DRI Users]     [Linux Intel Graphics]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]     [XFree86]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Linux Kernel]     [Linux SCSI]     [XFree86]
  Powered by Linux