Re: [RFC 2/6] drm/amdgpu: Move scheduler init to after XGMI is ready

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 




On 2021-12-20 2:16 a.m., Christian König wrote:


Am 17.12.21 um 23:27 schrieb Andrey Grodzovsky:
Before we initialize schedulers we must know which reset
domain we are in - for a single device there is a single
domain per device and so single wq per device. For XGMI
the reset domain spans the entire XGMI hive and so the
reset wq is per hive.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 45 ++++++++++++++++++++++
  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 34 ++--------------
  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |  2 +
  3 files changed, 51 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 5f13195d23d1..b595e6d699b5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2284,6 +2284,47 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
      return r;
  }
  +static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
+{
+    long timeout;
+    int r, i;
+
+    for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+        struct amdgpu_ring *ring = adev->rings[i];
+
+        /* No need to setup the GPU scheduler for rings that don't need it */
+        if (!ring || ring->no_scheduler)
+            continue;
+
+        switch (ring->funcs->type) {
+        case AMDGPU_RING_TYPE_GFX:
+            timeout = adev->gfx_timeout;
+            break;
+        case AMDGPU_RING_TYPE_COMPUTE:
+            timeout = adev->compute_timeout;
+            break;
+        case AMDGPU_RING_TYPE_SDMA:
+            timeout = adev->sdma_timeout;
+            break;
+        default:
+            timeout = adev->video_timeout;
+            break;
+        }
+


+        r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
+                   ring->num_hw_submission, amdgpu_job_hang_limit,
+                   timeout, adev->reset_domain.wq, ring->sched_score, ring->name);
+        if (r) {
+            DRM_ERROR("Failed to create scheduler on ring %s.\n",
+                  ring->name);
+            return r;
+        }

Maybe it would be better to put that into amdgpu_ring.c. But that's not really a hard requirement, more of a gut feeling.

+    }
+
+    return 0;
+}
+
+
  /**
   * amdgpu_device_ip_init - run init for hardware IPs
   *
@@ -2412,6 +2453,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
          }
      }
  +    r = amdgpu_device_init_schedulers(adev);
+    if (r)
+        goto init_failed;
+
      /* Don't init kfd if whole hive need to be reset during init */
      if (!adev->gmc.xgmi.pending_reset)
          amdgpu_amdkfd_device_init(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 3b7e86ea7167..5527c68c51de 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -456,8 +456,6 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
                    atomic_t *sched_score)
  {
      struct amdgpu_device *adev = ring->adev;
-    long timeout;
-    int r;
        if (!adev)
          return -EINVAL;
@@ -477,36 +475,12 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
      spin_lock_init(&ring->fence_drv.lock);
      ring->fence_drv.fences = kcalloc(num_hw_submission * 2, sizeof(void *),
                       GFP_KERNEL);
-    if (!ring->fence_drv.fences)
-        return -ENOMEM;
  -    /* No need to setup the GPU scheduler for rings that don't need it */
-    if (ring->no_scheduler)
-        return 0;
+    ring->num_hw_submission = num_hw_submission;
+    ring->sched_score = sched_score;

Probably better to set that in the caller and drop the parameters from the amdgpu_fence_driver_init_ring() function completely.

Christian.


I noticed that at least num_hw_submission is validated within the function, so I'm not sure we should then discard the parameters.

Andrey



  -    switch (ring->funcs->type) {
-    case AMDGPU_RING_TYPE_GFX:
-        timeout = adev->gfx_timeout;
-        break;
-    case AMDGPU_RING_TYPE_COMPUTE:
-        timeout = adev->compute_timeout;
-        break;
-    case AMDGPU_RING_TYPE_SDMA:
-        timeout = adev->sdma_timeout;
-        break;
-    default:
-        timeout = adev->video_timeout;
-        break;
-    }
-
-    r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
-               num_hw_submission, amdgpu_job_hang_limit,
-               timeout, NULL, sched_score, ring->name);
-    if (r) {
-        DRM_ERROR("Failed to create scheduler on ring %s.\n",
-              ring->name);
-        return r;
-    }
+    if (!ring->fence_drv.fences)
+        return -ENOMEM;
        return 0;
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 4d380e79752c..a4b8279e3011 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -253,6 +253,8 @@ struct amdgpu_ring {
      bool            has_compute_vm_bug;
      bool            no_scheduler;
      int            hw_prio;
+    unsigned         num_hw_submission;
+    atomic_t        *sched_score;
  };
    #define amdgpu_ring_parse_cs(r, p, ib) ((r)->funcs->parse_cs((p), (ib)))




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux