RE: [PATCH] drm/amdgpu/mes: fix mes12 to map legacy queue

"Zhang, Hawking" <Hawking.Zhang@xxxxxxx> · Thu, 9 May 2024 07:20:58 +0000

[AMD Official Use Only - General]

Let's use dev_err, which is more helpful in multi-GPU scenarios when there are errors. Other than that, the patch is

Reviewed-by: Hawking Zhang <Hawking.Zhang@xxxxxxx>

Regards,
Hawking

-----Original Message-----
From: Xiao, Jack <Jack.Xiao@xxxxxxx>
Sent: Thursday, May 9, 2024 14:47
To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Min, Frank <Frank.Min@xxxxxxx>; Gao, Likun <Likun.Gao@xxxxxxx>
Cc: Xiao, Jack <Jack.Xiao@xxxxxxx>
Subject: [PATCH] drm/amdgpu/mes: fix mes12 to map legacy queue

Adjust mes12 initialization sequence to fix mapping legacy queue.

Signed-off-by: Jack Xiao <Jack.Xiao@xxxxxxx>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 71 ++++++++++++++++---------
 drivers/gpu/drm/amd/amdgpu/mes_v12_0.c  | 10 ++--
 2 files changed, 53 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index ca90d6b577c8..a2696c215899 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -599,6 +599,44 @@ int amdgpu_queue_mask_bit_to_set_resource_bit(struct amdgpu_device *adev,
        return set_resource_bit;
 }

+static int amdgpu_gfx_mes_enable_kcq(struct amdgpu_device *adev, int xcc_id)
+{
+       struct amdgpu_kiq *kiq = &adev->gfx.kiq[xcc_id];
+       struct amdgpu_ring *kiq_ring = &kiq->ring;
+       uint64_t queue_mask = ~0ULL;
+       int r, i, j;
+
+       amdgpu_device_flush_hdp(adev, NULL);
+
+       if (!adev->enable_uni_mes) {
+               spin_lock(&kiq->ring_lock);
+               r = amdgpu_ring_alloc(kiq_ring, kiq->pmf->set_resources_size);
+               if (r) {
+                       DRM_ERROR("Failed to lock KIQ (%d).\n", r);
+                       spin_unlock(&kiq->ring_lock);
+                       return r;
+               }
+
+               kiq->pmf->kiq_set_resources(kiq_ring, queue_mask);
+               r = amdgpu_ring_test_helper(kiq_ring);
+               spin_unlock(&kiq->ring_lock);
+               if (r)
+                       DRM_ERROR("KIQ failed to set resources\n");
+       }
+
+       for (i = 0; i < adev->gfx.num_compute_rings; i++) {
+               j = i + xcc_id * adev->gfx.num_compute_rings;
+               r = amdgpu_mes_map_legacy_queue(adev,
+                                               &adev->gfx.compute_ring[j]);
+               if (r) {
+                       DRM_ERROR("failed to map compute queue\n");
+                       return r;
+               }
+       }
+
+       return 0;
+}
+
 int amdgpu_gfx_enable_kcq(struct amdgpu_device *adev, int xcc_id)
 {
        struct amdgpu_kiq *kiq = &adev->gfx.kiq[xcc_id];
@@ -606,6 +644,9 @@ int amdgpu_gfx_enable_kcq(struct amdgpu_device *adev, int xcc_id)
        uint64_t queue_mask = 0;
        int r, i, j;

+       if (adev->enable_mes)
+               return amdgpu_gfx_mes_enable_kcq(adev, xcc_id);
+
        if (!kiq->pmf || !kiq->pmf->kiq_map_queues || !kiq->pmf->kiq_set_resources)
                return -EINVAL;

@@ -626,9 +667,6 @@ int amdgpu_gfx_enable_kcq(struct amdgpu_device *adev, int xcc_id)

        amdgpu_device_flush_hdp(adev, NULL);

-       if (adev->enable_mes)
-               queue_mask = ~0ULL;
-
        DRM_INFO("kiq ring mec %d pipe %d q %d\n", kiq_ring->me, kiq_ring->pipe,
                 kiq_ring->queue);

@@ -643,13 +681,10 @@ int amdgpu_gfx_enable_kcq(struct amdgpu_device *adev, int xcc_id)
        }

        kiq->pmf->kiq_set_resources(kiq_ring, queue_mask);
-
-       if (!adev->enable_mes) {
-               for (i = 0; i < adev->gfx.num_compute_rings; i++) {
-                       j = i + xcc_id * adev->gfx.num_compute_rings;
-                       kiq->pmf->kiq_map_queues(kiq_ring,
-                                                &adev->gfx.compute_ring[j]);
-               }
+       for (i = 0; i < adev->gfx.num_compute_rings; i++) {
+               j = i + xcc_id * adev->gfx.num_compute_rings;
+               kiq->pmf->kiq_map_queues(kiq_ring,
+                                        &adev->gfx.compute_ring[j]);
        }

        r = amdgpu_ring_test_helper(kiq_ring);
@@ -657,20 +692,6 @@ int amdgpu_gfx_enable_kcq(struct amdgpu_device *adev, int xcc_id)
        if (r)
                DRM_ERROR("KCQ enable failed\n");

-       if (adev->enable_mes || adev->enable_uni_mes) {
-               for (i = 0; i < adev->gfx.num_compute_rings; i++) {
-                       j = i + xcc_id * adev->gfx.num_compute_rings;
-                       r = amdgpu_mes_map_legacy_queue(adev,
-                                              &adev->gfx.compute_ring[j]);
-                       if (r) {
-                               DRM_ERROR("failed to map compute queue\n");
-                               return r;
-                       }
-               }
-
-               return 0;
-       }
-
        return r;
 }

@@ -685,7 +706,7 @@ int amdgpu_gfx_enable_kgq(struct amdgpu_device *adev, int xcc_id)

        amdgpu_device_flush_hdp(adev, NULL);

-       if (adev->enable_mes || adev->enable_uni_mes) {
+       if (adev->enable_mes) {
                for (i = 0; i < adev->gfx.num_gfx_rings; i++) {
                        j = i + xcc_id * adev->gfx.num_gfx_rings;
                        r = amdgpu_mes_map_legacy_queue(adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
index 76db85157bf9..5519655fd70a 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
@@ -1357,6 +1357,10 @@ static int mes_v12_0_kiq_hw_init(struct amdgpu_device *adev)
        if (r)
                goto failure;

+       r = mes_v12_0_hw_init(adev);
+       if (r)
+               goto failure;
+
        return r;

 failure:
@@ -1381,7 +1385,7 @@ static int mes_v12_0_hw_init(void *handle)
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;

        if (adev->mes.ring.sched.ready)
-               return 0;
+               goto out;

        if (!adev->enable_mes_kiq || adev->enable_uni_mes) {
                if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT) {
@@ -1425,6 +1429,7 @@ static int mes_v12_0_hw_init(void *handle)
                goto failure;
        }

+out:
        /*
         * Disable KIQ ring usage from the driver once MES is enabled.
         * MES uses KIQ ring exclusively so driver cannot access KIQ ring
@@ -1498,8 +1503,7 @@ static int mes_v12_0_late_init(void *handle)
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;

        /* it's only intended for use in mes_self_test case, not for s0ix and reset */
-       if (!amdgpu_in_reset(adev) && !adev->in_s0ix && !adev->in_suspend &&
-           !adev->enable_uni_mes)
+       if (!amdgpu_in_reset(adev) && !adev->in_s0ix && !adev->in_suspend)
                amdgpu_mes_self_test(adev);

        return 0;
--
2.41.0