[AMD Official Use Only - General] Let's use dev_err that is more helpful in multiple-GPU use scenario when there are errors. Other than that, the patch is Reviewed-by: Hawking Zhang <Hawking.Zhang@xxxxxxx> Regards, Hawking -----Original Message----- From: Xiao, Jack <Jack.Xiao@xxxxxxx> Sent: Thursday, May 9, 2024 14:47 To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Min, Frank <Frank.Min@xxxxxxx>; Gao, Likun <Likun.Gao@xxxxxxx> Cc: Xiao, Jack <Jack.Xiao@xxxxxxx> Subject: [PATCH] drm/amdgpu/mes: fix mes12 to map legacy queue Adjust mes12 initialization sequence to fix mapping legacy queue. Signed-off-by: Jack Xiao <Jack.Xiao@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 71 ++++++++++++++++--------- drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 10 ++-- 2 files changed, 53 insertions(+), 28 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index ca90d6b577c8..a2696c215899 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -599,6 +599,44 @@ int amdgpu_queue_mask_bit_to_set_resource_bit(struct amdgpu_device *adev, return set_resource_bit; } +static int amdgpu_gfx_mes_enable_kcq(struct amdgpu_device *adev, int +xcc_id) { + struct amdgpu_kiq *kiq = &adev->gfx.kiq[xcc_id]; + struct amdgpu_ring *kiq_ring = &kiq->ring; + uint64_t queue_mask = ~0ULL; + int r, i, j; + + amdgpu_device_flush_hdp(adev, NULL); + + if (!adev->enable_uni_mes) { + spin_lock(&kiq->ring_lock); + r = amdgpu_ring_alloc(kiq_ring, kiq->pmf->set_resources_size); + if (r) { + DRM_ERROR("Failed to lock KIQ (%d).\n", r); + spin_unlock(&kiq->ring_lock); + return r; + } + + kiq->pmf->kiq_set_resources(kiq_ring, queue_mask); + r = amdgpu_ring_test_helper(kiq_ring); + spin_unlock(&kiq->ring_lock); + if (r) + DRM_ERROR("KIQ failed to set resources\n"); + } + + for (i = 0; i < adev->gfx.num_compute_rings; i++) { + j = i + xcc_id * adev->gfx.num_compute_rings; + r = amdgpu_mes_map_legacy_queue(adev, + &adev->gfx.compute_ring[j]); + if (r) { + DRM_ERROR("failed to map compute queue\n"); + return r; + } + } + + return 0; +} + int amdgpu_gfx_enable_kcq(struct amdgpu_device *adev, int xcc_id) { struct amdgpu_kiq *kiq = &adev->gfx.kiq[xcc_id]; @@ -606,6 +644,9 @@ int amdgpu_gfx_enable_kcq(struct amdgpu_device *adev, int xcc_id) uint64_t queue_mask = 0; int r, i, j; + if (adev->enable_mes) + return amdgpu_gfx_mes_enable_kcq(adev, xcc_id); + if (!kiq->pmf || !kiq->pmf->kiq_map_queues || !kiq->pmf->kiq_set_resources) return -EINVAL; @@ -626,9 +667,6 @@ int amdgpu_gfx_enable_kcq(struct amdgpu_device *adev, int xcc_id) amdgpu_device_flush_hdp(adev, NULL); - if (adev->enable_mes) - queue_mask = ~0ULL; - DRM_INFO("kiq ring mec %d pipe %d q %d\n", kiq_ring->me, kiq_ring->pipe, kiq_ring->queue); @@ -643,13 +681,10 @@ int amdgpu_gfx_enable_kcq(struct amdgpu_device *adev, int xcc_id) } kiq->pmf->kiq_set_resources(kiq_ring, queue_mask); - - if (!adev->enable_mes) { - for (i = 0; i < adev->gfx.num_compute_rings; i++) { - j = i + xcc_id * adev->gfx.num_compute_rings; - kiq->pmf->kiq_map_queues(kiq_ring, - &adev->gfx.compute_ring[j]); - } + for (i = 0; i < adev->gfx.num_compute_rings; i++) { + j = i + xcc_id * adev->gfx.num_compute_rings; + kiq->pmf->kiq_map_queues(kiq_ring, + &adev->gfx.compute_ring[j]); } r = amdgpu_ring_test_helper(kiq_ring); @@ -657,20 +692,6 @@ int amdgpu_gfx_enable_kcq(struct amdgpu_device *adev, int xcc_id) if (r) DRM_ERROR("KCQ enable failed\n"); - if (adev->enable_mes || adev->enable_uni_mes) { - for (i = 0; i < adev->gfx.num_compute_rings; i++) { - j = i + xcc_id * adev->gfx.num_compute_rings; - r = amdgpu_mes_map_legacy_queue(adev, - &adev->gfx.compute_ring[j]); - if (r) { - DRM_ERROR("failed to map compute queue\n"); - return r; - } - } - - return 0; - } - return r; } @@ -685,7 +706,7 @@ int amdgpu_gfx_enable_kgq(struct amdgpu_device *adev, int xcc_id) amdgpu_device_flush_hdp(adev, NULL); - if (adev->enable_mes || adev->enable_uni_mes) { + if (adev->enable_mes) { for (i = 0; i < adev->gfx.num_gfx_rings; i++) { j = i + xcc_id * adev->gfx.num_gfx_rings; r = amdgpu_mes_map_legacy_queue(adev, diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index 76db85157bf9..5519655fd70a 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -1357,6 +1357,10 @@ static int mes_v12_0_kiq_hw_init(struct amdgpu_device *adev) if (r) goto failure; + r = mes_v12_0_hw_init(adev); + if (r) + goto failure; + return r; failure: @@ -1381,7 +1385,7 @@ static int mes_v12_0_hw_init(void *handle) struct amdgpu_device *adev = (struct amdgpu_device *)handle; if (adev->mes.ring.sched.ready) - return 0; + goto out; if (!adev->enable_mes_kiq || adev->enable_uni_mes) { if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT) { @@ -1425,6 +1429,7 @@ static int mes_v12_0_hw_init(void *handle) goto failure; } +out: /* * Disable KIQ ring usage from the driver once MES is enabled. * MES uses KIQ ring exclusively so driver cannot access KIQ ring @@ -1498,8 +1503,7 @@ static int mes_v12_0_late_init(void *handle) struct amdgpu_device *adev = (struct amdgpu_device *)handle; /* it's only intended for use in mes_self_test case, not for s0ix and reset */ - if (!amdgpu_in_reset(adev) && !adev->in_s0ix && !adev->in_suspend && - !adev->enable_uni_mes) + if (!amdgpu_in_reset(adev) && !adev->in_s0ix && !adev->in_suspend) amdgpu_mes_self_test(adev); return 0; -- 2.41.0