The MES firmware expects synchronous operation with the driver. For this to work asynchronously, each caller would need to provide its own fence location and sequence number. For now, add a mutex lock to serialize the MES submission. For the SR-IOV long-wait case, break the long wait into separate parts to prevent this wait from impacting the reset sequence. Signed-off-by: Horace Chen <horace.chen@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 3 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 1 + drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 18 ++++++++++++++---- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index 78e4f88f5134..8896be95b2c8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -137,6 +137,7 @@ int amdgpu_mes_init(struct amdgpu_device *adev) spin_lock_init(&adev->mes.queue_id_lock); spin_lock_init(&adev->mes.ring_lock); mutex_init(&adev->mes.mutex_hidden); + mutex_init(&adev->mes.submission_lock); adev->mes.total_max_queue = AMDGPU_FENCE_MES_QUEUE_ID_MASK; adev->mes.vmid_mask_mmhub = 0xffffff00; @@ -221,6 +222,7 @@ int amdgpu_mes_init(struct amdgpu_device *adev) idr_destroy(&adev->mes.queue_id_idr); ida_destroy(&adev->mes.doorbell_ida); mutex_destroy(&adev->mes.mutex_hidden); + mutex_destroy(&adev->mes.submission_lock); return r; } @@ -240,6 +242,7 @@ void amdgpu_mes_fini(struct amdgpu_device *adev) idr_destroy(&adev->mes.queue_id_idr); ida_destroy(&adev->mes.doorbell_ida); mutex_destroy(&adev->mes.mutex_hidden); + mutex_destroy(&adev->mes.submission_lock); } static void amdgpu_mes_queue_free_mqd(struct amdgpu_mes_queue *q) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h index 6b3e1844eac5..90af935cc889 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h @@ -85,6 +85,7 @@ struct amdgpu_mes { struct amdgpu_ring ring; 
spinlock_t ring_lock; + struct mutex submission_lock; const struct firmware *fw[AMDGPU_MAX_MES_PIPES]; diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index e40d00afd4f5..0a609a5b8835 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -162,6 +162,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, struct amdgpu_ring *ring = &mes->ring; unsigned long flags; signed long timeout = adev->usec_timeout; + signed long retry_count = 1; const char *op_str, *misc_op_str; if (x_pkt->header.opcode >= MES_SCH_API_MAX) @@ -169,15 +170,19 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, if (amdgpu_emu_mode) { timeout *= 100; - } else if (amdgpu_sriov_vf(adev)) { + } + + if (amdgpu_sriov_vf(adev) && timeout > 0) { /* Worst case in sriov where all other 15 VF timeout, each VF needs about 600ms */ - timeout = 15 * 600 * 1000; + retry_count = (15 * 600 * 1000) / timeout; } BUG_ON(size % 4 != 0); + mutex_lock(&mes->submission_lock); spin_lock_irqsave(&mes->ring_lock, flags); if (amdgpu_ring_alloc(ring, ndw)) { spin_unlock_irqrestore(&mes->ring_lock, flags); + mutex_unlock(&mes->submission_lock); return -ENOMEM; } @@ -199,8 +204,13 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes, else dev_dbg(adev->dev, "MES msg=%d was emitted\n", x_pkt->header.opcode); - r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, - timeout); + do { + r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, + timeout); + retry_count--; + } while (retry_count > 0 && !amdgpu_in_reset(adev)); + + mutex_unlock(&mes->submission_lock); if (r < 1) { if (misc_op_str) -- 2.34.1