1,add new parm @job to indicate which ring hang. 2,return when calling to amdgpu_gpu_reset if under SRIOV case with this patch, we'll reset & recovery on the perticular ring instead of overkill all ring. Change-Id: I37dff5c16ef161e3a8425434516ebb285fbe6600 Signed-off-by: Monk Liu <Monk.Liu at amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 ++++--- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 6 +++++- drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 2 +- drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 2 +- drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 2 +- 5 files changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 0c51fb5..157d023 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2528,6 +2528,7 @@ static int amdgpu_recover_vram_from_shadow(struct amdgpu_device *adev, * amdgpu_sriov_gpu_reset - reset the asic * * @adev: amdgpu device pointer + * @job: the job lead to hang * @voluntary: if this reset is requested by guest. * (true means by guest and false means by HYPERVISOR ) * @@ -2535,9 +2536,9 @@ static int amdgpu_recover_vram_from_shadow(struct amdgpu_device *adev, * for SRIOV case. * Returns 0 for success or an error on failure. */ -int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary) +int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job, bool voluntary) { - int i, r = 0; + int i, j, r = 0; int resched; struct amdgpu_bo *bo, *tmp; struct amdgpu_ring *ring; @@ -2652,7 +2653,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev) bool need_full_reset; if (amdgpu_sriov_vf(adev)) - return amdgpu_sriov_gpu_reset(adev, true); + return -EINVAL; if (!amdgpu_check_soft_reset(adev)) { DRM_INFO("No hardware hang detected. Did some blocks stall?\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 0209c96..2cc7c63 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -36,7 +36,11 @@ static void amdgpu_job_timedout(struct amd_sched_job *s_job) job->base.sched->name, atomic_read(&job->ring->fence_drv.last_seq), job->ring->fence_drv.sync_seq); - amdgpu_gpu_reset(job->adev); + + if (amdgpu_sriov_vf(job->adev)) + amdgpu_sriov_gpu_reset(job->adev, job, true); + else + amdgpu_gpu_reset(job->adev); } int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h index a8ed162..2b641eb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h @@ -97,7 +97,7 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v); int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init); int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init); int amdgpu_virt_reset_gpu(struct amdgpu_device *adev); -int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary); +int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job, bool voluntary); int amdgpu_virt_alloc_mm_table(struct amdgpu_device *adev); void amdgpu_virt_free_mm_table(struct amdgpu_device *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c index e967a7b..4d71db3 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c @@ -243,7 +243,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) } /* Trigger recovery due to world switch failure */ - amdgpu_sriov_gpu_reset(adev, false); + amdgpu_sriov_gpu_reset(adev, NULL, false); } static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c index f0d64f1..a604bd1 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c @@ -514,7 +514,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work) } /* Trigger recovery due to world switch failure */ - amdgpu_sriov_gpu_reset(adev, false); + amdgpu_sriov_gpu_reset(adev, NULL, false); } static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev, -- 2.7.4