During GPU reset, the driver should hold off all external access to the
GPU; otherwise the PSP will randomly fail to post and the system then
hangs.

Signed-off-by: Dennis Li <dennis.li@xxxxxxx>
Change-Id: I7d5d41f9c4198b917d7b49606ba3850988e5b936

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 6c7dd0a707c9..34bfc2a147ff 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -965,7 +965,7 @@ struct amdgpu_device {
 	bool				in_gpu_reset;
 	enum pp_mp1_state		mp1_state;
-	struct mutex			lock_reset;
+	struct rw_semaphore		reset_sem;
 	struct amdgpu_doorbell_index doorbell_index;

 	struct mutex			notifier_lock;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index ad59ac4423b8..4139c81389a4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -611,7 +611,9 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
 	/* This works for NO_HWS. TODO: need to handle without knowing VMID */
 	job->vmid = vmid;

+	down_read(&adev->reset_sem);
 	ret = amdgpu_ib_schedule(ring, 1, ib, job, &f);
+	up_read(&adev->reset_sem);
 	if (ret) {
 		DRM_ERROR("amdgpu: failed to schedule IB.\n");
 		goto err_ib_sched;
@@ -649,6 +651,8 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct kgd_dev *kgd, uint16_t vmid)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;

+	down_read(&adev->reset_sem);
+
 	if (adev->family == AMDGPU_FAMILY_AI) {
 		int i;

@@ -658,6 +662,8 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct kgd_dev *kgd, uint16_t vmid)
 		amdgpu_gmc_flush_gpu_tlb(adev, vmid, AMDGPU_GFXHUB_0, 0);
 	}

+	up_read(&adev->reset_sem);
+
 	return 0;
 }

@@ -666,11 +672,18 @@ int amdgpu_amdkfd_flush_gpu_tlb_pasid(struct kgd_dev *kgd, uint16_t pasid)
 	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
 	const uint32_t flush_type = 0;
 	bool all_hub = false;
+	int ret = 0;

 	if (adev->family == AMDGPU_FAMILY_AI)
 		all_hub = true;

-	return amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, flush_type, all_hub);
+	down_read(&adev->reset_sem);
+
+	ret = amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, flush_type, all_hub);
+
+	up_read(&adev->reset_sem);
+
+	return ret;
 }

 bool amdgpu_amdkfd_have_atomics_support(struct kgd_dev *kgd)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index 691c89705bcd..db5d533dd406 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -542,6 +542,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
 	unsigned long end_jiffies;
 	uint32_t temp;
 	struct v10_compute_mqd *m = get_mqd(mqd);
+	int ret = 0;

 	if (adev->in_gpu_reset)
 		return -EIO;
@@ -551,6 +552,8 @@
 	int retry;
 #endif

+	down_read(&adev->reset_sem);
+
 	acquire_queue(kgd, pipe_id, queue_id);

 	if (m->cp_hqd_vmid == 0)
@@ -633,14 +636,16 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
 			break;
 		if (time_after(jiffies, end_jiffies)) {
 			pr_err("cp queue preemption time out.\n");
-			release_queue(kgd);
-			return -ETIME;
+			ret = -ETIME;
+			goto pro_end;
 		}
 		usleep_range(500, 1000);
 	}

+pro_end:
 	release_queue(kgd);
-	return 0;
+	up_read(&adev->reset_sem);
+	return ret;
 }

 static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
index 0b7e78748540..cf27fe5091aa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
@@ -424,10 +424,13 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
 	enum hqd_dequeue_request_type type;
 	unsigned long flags, end_jiffies;
 	int retry;
+	int ret = 0;

 	if (adev->in_gpu_reset)
 		return -EIO;

+	down_read(&adev->reset_sem);
+
 	acquire_queue(kgd, pipe_id, queue_id);
 	WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL, 0);

@@ -506,14 +509,16 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
 			break;
 		if (time_after(jiffies, end_jiffies)) {
 			pr_err("cp queue preemption time out\n");
-			release_queue(kgd);
-			return -ETIME;
+			ret = -ETIME;
+			goto pro_end;
 		}
 		usleep_range(500, 1000);
 	}

+pro_end:
 	release_queue(kgd);
-	return 0;
+	up_read(&adev->reset_sem);
+	return ret;
 }

 static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
index ccd635b812b5..bc8e07186a85 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
@@ -420,10 +420,13 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
 	unsigned long flags, end_jiffies;
 	int retry;
 	struct vi_mqd *m = get_mqd(mqd);
+	int ret = 0;

 	if (adev->in_gpu_reset)
 		return -EIO;

+	down_read(&adev->reset_sem);
+
 	acquire_queue(kgd, pipe_id, queue_id);

 	if (m->cp_hqd_vmid == 0)
@@ -504,14 +507,16 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
 			break;
 		if (time_after(jiffies, end_jiffies)) {
 			pr_err("cp queue preemption time out.\n");
-			release_queue(kgd);
-			return -ETIME;
+			ret = -ETIME;
+			goto pro_end;
 		}
 		usleep_range(500, 1000);
 	}

+pro_end:
 	release_queue(kgd);
-	return 0;
+	up_read(&adev->reset_sem);
+	return ret;
 }

 static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index df841c2ac5e7..341ad652d910 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -540,10 +540,13 @@ int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd, void *mqd,
 	unsigned long end_jiffies;
 	uint32_t temp;
 	struct v9_mqd *m = get_mqd(mqd);
+	int ret = 0;

 	if (adev->in_gpu_reset)
 		return -EIO;

+	down_read(&adev->reset_sem);
+
 	acquire_queue(kgd, pipe_id, queue_id);

 	if (m->cp_hqd_vmid == 0)
@@ -570,14 +573,16 @@ int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd, void *mqd,
 			break;
 		if (time_after(jiffies, end_jiffies)) {
 			pr_err("cp queue preemption time out.\n");
-			release_queue(kgd);
-			return -ETIME;
+			ret = -ETIME;
+			goto pro_end;
 		}
 		usleep_range(500, 1000);
 	}

+pro_end:
 	release_queue(kgd);
-	return 0;
+	up_read(&adev->reset_sem);
+	return ret;
 }

 static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index aeada7c9fbea..d8f264c47b86 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -100,14 +100,14 @@ static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)

 	file->private_data = adev;

-	mutex_lock(&adev->lock_reset);
+	down_read(&adev->reset_sem);
 	if (adev->autodump.dumping.done) {
 		reinit_completion(&adev->autodump.dumping);
 		ret = 0;
 	} else {
 		ret = -EBUSY;
 	}
-	mutex_unlock(&adev->lock_reset);
+	up_read(&adev->reset_sem);

 	return ret;
 }
@@ -1188,7 +1188,7 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data)
 	}

 	/* Avoid accidently unparking the sched thread during GPU reset */
-	mutex_lock(&adev->lock_reset);
+	down_read(&adev->reset_sem);

 	/* hold on the scheduler */
 	for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
@@ -1215,7 +1215,7 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data)
 		kthread_unpark(ring->sched.thread);
 	}

-	mutex_unlock(&adev->lock_reset);
+	up_read(&adev->reset_sem);

 	pm_runtime_mark_last_busy(dev->dev);
 	pm_runtime_put_autosuspend(dev->dev);
@@ -1395,7 +1395,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
 		return -ENOMEM;

 	/* Avoid accidently unparking the sched thread during GPU reset */
-	mutex_lock(&adev->lock_reset);
+	down_read(&adev->reset_sem);

 	/* stop the scheduler */
 	kthread_park(ring->sched.thread);
@@ -1436,7 +1436,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)

 	/* restart the scheduler */
 	kthread_unpark(ring->sched.thread);

-	mutex_unlock(&adev->lock_reset);
+	up_read(&adev->reset_sem);

 	ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 2858c09fd8c0..bc902c59c1c0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3044,7 +3044,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	mutex_init(&adev->mn_lock);
 	mutex_init(&adev->virt.vf_errors.lock);
 	hash_init(adev->mn_hash);
-	mutex_init(&adev->lock_reset);
+	init_rwsem(&adev->reset_sem);
 	mutex_init(&adev->psp.mutex);
 	mutex_init(&adev->notifier_lock);

@@ -4150,10 +4150,10 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
 {
 	if (trylock) {
-		if (!mutex_trylock(&adev->lock_reset))
+		if (!down_write_trylock(&adev->reset_sem))
 			return false;
 	} else
-		mutex_lock(&adev->lock_reset);
+		down_write(&adev->reset_sem);

 	atomic_inc(&adev->gpu_reset_counter);
 	adev->in_gpu_reset = true;
@@ -4177,7 +4177,7 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
 	amdgpu_vf_error_trans_all(adev);
 	adev->mp1_state = PP_MP1_STATE_NONE;
 	adev->in_gpu_reset = false;
-	mutex_unlock(&adev->lock_reset);
+	up_write(&adev->reset_sem);
 }

 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 2975c4a6e581..d4c69f9577a4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -225,8 +225,10 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
 	if (finished->error < 0) {
 		DRM_INFO("Skip scheduling IBs!\n");
 	} else {
+		down_read(&ring->adev->reset_sem);
 		r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
 				       &fence);
+		up_read(&ring->adev->reset_sem);
 		if (r)
 			DRM_ERROR("Error scheduling IBs (%d)\n", r);
 	}
@@ -237,6 +239,7 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
 	amdgpu_job_free_resources(job);

 	fence = r ? ERR_PTR(r) : fence;
+
 	return fence;
 }

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 13ea8ebc421c..38a751f7d889 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -121,6 +121,7 @@ void amdgpu_ring_generic_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
 void amdgpu_ring_commit(struct amdgpu_ring *ring)
 {
 	uint32_t count;
+	struct amdgpu_device *adev = ring->adev;

 	/* We pad to match fetch size */
 	count = ring->funcs->align_mask + 1 -
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 5fd67e1cc2a0..97f33540aa02 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -244,11 +244,11 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
 	 * otherwise the mailbox msg will be ruined/reseted by
 	 * the VF FLR.
 	 *
-	 * we can unlock the lock_reset to allow "amdgpu_job_timedout"
+	 * we can unlock the reset_sem to allow "amdgpu_job_timedout"
 	 * to run gpu_recover() after FLR_NOTIFICATION_CMPL received
 	 * which means host side had finished this VF's FLR.
 	 */
-	locked = mutex_trylock(&adev->lock_reset);
+	locked = down_write_trylock(&adev->reset_sem);
 	if (locked)
 		adev->in_gpu_reset = true;

@@ -263,7 +263,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
 flr_done:
 	if (locked) {
 		adev->in_gpu_reset = false;
-		mutex_unlock(&adev->lock_reset);
+		up_write(&adev->reset_sem);
 	}

 	/* Trigger recovery for world switch failure if no TDR */
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index ce2bf1fb79ed..484d61c22396 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -265,11 +265,11 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
 	 * otherwise the mailbox msg will be ruined/reseted by
 	 * the VF FLR.
 	 *
-	 * we can unlock the lock_reset to allow "amdgpu_job_timedout"
+	 * we can unlock the reset_sem to allow "amdgpu_job_timedout"
 	 * to run gpu_recover() after FLR_NOTIFICATION_CMPL received
 	 * which means host side had finished this VF's FLR.
 	 */
-	locked = mutex_trylock(&adev->lock_reset);
+	locked = down_write_trylock(&adev->reset_sem);
 	if (locked)
 		adev->in_gpu_reset = true;

@@ -284,7 +284,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
 flr_done:
 	if (locked) {
 		adev->in_gpu_reset = false;
-		mutex_unlock(&adev->lock_reset);
+		up_write(&adev->reset_sem);
 	}

 	/* Trigger recovery for world switch failure if no TDR */
--
2.17.1
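
[Editor's note] For readers following along: the core of this patch is replacing
the single reset mutex with a reader/writer semaphore, so the many
hardware-access paths (job submission, TLB flushes, HQD destroy) can hold the
lock shared while a GPU reset takes it exclusively and drains them all. Below
is a minimal userspace sketch of that pattern using POSIX rwlocks rather than
the kernel's rw_semaphore; the names submit_work() and gpu_reset() are invented
for illustration and roughly correspond to the down_read()/up_read() callers
and to amdgpu_device_lock_adev()/amdgpu_device_unlock_adev() in the patch.

/*
 * Illustrative userspace analogy only -- NOT kernel code.
 * Build with: cc -pthread sketch.c
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t reset_sem = PTHREAD_RWLOCK_INITIALIZER;

/* Hardware-access paths take the lock shared: many may run at once. */
static void *submit_work(void *arg)
{
	pthread_rwlock_rdlock(&reset_sem);	/* ~ down_read() */
	printf("job %ld: touching hardware under read lock\n", (long)arg);
	pthread_rwlock_unlock(&reset_sem);	/* ~ up_read() */
	return NULL;
}

/* Reset takes the lock exclusive, waiting for all readers to drain. */
static bool gpu_reset(bool trylock)
{
	if (trylock) {
		/* ~ down_write_trylock(): back off if a reset is in flight */
		if (pthread_rwlock_trywrlock(&reset_sem) != 0)
			return false;
	} else {
		pthread_rwlock_wrlock(&reset_sem);	/* ~ down_write() */
	}

	printf("reset: exclusive access, no submitter can touch the GPU\n");
	pthread_rwlock_unlock(&reset_sem);	/* ~ up_write() */
	return true;
}

int main(void)
{
	pthread_t workers[4];
	long i;

	for (i = 0; i < 4; i++)
		pthread_create(&workers[i], NULL, submit_work, (void *)i);
	for (i = 0; i < 4; i++)
		pthread_join(workers[i], NULL);

	return gpu_reset(false) ? 0 : 1;
}

The win over the old mutex is exactly this sharing: submission, TLB-flush and
queue-destroy paths only need to exclude reset, not one another, so the read
side stays cheap in the common case, while the write side in the reset path
waits for every reader to finish before the PSP re-posts the hardware.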