On 4/12/19 3:39 AM, Christian König wrote:
> On 11.04.19 at 18:03, Andrey Grodzovsky wrote:
>> Also reject TDRs if another one already running.
>>
>> v2:
>> Stop all schedulers across device and entire XGMI hive before
>> force signaling HW fences.
>>
>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 125 ++++++++++++++++++++---------
>>   1 file changed, 88 insertions(+), 37 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index aabd043..ce9c494 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -3327,7 +3327,8 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
>>   static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>                        struct amdgpu_job *job,
>> -                     bool *need_full_reset_arg)
>> +                     bool *need_full_reset_arg,
>> +                     bool job_signaled)
>>   {
>>       int i, r = 0;
>>       bool need_full_reset = *need_full_reset_arg;
>> @@ -3339,8 +3340,6 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>           if (!ring || !ring->sched.thread)
>>               continue;
>>
>> -        drm_sched_stop(&ring->sched, &job->base);
>> -
>>           /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
>>           amdgpu_fence_driver_force_completion(ring);
>>       }
>> @@ -3358,7 +3357,8 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>
>> -    if (!amdgpu_sriov_vf(adev)) {
>> +    /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
>> +    if (!amdgpu_sriov_vf(adev) && !job_signaled) {
>>           if (!need_full_reset)
>>               need_full_reset = amdgpu_device_ip_need_full_reset(adev);
>>
>> @@ -3495,7 +3495,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>           return r;
>>   }
>>
>> -static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev)
>> +static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev, bool job_signaled)
>>   {
>>       int i;
>>
>> @@ -3505,7 +3505,8 @@ static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev)
>>           if (!ring || !ring->sched.thread)
>>               continue;
>>
>> -        if (!adev->asic_reset_res)
>> +        /* No point to resubmit jobs if we didn't HW reset*/
>> +        if (!adev->asic_reset_res && !job_signaled)
>>               drm_sched_resubmit_jobs(&ring->sched);
>>
>>           drm_sched_start(&ring->sched, !adev->asic_reset_res);
>> @@ -3518,14 +3519,21 @@ static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev)
>>       adev->asic_reset_res = 0;
>>   }
>>
>> -static void amdgpu_device_lock_adev(struct amdgpu_device *adev)
>> +static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
>>   {
>> -    mutex_lock(&adev->lock_reset);
>> +    if (trylock) {
>> +        if (!mutex_trylock(&adev->lock_reset))
>> +            return false;
>> +    } else
>> +        mutex_lock(&adev->lock_reset);
>> +
>>       atomic_inc(&adev->gpu_reset_counter);
>>       adev->in_gpu_reset = 1;
>>       /* Block kfd: SRIOV would do it separately */
>>       if (!amdgpu_sriov_vf(adev))
>>           amdgpu_amdkfd_pre_reset(adev);
>> +
>> +    return true;
>>   }
>>
>>   static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>> @@ -3553,38 +3561,43 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>>   int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>                     struct amdgpu_job *job)
>>   {
>> -    int r;
>> +    int r, i;
>
> BTW: Usual kernel coding style is to use reverse xmas tree. E.g.
> variables like "int i, r" last in the declarations.
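For reference, with reverse xmas tree ordering (longest declaration line
first, shortest last) the locals here would become something like:

    bool xgmi_topology_present, need_full_reset, job_signaled;
    struct list_head device_list, *device_list_handle = NULL;
    struct amdgpu_device *tmp_adev = NULL;
    struct amdgpu_hive_info *hive = NULL;
    int r, i;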
>
>>       struct amdgpu_hive_info *hive = NULL;
>> -    bool need_full_reset = false;
>>       struct amdgpu_device *tmp_adev = NULL;
>>       struct list_head device_list, *device_list_handle = NULL;
>> +    bool xgmi_topology_present, need_full_reset, job_signaled;
>>
>> +    need_full_reset = job_signaled = false;
>>       INIT_LIST_HEAD(&device_list);
>>
>>       dev_info(adev->dev, "GPU reset begin!\n");
>>
>> +    hive = amdgpu_get_xgmi_hive(adev, 0);
>
> The second parameter should actually be a bool, so please use false here.
>
>> +    xgmi_topology_present = hive && adev->gmc.xgmi.num_physical_nodes > 1;
>
> Why are we actually checking num_physical_nodes here?

That's the usual way of knowing you have an XGMI topology, but having
hive != NULL should be equivalent, so I can remove it.

>
>> +
>>       /*
>> -     * In case of XGMI hive disallow concurrent resets to be triggered
>> -     * by different nodes. No point also since the one node already executing
>> -     * reset will also reset all the other nodes in the hive.
>> +     * Here we trylock to avoid chain of resets executing from
>> +     * either trigger by jobs on different adevs in XGMI hive or jobs on
>> +     * different schedulers for same device while this tdr is running.
>
> Can we please completely drop the name "tdr" and just write "timeout
> handler"?
>
>> +     * We always reset all schedulers for device and all devices for XGMI
>> +     * hive so that should take care of them too.
>>       */
>> -    hive = amdgpu_get_xgmi_hive(adev, 0);
>> -    if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
>> -        !mutex_trylock(&hive->reset_lock))
>> +
>> +    if (xgmi_topology_present && !mutex_trylock(&hive->reset_lock)) {
>> +        DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
>> +             job->base.id, hive->hive_id);
>>           return 0;
>> +    }
>>
>>       /* Start with adev pre asic reset first for soft reset check.*/
>> -    amdgpu_device_lock_adev(adev);
>> -    r = amdgpu_device_pre_asic_reset(adev,
>> -                     job,
>> -                     &need_full_reset);
>> -    if (r) {
>> -        /*TODO Should we stop ?*/
>> -        DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
>> -              r, adev->ddev->unique);
>> -        adev->asic_reset_res = r;
>> +    if (!amdgpu_device_lock_adev(adev, !xgmi_topology_present)) {
>> +        DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
>> +             job->base.id);
>> +        return 0;
>>       }
>>
>> +    need_full_reset = amdgpu_device_ip_need_full_reset(adev);
>> +
>>       /* Build list of devices to reset */
>>       if (need_full_reset && adev->gmc.xgmi.num_physical_nodes > 1) {
>
> We should probably unconditionally stop all nodes on all devices first
> and then try to figure out if we actually need a full reset.
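If I understand you correctly, that would look roughly like this (rough
sketch, untested; it assumes the hive keeps its members on
hive->device_list as in the existing code):

    /* Build the list of devices to work on up front, hive or not */
    if (hive) {
        device_list_handle = &hive->device_list;
    } else {
        list_add_tail(&adev->gmc.xgmi.head, &device_list);
        device_list_handle = &device_list;
    }

    /* Unconditionally stop all schedulers on all nodes first */
    list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
        for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
            struct amdgpu_ring *ring = tmp_adev->rings[i];

            if (!ring || !ring->sched.thread)
                continue;

            drm_sched_stop(&ring->sched, &job->base);
        }
    }

    /* Only now figure out whether a full reset is actually needed */
    need_full_reset = amdgpu_device_ip_need_full_reset(adev);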
>
>>           if (!hive) {
>> @@ -3603,16 +3616,52 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>           device_list_handle = &device_list;
>>       }
>>
>> +    /* block all schedulers and reset given job's ring */
>> +    list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>> +        for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> +            struct amdgpu_ring *ring = tmp_adev->rings[i];
>> +
>> +            if (!ring || !ring->sched.thread)
>> +                continue;
>> +
>> +            drm_sched_stop(&ring->sched, &job->base);
>> +        }
>> +    }
>> +
>> +    /*
>> +     * Must check guilty signal here since after this point all old
>> +     * HW fences are force signaled.
>> +     *
>> +     * job->base holds a reference to parent fence
>> +     */
>> +    if (job && job->base.s_fence->parent &&
>> +        dma_fence_is_signaled(job->base.s_fence->parent))
>> +        job_signaled = true;
>
> It would probably be better if we have a "goto abort;" or something
> similar here.
>
> This also avoids giving the job_signaled variable to all the sub
> functions.
>
> Christian.

I agree this would be good in the case of amdgpu_device_pre_asic_reset,
because we can skip that function entirely if the guilty job already
signaled. But for amdgpu_device_post_asic_reset it creates complications:
drm_sched_start is right in the middle there, after drm_sched_resubmit_jobs
but before forcing the mode set back in display, so I prefer to keep
passing 'job_signaled' to amdgpu_device_post_asic_reset. The alternative is
to get rid of that function and bring its body into
amdgpu_device_gpu_recover, which is already pretty cluttered and confusing.
What do you think?

Andrey

>
>> +
>> +
>> +    /* Guilty job will be freed after this*/
>> +    r = amdgpu_device_pre_asic_reset(adev,
>> +                     job,
>> +                     &need_full_reset,
>> +                     job_signaled);
>> +    if (r) {
>> +        /*TODO Should we stop ?*/
>> +        DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
>> +              r, adev->ddev->unique);
>> +        adev->asic_reset_res = r;
>> +    }
>> +
>>   retry:    /* Rest of adevs pre asic reset from XGMI hive. */
>>       list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>>
>>           if (tmp_adev == adev)
>>               continue;
>>
>> -        amdgpu_device_lock_adev(tmp_adev);
>> +        amdgpu_device_lock_adev(tmp_adev, false);
>>           r = amdgpu_device_pre_asic_reset(tmp_adev,
>>                            NULL,
>> -                         &need_full_reset);
>> +                         &need_full_reset,
>> +                         job_signaled);
>>           /*TODO Should we stop ?*/
>>           if (r) {
>>               DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
>> @@ -3623,19 +3672,21 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>
>>       /* Actual ASIC resets if needed.*/
>>       /* TODO Implement XGMI hive reset logic for SRIOV */
>> -    if (amdgpu_sriov_vf(adev)) {
>> -        r = amdgpu_device_reset_sriov(adev, job ? false : true);
>> -        if (r)
>> -            adev->asic_reset_res = r;
>> -    } else {
>> -        r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset);
>> -        if (r && r == -EAGAIN)
>> -            goto retry;
>> +    if (!job || !job_signaled) {
>> +        if (amdgpu_sriov_vf(adev)) {
>> +            r = amdgpu_device_reset_sriov(adev, job ? false : true);
>> +            if (r)
>> +                adev->asic_reset_res = r;
>> +        } else {
>> +            r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset);
>> +            if (r && r == -EAGAIN)
>> +                goto retry;
>> +        }
>>       }
>>
>>       /* Post ASIC reset for all devs .*/
>>       list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>> -        amdgpu_device_post_asic_reset(tmp_adev);
>> +        amdgpu_device_post_asic_reset(tmp_adev, job_signaled);
>>
>>           if (r) {
>>               /* bad news, how to tell it to userspace ? */
>> @@ -3648,7 +3699,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>           amdgpu_device_unlock_adev(tmp_adev);
>>       }
>>
>> -    if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
>> +    if (xgmi_topology_present)
>>           mutex_unlock(&hive->reset_lock);
>>
>>       if (r)
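P.S. Just so we are talking about the same thing, here is my understanding
of the "goto" variant (rough sketch only; "skip_hw_reset" is a made-up
label):

    if (job && job->base.s_fence->parent &&
        dma_fence_is_signaled(job->base.s_fence->parent)) {
        job_signaled = true;
        goto skip_hw_reset;
    }

    /* ... amdgpu_device_pre_asic_reset() and the actual ASIC reset ... */

skip_hw_reset:
    /*
     * Restart schedulers without resubmitting jobs. The part I don't see
     * how to untangle is that drm_sched_start sits between
     * drm_sched_resubmit_jobs and the display mode set forcing inside
     * amdgpu_device_post_asic_reset.
     */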