[Why] After embeded hw_fence to amdgpu_job, we need to add tdr support for this feature. [How] 1. Add a resubmit_flag for resubmit jobs. 2. Clear job fence from RCU and force complete vm flush fences in pre_asic_reset 3. skip dma_fence_get for resubmit jobs and add a dma_fence_put for guilty jobs. Signed-off-by: Jack Zhang <Jack.Zhang1@xxxxxxx> Signed-off-by: Jingwen Chen <Jingwen.Chen2@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 13 +++++++++---- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 4 +++- drivers/gpu/drm/scheduler/sched_main.c | 1 + include/drm/gpu_scheduler.h | 1 + 5 files changed, 25 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 280b1940e892..df73fe666e87 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4382,7 +4382,7 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev) int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, struct amdgpu_reset_context *reset_context) { - int i, r = 0; + int i, j, r = 0; struct amdgpu_job *job = NULL; bool need_full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); @@ -4406,6 +4406,16 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, if (!ring || !ring->sched.thread) continue; + /*clear job fence from fence drv to avoid force_completion + *leave NULL and vm flush fence in fence drv */ + for (j = 0; j <= ring->fence_drv.num_fences_mask; j ++) { + struct dma_fence *old,**ptr; + ptr = &ring->fence_drv.fences[j]; + old = rcu_dereference_protected(*ptr, 1); + if (old && test_bit(DMA_FENCE_FLAG_USER_BITS, &old->flags))) { + RCU_INIT_POINTER(*ptr, NULL); + } + } /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ amdgpu_fence_driver_force_completion(ring); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index eecf21d8ec33..d5b3d5f8f951 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c @@ -157,10 +157,15 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, struct amd } seq = ++ring->fence_drv.sync_seq; - dma_fence_init(fence, &amdgpu_fence_ops, - &ring->fence_drv.lock, - adev->fence_context + ring->idx, - seq); + if (job != NULL && job->base.resubmit_flag == 1) { + /* reset seq for resubmitted jobs */ + fence->seqno = seq; + } else { + dma_fence_init(fence, &amdgpu_fence_ops, + &ring->fence_drv.lock, + adev->fence_context + ring->idx, + seq); + } if (job != NULL) { /* mark this fence has a parent job */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 7c426e225b24..d6f848adc3f4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -241,6 +241,7 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job) dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if VRAM lost */ if (finished->error < 0) { + dma_fence_put(&job->hw_fence); DRM_INFO("Skip scheduling IBs!\n"); } else { r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job, @@ -249,7 +250,8 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job) DRM_ERROR("Error scheduling IBs (%d)\n", r); } - dma_fence_get(fence); + if (!job->base.resubmit_flag) + dma_fence_get(fence); amdgpu_job_free_resources(job); fence = r ? ERR_PTR(r) : fence; diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index f4f474944169..5a36ab5aea2d 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -544,6 +544,7 @@ void drm_sched_resubmit_jobs_ext(struct drm_gpu_scheduler *sched, int max) dma_fence_set_error(&s_fence->finished, -ECANCELED); dma_fence_put(s_job->s_fence->parent); + s_job->resubmit_flag = 1; fence = sched->ops->run_job(s_job); i++; diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index 4ea8606d91fe..06c101af1f71 100644 --- a/include/drm/gpu_scheduler.h +++ b/include/drm/gpu_scheduler.h @@ -198,6 +198,7 @@ struct drm_sched_job { enum drm_sched_priority s_priority; struct drm_sched_entity *entity; struct dma_fence_cb cb; + int resubmit_flag; }; static inline bool drm_sched_invalidate_job(struct drm_sched_job *s_job, -- 2.25.1 _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx