The guilty handling tried to establish a second way of signaling problems with the GPU back to userspace. This caused quite a bunch of issue we had to work around, especially lifetime issues with the drm_sched_entity. Just drop the handling altogether and use the dma_fence based approach instead. Signed-off-by: Christian König <christian.koenig@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 5 ----- drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 25 ++++++++++++++++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h | 1 - drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 9 +------- 4 files changed, 24 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index 1e475eb01417..5c8a6396d924 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -59,11 +59,6 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, if (!p->ctx) return -EINVAL; - if (atomic_read(&p->ctx->guilty)) { - amdgpu_ctx_put(p->ctx); - return -ECANCELED; - } - amdgpu_sync_create(&p->sync); drm_exec_init(&p->exec, DRM_EXEC_INTERRUPTIBLE_WAIT | DRM_EXEC_IGNORE_DUPLICATES, 0); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c index c43d1b6e5d66..39fe7cc6e34e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c @@ -250,7 +250,7 @@ static int amdgpu_ctx_init_entity(struct amdgpu_ctx *ctx, u32 hw_ip, } r = drm_sched_entity_init(&entity->entity, drm_prio, scheds, num_scheds, - &ctx->guilty); + NULL); if (r) goto error_free_entity; @@ -572,6 +572,27 @@ static int amdgpu_ctx_query(struct amdgpu_device *adev, #define AMDGPU_RAS_COUNTE_DELAY_MS 3000 +static bool amdgpu_ctx_guilty(struct amdgpu_ctx *ctx) +{ + int i, j; + + for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) { + for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j) { + struct amdgpu_ctx_entity *ctx_entity; + + ctx_entity = ctx->entities[i][j]; + if (ctx_entity) + continue; + + if (drm_sched_entity_error(&ctx_entity->entity) == + -ETIME) + return true; + } + } + + return false; +} + static int amdgpu_ctx_query2(struct amdgpu_device *adev, struct amdgpu_fpriv *fpriv, uint32_t id, union drm_amdgpu_ctx_out *out) @@ -600,7 +621,7 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev, if (ctx->generation != amdgpu_vm_generation(adev, &fpriv->vm)) out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_VRAMLOST; - if (atomic_read(&ctx->guilty)) + if (amdgpu_ctx_guilty(ctx)) out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY; if (amdgpu_in_reset(adev)) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h index 85376baaa92f..45569cce484e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h @@ -53,7 +53,6 @@ struct amdgpu_ctx { bool preamble_presented; int32_t init_priority; int32_t override_priority; - atomic_t guilty; unsigned long ras_counter_ce; unsigned long ras_counter_ue; uint32_t stable_pstate; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index df19271130c6..26f72b8afde9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5330,14 +5330,10 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev) int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, struct amdgpu_reset_context *reset_context) { - int i, r = 0; - struct amdgpu_job *job = NULL; struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; bool need_full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); - - if (reset_context->reset_req_dev == adev) - job = reset_context->job; + int i, r; if (amdgpu_sriov_vf(adev)) amdgpu_virt_pre_reset(adev); @@ -5362,9 +5358,6 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, amdgpu_fence_driver_isr_toggle(adev, false); - if (job && job->vm) - drm_sched_increase_karma(&job->base); - r = amdgpu_reset_prepare_hwcontext(adev, reset_context); /* If reset handler not implemented, continue; otherwise return */ if (r == -EOPNOTSUPP) -- 2.34.1