This reverts commit e68efb27647f2106d6b545667f35b2ea39746b57. We see that the bo validate list gets corrupted, in amdgpu_cs_list_validate(), the lobj->tv.bo is NULL. Then getting usermm on the next line, references a NULL bo and we get a koops. Bisecting leads to the commit being reverted as the cause of the list corruption. See the link below for details of the corruption failure. Cc: Christian König <christian.koenig@xxxxxxx> Cc: Andrey Grodzovsky <Andrey.Grodzovsky@xxxxxxx> Cc: Alex Deucher <Alexander.Deucher@xxxxxxx> Cc: Vitaly Prosyak <Vitaly.Prosyak@xxxxxxx> Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2048#note_1427539 Signed-off-by: Luben Tuikov <luben.tuikov@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 16 +++++----------- drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h | 1 + 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index 36ac1f1d11e6b4..e619031753b927 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -128,6 +128,8 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs goto free_chunk; } + mutex_lock(&p->ctx->lock); + /* skip guilty context job */ if (atomic_read(&p->ctx->guilty) == 1) { ret = -ECANCELED; @@ -585,16 +587,6 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, } } - /* Move fence waiting after getting reservation lock of - * PD root. Then there is no need on a ctx mutex lock. - */ - r = amdgpu_ctx_wait_prev_fence(p->ctx, p->entity); - if (unlikely(r != 0)) { - if (r != -ERESTARTSYS) - DRM_ERROR("amdgpu_ctx_wait_prev_fence failed.\n"); - goto error_validate; - } - amdgpu_cs_get_threshold_for_moves(p->adev, &p->bytes_moved_threshold, &p->bytes_moved_vis_threshold); p->bytes_moved = 0; @@ -722,6 +714,7 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error, dma_fence_put(parser->fence); if (parser->ctx) { + mutex_unlock(&parser->ctx->lock); amdgpu_ctx_put(parser->ctx); } if (parser->bo_list) @@ -965,7 +958,7 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev, if (parser->job->uf_addr && ring->funcs->no_user_fence) return -EINVAL; - return 0; + return amdgpu_ctx_wait_prev_fence(parser->ctx, parser->entity); } static int amdgpu_cs_process_fence_dep(struct amdgpu_cs_parser *p, @@ -1384,6 +1377,7 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) goto out; r = amdgpu_cs_submit(&parser, cs); + out: amdgpu_cs_parser_fini(&parser, r, reserved_buffers); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c index 53f9268366f29e..2ef5296216d64d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c @@ -286,6 +286,7 @@ static int amdgpu_ctx_init(struct amdgpu_ctx_mgr *mgr, int32_t priority, kref_init(&ctx->refcount); ctx->mgr = mgr; spin_lock_init(&ctx->ring_lock); + mutex_init(&ctx->lock); ctx->reset_counter = atomic_read(&mgr->adev->gpu_reset_counter); ctx->reset_counter_query = ctx->reset_counter; @@ -400,6 +401,7 @@ static void amdgpu_ctx_fini(struct kref *ref) drm_dev_exit(idx); } + mutex_destroy(&ctx->lock); kfree(ctx); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h index 0fa0e56daf67e9..cc7c8afff4144c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h @@ -53,6 +53,7 @@ struct amdgpu_ctx { bool preamble_presented; int32_t init_priority; int32_t override_priority; + struct mutex lock; atomic_t guilty; unsigned long ras_counter_ce; unsigned long ras_counter_ue; base-commit: f4b3c543a2a759ce657de4b6b7e88eaddee85ec2 -- 2.36.1.74.g277cf0bc36