Re: [RFC PATCH 1/1] drm/amdgpu: Protect the amdgpu_bo_list list with a mutex

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Am 12.07.22 um 07:39 schrieb Luben Tuikov:
Protect the struct amdgpu_bo_list with a mutex. This is used during command
submission in order to avoid buffer object corruption as recorded in
the link below.

Suggested-by: Christian König <christian.koenig@xxxxxxx>
Cc: Alex Deucher <Alexander.Deucher@xxxxxxx>
Cc: Andrey Grodzovsky <Andrey.Grodzovsky@xxxxxxx>
Cc: Vitaly Prosyak <Vitaly.Prosyak@xxxxxxx>
Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2048
Signed-off-by: Luben Tuikov <luben.tuikov@xxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c |  3 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h |  4 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c      | 31 +++++++++++++++++++--
  3 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
index 714178f1b6c6ed..2168163aad2d38 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
@@ -40,7 +40,7 @@ static void amdgpu_bo_list_free_rcu(struct rcu_head *rcu)
  {
  	struct amdgpu_bo_list *list = container_of(rcu, struct amdgpu_bo_list,
  						   rhead);
-
+	mutex_destroy(&list->bo_list_mutex);
  	kvfree(list);
  }
@@ -136,6 +136,7 @@ int amdgpu_bo_list_create(struct amdgpu_device *adev, struct drm_file *filp,
 
  	trace_amdgpu_cs_bo_status(list->num_entries, total_size);
 
+	mutex_init(&list->bo_list_mutex);
  	*result = list;
  	return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
index 044b41f0bfd9ce..717984d4de6858 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
@@ -48,6 +48,10 @@ struct amdgpu_bo_list {
  	struct amdgpu_bo *oa_obj;
  	unsigned first_userptr;
  	unsigned num_entries;
+
+	/* Protect access during command submission.
+	 */
+	struct mutex bo_list_mutex;
  };
int amdgpu_bo_list_get(struct amdgpu_fpriv *fpriv, int id,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 36ac1f1d11e6b4..0b2932c20ec777 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -517,6 +517,8 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
  			return r;
  	}
+ mutex_lock(&p->bo_list->bo_list_mutex);

That lock/unlock placement is not correct and probably the reason why you still run into trouble with this patch.

You need to grab the lock before the call to amdgpu_bo_list_get_list() and drop it either after a call to ttm_eu_backoff_reservation() or ttm_eu_fence_buffer_objects().

If the lock is dropped anywhere in between we would have list corruption again.

Regards,
Christian.

+
  	/* One for TTM and one for the CS job */
  	amdgpu_bo_list_for_each_entry(e, p->bo_list)
  		e->tv.num_shared = 2;
@@ -544,6 +546,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
  		if (!e->user_pages) {
  			DRM_ERROR("kvmalloc_array failure\n");
  			r = -ENOMEM;
+			mutex_unlock(&p->bo_list->bo_list_mutex);
  			goto out_free_user_pages;
  		}
@@ -551,6 +554,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
  		if (r) {
  			kvfree(e->user_pages);
  			e->user_pages = NULL;
+			mutex_unlock(&p->bo_list->bo_list_mutex);
  			goto out_free_user_pages;
  		}
@@ -568,6 +572,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
  	if (unlikely(r != 0)) {
  		if (r != -ERESTARTSYS)
  			DRM_ERROR("ttm_eu_reserve_buffers failed.\n");
+		mutex_unlock(&p->bo_list->bo_list_mutex);
  		goto out_free_user_pages;
  	}
@@ -580,11 +585,14 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
  			e->chain = dma_fence_chain_alloc();
  			if (!e->chain) {
  				r = -ENOMEM;
+				mutex_unlock(&p->bo_list->bo_list_mutex);
  				goto error_validate;
  			}
  		}
  	}
+ mutex_unlock(&p->bo_list->bo_list_mutex);






+
  	/* Move fence waiting after getting reservation lock of
  	 * PD root. Then there is no need on a ctx mutex lock.
  	 */
@@ -607,6 +615,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
  		goto error_validate;
  	}
+ mutex_lock(&p->bo_list->bo_list_mutex);
  	r = amdgpu_cs_list_validate(p, &duplicates);
  	if (r)
  		goto error_validate;
@@ -614,6 +623,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
  	r = amdgpu_cs_list_validate(p, &p->validated);
  	if (r)
  		goto error_validate;
+	mutex_unlock(&p->bo_list->bo_list_mutex);
amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved,
  				     p->bytes_moved_vis);
@@ -644,15 +654,18 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
error_validate:
  	if (r) {
+		mutex_lock(&p->bo_list->bo_list_mutex);
  		amdgpu_bo_list_for_each_entry(e, p->bo_list) {
  			dma_fence_chain_free(e->chain);
  			e->chain = NULL;
  		}
  		ttm_eu_backoff_reservation(&p->ticket, &p->validated);
+		mutex_unlock(&p->bo_list->bo_list_mutex);
  	}
out_free_user_pages:
  	if (r) {
+		mutex_lock(&p->bo_list->bo_list_mutex);
  		amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) {
  			struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
@@ -662,6 +675,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
  			kvfree(e->user_pages);
  			e->user_pages = NULL;
  		}
+		mutex_unlock(&p->bo_list->bo_list_mutex);
  	}
  	return r;
  }
@@ -704,6 +718,7 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error,
  	if (error && backoff) {
  		struct amdgpu_bo_list_entry *e;
+ mutex_lock(&parser->bo_list->bo_list_mutex);
  		amdgpu_bo_list_for_each_entry(e, parser->bo_list) {
  			dma_fence_chain_free(e->chain);
  			e->chain = NULL;
@@ -711,6 +726,7 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error,
ttm_eu_backoff_reservation(&parser->ticket,
  					   &parser->validated);
+		mutex_unlock(&parser->bo_list->bo_list_mutex);
  	}
for (i = 0; i < parser->num_post_deps; i++) {
@@ -839,6 +855,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
  			return r;
  	}
+ mutex_lock(&p->bo_list->bo_list_mutex);
  	amdgpu_bo_list_for_each_entry(e, p->bo_list) {
  		/* ignore duplicates */
  		bo = ttm_to_amdgpu_bo(e->tv.bo);
@@ -850,13 +867,18 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
  			continue;
r = amdgpu_vm_bo_update(adev, bo_va, false);
-		if (r)
+		if (r) {
+			mutex_unlock(&p->bo_list->bo_list_mutex);
  			return r;
+		}
r = amdgpu_sync_fence(&p->job->sync, bo_va->last_pt_update);
-		if (r)
+		if (r) {
+			mutex_unlock(&p->bo_list->bo_list_mutex);
  			return r;
+		}
  	}
+	mutex_unlock(&p->bo_list->bo_list_mutex);
r = amdgpu_vm_handle_moved(adev, vm);
  	if (r)
@@ -874,6 +896,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
if (amdgpu_vm_debug) {
  		/* Invalidate all BOs to test for userspace bugs */
+		mutex_lock(&p->bo_list->bo_list_mutex);
  		amdgpu_bo_list_for_each_entry(e, p->bo_list) {
  			struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
@@ -883,6 +906,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
 
  			amdgpu_vm_bo_invalidate(adev, bo, false);
  		}
+		mutex_unlock(&p->bo_list->bo_list_mutex);
  	}
return amdgpu_cs_sync_rings(p);
@@ -1249,6 +1273,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
  	 * added to BOs.
  	 */
  	mutex_lock(&p->adev->notifier_lock);
+	mutex_lock(&p->bo_list->bo_list_mutex);
/* If userptr are invalidated after amdgpu_cs_parser_bos(), return
  	 * -EAGAIN, drmIoctl in libdrm will restart the amdgpu_cs_ioctl.
@@ -1308,12 +1333,14 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
  	}
ttm_eu_fence_buffer_objects(&p->ticket, &p->validated, p->fence);
+	mutex_unlock(&p->bo_list->bo_list_mutex);
  	mutex_unlock(&p->adev->notifier_lock);
  	return 0;
 
 error_abort:
  	drm_sched_job_cleanup(&job->base);
+	mutex_unlock(&p->bo_list->bo_list_mutex);
  	mutex_unlock(&p->adev->notifier_lock);
error_unlock:




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux