Re: [RFC PATCH 1/1] drm/amdgpu: Protect the amdgpu_bo_list list with a mutex

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi guys,

I need this fixed for the gang submit branch. So I'm going to pick up this patch and make the necessary modifications to make it stable.

Thanks,
Christian.

Am 12.07.22 um 10:07 schrieb Christian König:
Am 12.07.22 um 07:39 schrieb Luben Tuikov:
Protect the struct amdgpu_bo_list with a mutex. This is used during command
submission in order to avoid buffer object corruption as recorded in
the link below.

Suggested-by: Christian König <christian.koenig@xxxxxxx>
Cc: Alex Deucher <Alexander.Deucher@xxxxxxx>
Cc: Andrey Grodzovsky <Andrey.Grodzovsky@xxxxxxx>
Cc: Vitaly Prosyak <Vitaly.Prosyak@xxxxxxx>
Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2048
Signed-off-by: Luben Tuikov <luben.tuikov@xxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c |  3 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h |  4 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c      | 31 +++++++++++++++++++--
  3 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
index 714178f1b6c6ed..2168163aad2d38 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
@@ -40,7 +40,7 @@ static void amdgpu_bo_list_free_rcu(struct rcu_head *rcu)
  {
      struct amdgpu_bo_list *list = container_of(rcu, struct amdgpu_bo_list,
                             rhead);
-
+    mutex_destroy(&list->bo_list_mutex);
      kvfree(list);
  }
  @@ -136,6 +136,7 @@ int amdgpu_bo_list_create(struct amdgpu_device *adev, struct drm_file *filp,
        trace_amdgpu_cs_bo_status(list->num_entries, total_size);
  +    mutex_init(&list->bo_list_mutex);
      *result = list;
      return 0;
  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
index 044b41f0bfd9ce..717984d4de6858 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
@@ -48,6 +48,10 @@ struct amdgpu_bo_list {
      struct amdgpu_bo *oa_obj;
      unsigned first_userptr;
      unsigned num_entries;
+
+    /* Protect access during command submission.
+     */
+    struct mutex bo_list_mutex;
  };
    int amdgpu_bo_list_get(struct amdgpu_fpriv *fpriv, int id,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 36ac1f1d11e6b4..0b2932c20ec777 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -517,6 +517,8 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
              return r;
      }
  +    mutex_lock(&p->bo_list->bo_list_mutex);

That lock/unlock placement is not correct and probably the reason why you still run into trouble with this patch.

You need to grab the lock before the call to amdgpu_bo_list_get_list() and drop it either after a call to ttm_eu_backoff_reservation() or ttm_eu_fence_buffer_objects().

If the lock is dropped anywhere in between we would have list corruption again.

Regards,
Christian.

+
      /* One for TTM and one for the CS job */
      amdgpu_bo_list_for_each_entry(e, p->bo_list)
          e->tv.num_shared = 2;
@@ -544,6 +546,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
          if (!e->user_pages) {
              DRM_ERROR("kvmalloc_array failure\n");
              r = -ENOMEM;
+            mutex_unlock(&p->bo_list->bo_list_mutex);
              goto out_free_user_pages;
          }
  @@ -551,6 +554,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
          if (r) {
              kvfree(e->user_pages);
              e->user_pages = NULL;
+            mutex_unlock(&p->bo_list->bo_list_mutex);
              goto out_free_user_pages;
          }
  @@ -568,6 +572,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
      if (unlikely(r != 0)) {
          if (r != -ERESTARTSYS)
              DRM_ERROR("ttm_eu_reserve_buffers failed.\n");
+        mutex_unlock(&p->bo_list->bo_list_mutex);
          goto out_free_user_pages;
      }
  @@ -580,11 +585,14 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
              e->chain = dma_fence_chain_alloc();
              if (!e->chain) {
                  r = -ENOMEM;
+                mutex_unlock(&p->bo_list->bo_list_mutex);
                  goto error_validate;
              }
          }
      }
  +    mutex_unlock(&p->bo_list->bo_list_mutex);






+
      /* Move fence waiting after getting reservation lock of
       * PD root. Then there is no need on a ctx mutex lock.
       */
@@ -607,6 +615,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
          goto error_validate;
      }
  +    mutex_lock(&p->bo_list->bo_list_mutex);
      r = amdgpu_cs_list_validate(p, &duplicates);
      if (r)
          goto error_validate;
@@ -614,6 +623,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
      r = amdgpu_cs_list_validate(p, &p->validated);
      if (r)
          goto error_validate;
+    mutex_unlock(&p->bo_list->bo_list_mutex);
        amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved,
                       p->bytes_moved_vis);
@@ -644,15 +654,18 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
    error_validate:
      if (r) {
+        mutex_lock(&p->bo_list->bo_list_mutex);
          amdgpu_bo_list_for_each_entry(e, p->bo_list) {
              dma_fence_chain_free(e->chain);
              e->chain = NULL;
          }
          ttm_eu_backoff_reservation(&p->ticket, &p->validated);
+        mutex_unlock(&p->bo_list->bo_list_mutex);
      }
    out_free_user_pages:
      if (r) {
+        mutex_lock(&p->bo_list->bo_list_mutex);
          amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) {
              struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
  @@ -662,6 +675,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
              kvfree(e->user_pages);
              e->user_pages = NULL;
          }
+        mutex_unlock(&p->bo_list->bo_list_mutex);
      }
      return r;
  }
@@ -704,6 +718,7 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error,
      if (error && backoff) {
          struct amdgpu_bo_list_entry *e;
+        mutex_lock(&parser->bo_list->bo_list_mutex);
          amdgpu_bo_list_for_each_entry(e, parser->bo_list) {
              dma_fence_chain_free(e->chain);
              e->chain = NULL;
@@ -711,6 +726,7 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error,
            ttm_eu_backoff_reservation(&parser->ticket,
                         &parser->validated);
+        mutex_unlock(&parser->bo_list->bo_list_mutex);
      }
        for (i = 0; i < parser->num_post_deps; i++) {
@@ -839,6 +855,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
              return r;
      }
  +    mutex_lock(&p->bo_list->bo_list_mutex);
      amdgpu_bo_list_for_each_entry(e, p->bo_list) {
          /* ignore duplicates */
          bo = ttm_to_amdgpu_bo(e->tv.bo);
@@ -850,13 +867,18 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
              continue;
            r = amdgpu_vm_bo_update(adev, bo_va, false);
-        if (r)
+        if (r) {
+            mutex_unlock(&p->bo_list->bo_list_mutex);
              return r;
+        }
            r = amdgpu_sync_fence(&p->job->sync, bo_va->last_pt_update);
-        if (r)
+        if (r) {
+            mutex_unlock(&p->bo_list->bo_list_mutex);
              return r;
+        }
      }
+    mutex_unlock(&p->bo_list->bo_list_mutex);
        r = amdgpu_vm_handle_moved(adev, vm);
      if (r)
@@ -874,6 +896,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
        if (amdgpu_vm_debug) {
          /* Invalidate all BOs to test for userspace bugs */
+        mutex_lock(&p->bo_list->bo_list_mutex);
          amdgpu_bo_list_for_each_entry(e, p->bo_list) {
              struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
  @@ -883,6 +906,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
                amdgpu_vm_bo_invalidate(adev, bo, false);
          }
+        mutex_unlock(&p->bo_list->bo_list_mutex);
      }
        return amdgpu_cs_sync_rings(p);
@@ -1249,6 +1273,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
       * added to BOs.
       */
      mutex_lock(&p->adev->notifier_lock);
+    mutex_lock(&p->bo_list->bo_list_mutex);
        /* If userptr are invalidated after amdgpu_cs_parser_bos(), return
       * -EAGAIN, drmIoctl in libdrm will restart the amdgpu_cs_ioctl.
@@ -1308,12 +1333,14 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
      }
        ttm_eu_fence_buffer_objects(&p->ticket, &p->validated, p->fence);
+    mutex_unlock(&p->bo_list->bo_list_mutex);
      mutex_unlock(&p->adev->notifier_lock);
        return 0;
    error_abort:
      drm_sched_job_cleanup(&job->base);
+    mutex_unlock(&p->bo_list->bo_list_mutex);
      mutex_unlock(&p->adev->notifier_lock);
    error_unlock:





[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux