Re: [PATCH 2/4] drm/amdgpu: fix userptr HMM range handling

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Am 2022-11-15 um 08:37 schrieb Christian König:
Am 10.11.22 um 22:55 schrieb Felix Kuehling:

Am 2022-11-10 um 08:00 schrieb Christian König:
The basic problem here is that it's not allowed to page fault while
holding the reservation lock.

So it can happen that multiple processes try to validate an userptr
at the same time.

Work around that by putting the HMM range object into the mutex
protected bo list for now.

Signed-off-by: Christian König <christian.koenig@xxxxxxx>
CC: stable@xxxxxxxxxxxxxxx
---
  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 12 +++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c   |  1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h   |  3 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c        |  8 +--
  drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c       |  6 ++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c       | 50 +++++--------------
  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h       | 14 ++++--
  7 files changed, 43 insertions(+), 51 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index c5c9bfa2772e..83659e6419a8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -940,6 +940,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr,
      struct amdkfd_process_info *process_info = mem->process_info;
      struct amdgpu_bo *bo = mem->bo;
      struct ttm_operation_ctx ctx = { true, false };
+    struct hmm_range *range;

I'd feel better if these local hmm_range pointers here and in update_invalid_user_pages and amdgpu_gem_userptr_ioctl were initialized to NULL. amdgpu_ttm_tt_get_user_pages leaves it uninitialized in case of errors and amdgpu_ttm_tt_get_user_pages_done checks for !range.

I've opted to initializing range to NULL in case of an error.


With that fixed, the patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling@xxxxxxx>

Does that still counts?

If you don't mind, please send the updated patch again. It's always good to have a second pair of eyes on error handling.

Thanks,
  Felix



Thanks,
Christian.



      int ret = 0;
        mutex_lock(&process_info->lock);
@@ -969,7 +970,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr,
          return 0;
      }
  -    ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
+    ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages, &range);
      if (ret) {
          pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
          goto unregister_out;
@@ -987,7 +988,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr,
      amdgpu_bo_unreserve(bo);
    release_out:
-    amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
+    amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, range);
  unregister_out:
      if (ret)
          amdgpu_mn_unregister(bo);
@@ -2319,6 +2320,8 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info,       /* Go through userptr_inval_list and update any invalid user_pages */
      list_for_each_entry(mem, &process_info->userptr_inval_list,
                  validate_list.head) {
+        struct hmm_range *range;
+
          invalid = atomic_read(&mem->invalid);
          if (!invalid)
              /* BO hasn't been invalidated since the last
@@ -2329,7 +2332,8 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
          bo = mem->bo;
            /* Get updated user pages */
-        ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
+        ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages,
+                           &range);
          if (ret) {
              pr_debug("Failed %d to get user pages\n", ret);
  @@ -2348,7 +2352,7 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
               * FIXME: Cannot ignore the return code, must hold
               * notifier_lock
               */
- amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
+            amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, range);
          }
            /* Mark the BO as valid unless it was invalidated
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
index 2168163aad2d..252a876b0725 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
@@ -209,6 +209,7 @@ void amdgpu_bo_list_get_list(struct amdgpu_bo_list *list,
              list_add_tail(&e->tv.head, &bucket[priority]);
            e->user_pages = NULL;
+        e->range = NULL;
      }
        /* Connect the sorted buckets in the output list. */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
index 9caea1688fc3..e4d78491bcc7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
@@ -26,6 +26,8 @@
  #include <drm/ttm/ttm_execbuf_util.h>
  #include <drm/amdgpu_drm.h>
  +struct hmm_range;
+
  struct amdgpu_device;
  struct amdgpu_bo;
  struct amdgpu_bo_va;
@@ -36,6 +38,7 @@ struct amdgpu_bo_list_entry {
      struct amdgpu_bo_va        *bo_va;
      uint32_t            priority;
      struct page            **user_pages;
+    struct hmm_range        *range;
      bool                user_invalidated;
  };
  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index d371000a5727..7f9cedd8e157 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -910,7 +910,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
              goto out_free_user_pages;
          }
  -        r = amdgpu_ttm_tt_get_user_pages(bo, e->user_pages);
+        r = amdgpu_ttm_tt_get_user_pages(bo, e->user_pages, &e->range);
          if (r) {
              kvfree(e->user_pages);
              e->user_pages = NULL;
@@ -988,9 +988,10 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
            if (!e->user_pages)
              continue;
-        amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
+        amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, e->range);
          kvfree(e->user_pages);
          e->user_pages = NULL;
+        e->range = NULL;
      }
      mutex_unlock(&p->bo_list->bo_list_mutex);
      return r;
@@ -1265,7 +1266,8 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
      amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) {
          struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
  -        r |= !amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
+        r |= !amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, e->range);
+        e->range = NULL;
      }
      if (r) {
          r = -EAGAIN;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 111484ceb47d..91571b1324f2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -378,6 +378,7 @@ int amdgpu_gem_userptr_ioctl(struct drm_device *dev, void *data,
      struct amdgpu_device *adev = drm_to_adev(dev);
      struct drm_amdgpu_gem_userptr *args = data;
      struct drm_gem_object *gobj;
+    struct hmm_range *range;
      struct amdgpu_bo *bo;
      uint32_t handle;
      int r;
@@ -418,7 +419,8 @@ int amdgpu_gem_userptr_ioctl(struct drm_device *dev, void *data,
          goto release_object;
        if (args->flags & AMDGPU_GEM_USERPTR_VALIDATE) {
-        r = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
+        r = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages,
+                         &range);
          if (r)
              goto release_object;
  @@ -441,7 +443,7 @@ int amdgpu_gem_userptr_ioctl(struct drm_device *dev, void *data,
    user_pages_done:
      if (args->flags & AMDGPU_GEM_USERPTR_VALIDATE)
-        amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
+        amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, range);
    release_object:
      drm_gem_object_put(gobj);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 76a8ebfc9e71..a56d28bd23be 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -642,9 +642,6 @@ struct amdgpu_ttm_tt {
      struct task_struct    *usertask;
      uint32_t        userflags;
      bool            bound;
-#if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
-    struct hmm_range    *range;
-#endif
  };
    #define ttm_to_amdgpu_ttm_tt(ptr)    container_of(ptr, struct amdgpu_ttm_tt, ttm)
@@ -657,7 +654,8 @@ struct amdgpu_ttm_tt {
   * Calling function must call amdgpu_ttm_tt_userptr_range_done() once and only
   * once afterwards to stop HMM tracking
   */
-int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages) +int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages,
+                 struct hmm_range **range)
  {
      struct ttm_tt *ttm = bo->tbo.ttm;
      struct amdgpu_ttm_tt *gtt = ttm_to_amdgpu_ttm_tt(ttm);
@@ -673,10 +671,6 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages)
          return -EFAULT;
      }
  -    /* Another get_user_pages is running at the same time?? */
-    if (WARN_ON(gtt->range))
-        return -EFAULT;
-
      if (!mmget_not_zero(mm)) /* Happens during process shutdown */
          return -ESRCH;
  @@ -694,7 +688,7 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages)
        readonly = amdgpu_ttm_tt_is_readonly(ttm);
      r = amdgpu_hmm_range_get_pages(&bo->notifier, mm, pages, start,
-                       ttm->num_pages, &gtt->range, readonly,
+                       ttm->num_pages, range, readonly,
                         true, NULL);
  out_unlock:
      mmap_read_unlock(mm);
@@ -712,30 +706,24 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages)
   *
   * Returns: true if pages are still valid
   */
-bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm)
+bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm,
+                       struct hmm_range *range)
  {
      struct amdgpu_ttm_tt *gtt = ttm_to_amdgpu_ttm_tt(ttm);
-    bool r = false;
  -    if (!gtt || !gtt->userptr)
+    if (!gtt || !gtt->userptr || !range)
          return false;
        DRM_DEBUG_DRIVER("user_pages_done 0x%llx pages 0x%x\n",
          gtt->userptr, ttm->num_pages);
  -    WARN_ONCE(!gtt->range || !gtt->range->hmm_pfns,
-        "No user pages to check\n");
+    WARN_ONCE(!range->hmm_pfns, "No user pages to check\n");
  -    if (gtt->range) {
-        /*
-         * FIXME: Must always hold notifier_lock for this, and must
-         * not ignore the return code.
-         */
-        r = amdgpu_hmm_range_get_pages_done(gtt->range);
-        gtt->range = NULL;
-    }
-
-    return !r;
+    /*
+     * FIXME: Must always hold notifier_lock for this, and must
+     * not ignore the return code.
+     */
+    return !amdgpu_hmm_range_get_pages_done(range);
  }
  #endif
  @@ -812,20 +800,6 @@ static void amdgpu_ttm_tt_unpin_userptr(struct ttm_device *bdev,
      /* unmap the pages mapped to the device */
      dma_unmap_sgtable(adev->dev, ttm->sg, direction, 0);
      sg_free_table(ttm->sg);
-
-#if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
-    if (gtt->range) {
-        unsigned long i;
-
-        for (i = 0; i < ttm->num_pages; i++) {
-            if (ttm->pages[i] !=
- hmm_pfn_to_page(gtt->range->hmm_pfns[i]))
-                break;
-        }
-
-        WARN((i == ttm->num_pages), "Missing get_user_page_done\n");
-    }
-#endif
  }
    static void amdgpu_ttm_gart_bind(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
index 6a70818039dd..a37207011a69 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
@@ -39,6 +39,8 @@
    #define AMDGPU_POISON    0xd0bed0be
  +struct hmm_range;
+
  struct amdgpu_gtt_mgr {
      struct ttm_resource_manager manager;
      struct drm_mm mm;
@@ -149,15 +151,19 @@ void amdgpu_ttm_recover_gart(struct ttm_buffer_object *tbo);   uint64_t amdgpu_ttm_domain_start(struct amdgpu_device *adev, uint32_t type);
    #if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
-int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages);
-bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm);
+int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages,
+                 struct hmm_range **range);
+bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm,
+                       struct hmm_range *range);
  #else
  static inline int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo,
-                           struct page **pages)
+                           struct page **pages,
+                           struct hmm_range **range)
  {
      return -EPERM;
  }
-static inline bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm) +static inline bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm,
+                             struct hmm_range *range)
  {
      return false;
  }




[Index of Archives]     [Linux Kernel]     [Kernel Development Newbies]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite Hiking]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux