Re: [PATCH] drm/amdgpu: Unlocked unmap only clear page table leaves

On 2025-01-15 06:01, Christian König wrote:
Am 14.01.25 um 15:53 schrieb Philip Yang:
SVM migration unmaps pages from the GPU and then updates the mapping to
the GPU to recover the page fault. Currently unmap clears the PDE entry
for a range length >= huge page and frees the PTB bo, and the mapping
update then allocates a new PT bo. There is a race bug where the freed
entry bo may still be on the pt_free list, gets reused when updating the
mapping and is then freed, leaving an invalid PDE entry and causing a
GPU page fault.

Set the update to clear only one PDE entry or clear the PTB, so that
unmap avoids freeing the PTE bo. This fixes the race bug and improves
the unmap and map-to-GPU performance. Updating the mapping to a huge
page will still free the PTB bo.

With this change, the vm->pt_freed list and work are not needed. Add
WARN_ON(unlocked) in amdgpu_vm_pt_free_dfs to catch if unmap frees the
PTB.

Signed-off-by: Philip Yang <Philip.Yang@xxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c    |  4 ---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h    |  4 ---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 43 +++++++----------------
  3 files changed, 13 insertions(+), 38 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index c9c48b782ec1..48b2c0b3b315 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2440,8 +2440,6 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
      spin_lock_init(&vm->status_lock);
      INIT_LIST_HEAD(&vm->freed);
      INIT_LIST_HEAD(&vm->done);
-    INIT_LIST_HEAD(&vm->pt_freed);
-    INIT_WORK(&vm->pt_free_work, amdgpu_vm_pt_free_work);
      INIT_KFIFO(vm->faults);
 
      r = amdgpu_vm_init_entities(adev, vm);
@@ -2613,8 +2611,6 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
 
      amdgpu_amdkfd_gpuvm_destroy_cb(adev, vm);
 
-    flush_work(&vm->pt_free_work);
-
      root = amdgpu_bo_ref(vm->root.bo);
      amdgpu_bo_reserve(root, true);
      amdgpu_vm_put_task_info(vm->task_info);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 5d119ac26c4f..160889e5e64d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -369,10 +369,6 @@ struct amdgpu_vm {
      /* BOs which are invalidated, has been updated in the PTs */
      struct list_head        done;
 
-    /* PT BOs scheduled to free and fill with zero if vm_resv is not hold */
-    struct list_head    pt_freed;
-    struct work_struct    pt_free_work;
-
      /* contains the page directory */
      struct amdgpu_vm_bo_base     root;
      struct dma_fence    *last_update;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
index f78a0434a48f..54ae0e9bc6d7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
@@ -546,27 +546,6 @@ static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry)
      amdgpu_bo_unref(&entry->bo);
  }
 
-void amdgpu_vm_pt_free_work(struct work_struct *work)
-{
-    struct amdgpu_vm_bo_base *entry, *next;
-    struct amdgpu_vm *vm;
-    LIST_HEAD(pt_freed);
-
-    vm = container_of(work, struct amdgpu_vm, pt_free_work);
-
-    spin_lock(&vm->status_lock);
-    list_splice_init(&vm->pt_freed, &pt_freed);
-    spin_unlock(&vm->status_lock);
-
-    /* flush_work in amdgpu_vm_fini ensure vm->root.bo is valid. */
-    amdgpu_bo_reserve(vm->root.bo, true);
-
-    list_for_each_entry_safe(entry, next, &pt_freed, vm_status)
-        amdgpu_vm_pt_free(entry);
-
-    amdgpu_bo_unreserve(vm->root.bo);
-}
-
  /**
   * amdgpu_vm_pt_free_list - free PD/PT levels
   *
@@ -579,19 +558,15 @@ void amdgpu_vm_pt_free_list(struct amdgpu_device *adev,
                  struct amdgpu_vm_update_params *params)
  {
      struct amdgpu_vm_bo_base *entry, *next;
-    struct amdgpu_vm *vm = params->vm;
      bool unlocked = params->unlocked;
 
      if (list_empty(&params->tlb_flush_waitlist))
          return;
 
-    if (unlocked) {
-        spin_lock(&vm->status_lock);
-        list_splice_init(&params->tlb_flush_waitlist, &vm->pt_freed);
-        spin_unlock(&vm->status_lock);
-        schedule_work(&vm->pt_free_work);
-        return;
-    }
+    /*
+     * Unlocked unmap only clears page table leaves; warn if it would free a page table entry.
+     */
+    WARN_ON(unlocked);
 
      list_for_each_entry_safe(entry, next, &params->tlb_flush_waitlist, vm_status)
          amdgpu_vm_pt_free(entry);
@@ -899,7 +874,15 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params,
          incr = (uint64_t)AMDGPU_GPU_PAGE_SIZE << shift;
          mask = amdgpu_vm_pt_entries_mask(adev, cursor.level);
          pe_start = ((cursor.pfn >> shift) & mask) * 8;
-        entry_end = ((uint64_t)mask + 1) << shift;
+
+        if (cursor.level < AMDGPU_VM_PTB && params->unlocked)
+            /*
+             * Unlocked unmap of a huge page from the MMU notifier callback: the leaf is a PDE entry,
+             * so only clear one entry, then walk again to find the next PDE or PTE leaf.
+             */
+            entry_end = 1ULL << shift;
+        else
+            entry_end = ((uint64_t)mask + 1) << shift;

That here looks too much like a hack for my taste. entry_end basically denotes the end of the page tables and not the updated region.
yes, agree.

We already set frag_end to this here:

frag_end = max(frag_end, ALIGN(frag_start + 1, 1ULL << shift));

This seems to be the root cause. For example, when unmapping, frag_start is an 8MB-aligned address and frag_end is frag_start + 8MB, but for an unlocked update we want to do the page walk again after unmapping 2MB.

Also, the max(...) seems meaningless, as frag_end is always >= frag_start + 1, so this could be changed to:

if (params->unlocked)

   frag_end = frag_start + 1;
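
As a quick userspace illustration of the difference (not amdgpu code; the standalone ALIGN macro and the numbers are made up for the sketch), the ALIGN() form extends frag_end to the next entry boundary while the clamp stops right after the current position:

/* toy_align.c - userspace illustration only, not amdgpu code */
#include <stdio.h>
#include <stdint.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

int main(void)
{
	uint64_t shift = 9;		/* one entry covers 1 << 9 pages here */
	uint64_t frag_start = 3;	/* position inside the entry */
	uint64_t frag_end = 5;		/* fragment smaller than the entry */
	uint64_t aligned = ALIGN(frag_start + 1, 1ULL << shift);

	/* existing behaviour: extend frag_end to the next entry boundary */
	uint64_t extended = frag_end > aligned ? frag_end : aligned;
	/* proposed unlocked clamp: stop right after the current position */
	uint64_t clamped = frag_start + 1;

	printf("extended frag_end = %llu\n", (unsigned long long)extended);	/* 512 */
	printf("clamped  frag_end = %llu\n", (unsigned long long)clamped);	/* 4 */
	return 0;
}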



Which basically means that we should update just one entry at a time and then walk the page tables again.

The issue is just that we immediately calculate a new fragment after each update:

                        if (frag_start >= frag_end) {
                                /* figure out the next fragment */
                                amdgpu_vm_pte_fragment(params, frag_start, end,
                                                       flags, &frag, &frag_end);
                                if (frag < shift)
                                        break;
                        }
And that looks like the place where we need to adjust something to allow updates of the leaf nodes.

Yes, and we should always break here to restart the page walk for unlocked unmapping:

if (frag < shift || params->unlocked)

     break;
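
To make the effect concrete, here is a loose userspace toy (invented names; one leaf entry stands in for 2MB; not amdgpu code): the unlocked path clears a single entry per walk and restarts, while the locked path clears the whole range in one walk.

/* toy_walk.c - userspace illustration only, not amdgpu code */
#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

#define ENTRY_PAGES 512ULL	/* pages covered by one leaf entry (2MB of 4K pages) */

/* Update [*pos, end): clear whole entries while we can; in unlocked
 * mode stop after the first one so the caller restarts the walk.
 * Returns true once the whole range has been processed.
 */
static bool update_once(uint64_t *pos, uint64_t end, bool unlocked)
{
	while (*pos < end) {
		uint64_t frag_end = (*pos / ENTRY_PAGES + 1) * ENTRY_PAGES;

		if (frag_end > end)
			frag_end = end;
		printf("  clear [%llu, %llu)\n",
		       (unsigned long long)*pos, (unsigned long long)frag_end);
		*pos = frag_end;

		if (unlocked)
			return *pos >= end;	/* break out, re-walk before the next entry */
	}
	return true;
}

static void unmap(uint64_t start, uint64_t end, bool unlocked)
{
	uint64_t pos = start;
	int walks = 0;

	do {
		walks++;	/* each iteration stands for one page table walk */
	} while (!update_once(&pos, end, unlocked));

	printf("%s unmap needed %d walk(s)\n",
	       unlocked ? "unlocked" : "locked", walks);
}

int main(void)
{
	unmap(0, 4 * ENTRY_PAGES, false);	/* one walk clears the whole 8MB */
	unmap(0, 4 * ENTRY_PAGES, true);	/* re-walks after every 2MB entry */
	return 0;
}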


Alternatively I wouldn't mind having a completely separate function for unlocked invalidations.

A completely separate function would duplicate lots of page walker code. Alternatively, we could change amdgpu_vm_pte_fragment so that, for unlocked unmapping, it always returns a frag_end of 2MB (or 1GB), not 4MB, 8MB, ....
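
Something like this rough sketch, with a made-up natural_frag_end() standing in for the real fragment computation (userspace toy only, not a real patch):

/* toy_fragment.c - userspace sketch only, not a real patch */
#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

#define PAGES_2MB 512ULL	/* one 2MB leaf entry in 4K pages */

/* Stand-in for the power-of-two fragment the hardware could use here. */
static uint64_t natural_frag_end(uint64_t start, uint64_t end)
{
	uint64_t span = 1;

	/* grow the fragment while it stays aligned and inside [start, end) */
	while (!(start & (span * 2 - 1)) && start + span * 2 <= end)
		span *= 2;
	return start + span;
}

static uint64_t frag_end_for(uint64_t start, uint64_t end, bool unlocked)
{
	uint64_t frag_end = natural_frag_end(start, end);

	if (unlocked) {
		/* cap at the next 2MB boundary so only one leaf is touched */
		uint64_t entry_end = (start / PAGES_2MB + 1) * PAGES_2MB;

		if (frag_end > entry_end)
			frag_end = entry_end;
	}
	return frag_end;
}

int main(void)
{
	/* 8MB-aligned start, 8MB (2048 page) range */
	printf("locked   frag_end = %llu\n",
	       (unsigned long long)frag_end_for(0, 2048, false));	/* 2048 */
	printf("unlocked frag_end = %llu\n",
	       (unsigned long long)frag_end_for(0, 2048, true));	/* 512 */
	return 0;
}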

Regards,

Philip


Regards,
Christian.


          entry_end += cursor.pfn & ~(entry_end - 1);
          entry_end = min(entry_end, end);
 

