On 2021-11-18 at 10:00 a.m., philip yang wrote:
>
> On 2021-11-17 7:10 p.m., Felix Kuehling wrote:
>> On 2021-11-16 10:43 p.m., Philip Yang wrote:
>>> VMA may be removed before unmap notifier callback, restore pages take
>>> mmap write lock to lookup VMA to avoid race,
>>
>> The old code looked up the VMA after taking the mmap lock (either
>> read or write) and kept holding the lock afterwards. I think even
>> with your new code it's possible that the VMA disappears before you
>> take the lock the first time, so always taking the write lock only
>> reduces the time window in which things can go wrong, but it doesn't
>> remove the race.
>
> Taking the mmap write lock will serialize with __do_munmap, because
> __do_munmap runs with the mmap write lock held.

Taking the read lock should be sufficient to serialize with it.

Regards,
  Felix


> to ensure VMA removal and the unmap callback are done. Because the
> unmap callback sets the drain_retryfaults flag, we can safely drain
> the faults, and it is an application bug if the VMA is not found after
> taking the mmap write lock.
>
>> I still struggle to understand the race you're trying to fix. The
>> only time svm_restore_pages can see that the VMA is gone AND the
>> prange is gone is after the deferred worker has removed the prange.
>> But the fault draining in the deferred worker should prevent us from
>> ever seeing stale faults in that situation. That means, if no prange
>> is found and no VMA is found, it's definitely an application bug.
>>
>> The only possible race is the case where the prange still exists
>> but the VMA is gone (needed by svm_fault_allowed). We can treat that
>> as a special case where we just return success, because we know that
>> we're handling a stale fault for a VMA that's in the middle of being
>> unmapped. The fact that the prange still existed means that there
>> once was a valid mapping at the address but the deferred worker just
>> hasn't had a chance to clean it up yet.
>>
> Yes, that is the only possible race.
>
>> One more comment inline.
>>
>>
>>> and then create unregister
>>> new range and check VMA access permission, then downgrade to take mmap
>>> read lock to recover fault. Refactor code to avoid duplicate VMA
>>> lookup.
>>>
>>> Signed-off-by: Philip Yang <Philip.Yang@xxxxxxx>
>>> ---
>>>   drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 65 ++++++++++------------------
>>>   1 file changed, 24 insertions(+), 41 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>>> index c1f367934428..3eb0a9491755 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>>> @@ -2329,20 +2329,13 @@ svm_range_best_restore_location(struct svm_range *prange,
>>>   }
>>>
>>>   static int
>>> -svm_range_get_range_boundaries(struct kfd_process *p, int64_t addr,
>>> -                               unsigned long *start, unsigned long *last,
>>> -                               bool *is_heap_stack)
>>> +svm_range_get_range_boundaries(struct kfd_process *p, struct vm_area_struct *vma,
>>> +                               int64_t addr, unsigned long *start,
>>> +                               unsigned long *last, bool *is_heap_stack)
>>>   {
>>> -    struct vm_area_struct *vma;
>>>       struct interval_tree_node *node;
>>>       unsigned long start_limit, end_limit;
>>>
>>> -    vma = find_vma(p->mm, addr << PAGE_SHIFT);
>>> -    if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) {
>>> -        pr_debug("VMA does not exist in address [0x%llx]\n", addr);
>>> -        return -EFAULT;
>>> -    }
>>> -
>>>       *is_heap_stack = (vma->vm_start <= vma->vm_mm->brk &&
>>>                         vma->vm_end >= vma->vm_mm->start_brk) ||
>>>                        (vma->vm_start <= vma->vm_mm->start_stack &&
>>> @@ -2437,9 +2430,10 @@ svm_range_check_vm_userptr(struct kfd_process *p, uint64_t start, uint64_t last,
>>>
>>>   static struct
>>>   svm_range *svm_range_create_unregistered_range(struct amdgpu_device *adev,
>>> -                                               struct kfd_process *p,
>>> -                                               struct mm_struct *mm,
>>> -                                               int64_t addr)
>>> +                                               struct kfd_process *p,
>>> +                                               struct mm_struct *mm,
>>> +                                               struct vm_area_struct *vma,
>>> +                                               int64_t addr)
>>>   {
>>>       struct svm_range *prange = NULL;
>>>       unsigned long start, last;
>>> @@ -2449,7 +2443,7 @@ svm_range *svm_range_create_unregistered_range(struct amdgpu_device *adev,
>>>       uint64_t bo_l = 0;
>>>       int r;
>>>
>>> -    if (svm_range_get_range_boundaries(p, addr, &start, &last,
>>> +    if (svm_range_get_range_boundaries(p, vma, addr, &start, &last,
>>>                                          &is_heap_stack))
>>>           return NULL;
>>>
>>> @@ -2552,20 +2546,13 @@ svm_range_count_fault(struct amdgpu_device *adev, struct kfd_process *p,
>>>   }
>>>
>>>   static bool
>>> -svm_fault_allowed(struct mm_struct *mm, uint64_t addr, bool write_fault)
>>> +svm_fault_allowed(struct vm_area_struct *vma, bool write_fault)
>>>   {
>>>       unsigned long requested = VM_READ;
>>> -    struct vm_area_struct *vma;
>>>
>>>       if (write_fault)
>>>           requested |= VM_WRITE;
>>>
>>> -    vma = find_vma(mm, addr << PAGE_SHIFT);
>>> -    if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) {
>>> -        pr_debug("address 0x%llx VMA is removed\n", addr);
>>> -        return true;
>>> -    }
>>> -
>>>       pr_debug("requested 0x%lx, vma permission flags 0x%lx\n", requested,
>>>                vma->vm_flags);
>>>       return (vma->vm_flags & requested) == requested;
>>> @@ -2582,7 +2569,7 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
>>>       uint64_t timestamp;
>>>       int32_t best_loc;
>>>       int32_t gpuidx = MAX_GPU_INSTANCE;
>>> -    bool write_locked = false;
>>> +    struct vm_area_struct *vma = NULL;
>>>       int r = 0;
>>>
>>>       if (!KFD_IS_SVM_API_SUPPORTED(adev->kfd.dev)) {
>>> @@ -2606,26 +2593,22 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
>>>       /* mm is available because kfd_process_notifier_release drain fault */
>>>       mm = p->mm;
>>> +    mmap_write_lock(mm);
>>
>> Always taking the write lock is unnecessary.
>> I think we can keep the old strategy of retrying with the write lock
>> only when necessary. I think this should work correctly as long as
>> you look up the VMA every time after taking either the mmap read or
>> write lock. The VMA you looked up should be valid as long as you hold
>> that lock.
>>
>> As I pointed out above, if svm_range_from_addr finds a prange but the
>> VMA is missing, we can treat that as a special case and return
>> success (just draining a stale fault on a VMA that's being unmapped).
>
> OK, I will change svm_fault_allowed to return success if the VMA is
> missing. It is simpler to handle this special race case without
> taking the mmap write lock.
>
> Regards,
>
> Philip
>
>>
>> Regards,
>>   Felix
>>
>>
>>> +
>>> +    vma = find_vma(p->mm, addr << PAGE_SHIFT);
>>> +    if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) {
>>> +        pr_debug("VMA not found for address 0x%llx\n", addr);
>>> +        mmap_write_downgrade(mm);
>>> +        r = -EFAULT;
>>> +        goto out_unlock_mm;
>>> +    }
>>>
>>> -    mmap_read_lock(mm);
>>> -retry_write_locked:
>>>       mutex_lock(&svms->lock);
>>>       prange = svm_range_from_addr(svms, addr, NULL);
>>>       if (!prange) {
>>>           pr_debug("failed to find prange svms 0x%p address [0x%llx]\n",
>>>                    svms, addr);
>>> -        if (!write_locked) {
>>> -            /* Need the write lock to create new range with MMU notifier.
>>> -             * Also flush pending deferred work to make sure the interval
>>> -             * tree is up to date before we add a new range
>>> -             */
>>> -            mutex_unlock(&svms->lock);
>>> -            mmap_read_unlock(mm);
>>> -            mmap_write_lock(mm);
>>> -            write_locked = true;
>>> -            goto retry_write_locked;
>>> -        }
>>> -        prange = svm_range_create_unregistered_range(adev, p, mm, addr);
>>> +        prange = svm_range_create_unregistered_range(adev, p, mm, vma, addr);
>>>           if (!prange) {
>>>               pr_debug("failed to create unregistered range svms 0x%p address [0x%llx]\n",
>>>                        svms, addr);
>>> @@ -2634,8 +2617,8 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
>>>               goto out_unlock_svms;
>>>           }
>>>       }
>>> -    if (write_locked)
>>> -        mmap_write_downgrade(mm);
>>> +
>>> +    mmap_write_downgrade(mm);
>>>
>>>       mutex_lock(&prange->migrate_mutex);
>>>
>>> @@ -2652,7 +2635,7 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
>>>           goto out_unlock_range;
>>>       }
>>>
>>> -    if (!svm_fault_allowed(mm, addr, write_fault)) {
>>> +    if (!svm_fault_allowed(vma, write_fault)) {
>>>           pr_debug("fault addr 0x%llx no %s permission\n", addr,
>>>                    write_fault ? "write" : "read");
>>>           r = -EPERM;
>>> @@ -2704,10 +2687,10 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
>>>       mutex_unlock(&prange->migrate_mutex);
>>>   out_unlock_svms:
>>>       mutex_unlock(&svms->lock);
>>> +out_unlock_mm:
>>>       mmap_read_unlock(mm);
>>>
>>>       svm_range_count_fault(adev, p, gpuidx);
>>> -
>>>   out:
>>>       kfd_unref_process(p);
>>>
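
For reference, here is a minimal sketch of the direction agreed on above: keep svm_fault_allowed looking up the VMA itself (as in the code before this patch) and return success when the VMA is already gone, treating that as a stale fault on a range that is in the middle of being unmapped. This is only an illustration assembled from the removed lines in the quoted diff plus the special case discussed; it is not the actual v2 patch, and the final signature may differ.

static bool
svm_fault_allowed(struct mm_struct *mm, uint64_t addr, bool write_fault)
{
    unsigned long requested = VM_READ;
    struct vm_area_struct *vma;

    if (write_fault)
        requested |= VM_WRITE;

    /* Called with the mmap read lock held, so the VMA cannot change under us */
    vma = find_vma(mm, addr << PAGE_SHIFT);
    if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) {
        /*
         * The prange was found but the VMA is already gone: this is a
         * stale fault on a range being unmapped. Report success instead
         * of failing; the deferred worker will drain the remaining faults.
         */
        pr_debug("address 0x%llx VMA is removed\n", addr);
        return true;
    }

    pr_debug("requested 0x%lx, vma permission flags 0x%lx\n", requested,
             vma->vm_flags);
    return (vma->vm_flags & requested) == requested;
}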