On 2021-11-18 at 10:00 a.m., philip yang wrote:
>
> On 2021-11-17 7:10 p.m., Felix Kuehling wrote:
>> On 2021-11-16 10:43 p.m., Philip Yang wrote:
>>> VMA may be removed before unmap notifier callback, restore pages take
>>> mmap write lock to lookup VMA to avoid race,
>>
>> The old code looked up the VMA after taking the mmap lock (either
>> read or write) and kept holding the lock afterwards. I think even
>> with your new code it's possible that the VMA disappears before you
>> take the lock the first time, so always taking the write lock only
>> reduces the time window in which things can go wrong, but it doesn't
>> remove the race.
>
> Taking the mmap write lock will serialize with __do_munmap, because
> __do_munmap runs with the mmap write lock held.

Taking the read lock should be sufficient to serialize with it.

Regards,
  Felix


> to ensure VMA removal and the unmap callback are done. Because the
> unmap callback sets the drain_retryfaults flag, we can safely drain
> the faults, and it is an application bug if the VMA is not found after
> taking the mmap write lock.
>
>> I still struggle to understand the race you're trying to fix. The
>> only time svm_restore_pages can see that the VMA is gone AND the
>> prange is gone is after the deferred worker has removed the prange.
>> But the fault draining in the deferred worker should prevent us from
>> ever seeing stale faults in that situation. That means, if no prange
>> is found and no VMA is found, it's definitely an application bug.
>>
>> The only possible race is the case where the prange still exists
>> but the VMA is gone (needed by svm_fault_allowed). We can treat that
>> as a special case where we just return success, because we know that
>> we're handling a stale fault for a VMA that's in the middle of being
>> unmapped. The fact that the prange still existed means that there
>> once was a valid mapping at the address but the deferred worker just
>> hasn't had a chance to clean it up yet.
>>
> Yes, that is the only possible race.
>
>> One more comment inline.
>>
>>
>>> and then create unregister
>>> new range and check VMA access permission, then downgrade to take mmap
>>> read lock to recover fault. Refactor code to avoid duplicate VMA
>>> lookup.
>>>
>>> Signed-off-by: Philip Yang <Philip.Yang@xxxxxxx>
>>> ---
>>>   drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 65 ++++++++++------------------
>>>   1 file changed, 24 insertions(+), 41 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>>> index c1f367934428..3eb0a9491755 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>>> @@ -2329,20 +2329,13 @@ svm_range_best_restore_location(struct svm_range *prange,
>>>   }
>>>
>>>   static int
>>> -svm_range_get_range_boundaries(struct kfd_process *p, int64_t addr,
>>> -                               unsigned long *start, unsigned long *last,
>>> -                               bool *is_heap_stack)
>>> +svm_range_get_range_boundaries(struct kfd_process *p, struct vm_area_struct *vma,
>>> +                               int64_t addr, unsigned long *start,
>>> +                               unsigned long *last, bool *is_heap_stack)
>>>   {
>>> -    struct vm_area_struct *vma;
>>>       struct interval_tree_node *node;
>>>       unsigned long start_limit, end_limit;
>>>
>>> -    vma = find_vma(p->mm, addr << PAGE_SHIFT);
>>> -    if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) {
>>> -        pr_debug("VMA does not exist in address [0x%llx]\n", addr);
>>> -        return -EFAULT;
>>> -    }
>>> -
>>>       *is_heap_stack = (vma->vm_start <= vma->vm_mm->brk &&
>>>                         vma->vm_end >= vma->vm_mm->start_brk) ||
>>>                        (vma->vm_start <= vma->vm_mm->start_stack &&
>>> @@ -2437,9 +2430,10 @@ svm_range_check_vm_userptr(struct kfd_process *p, uint64_t start, uint64_t last,
>>>
>>>   static struct
>>>   svm_range *svm_range_create_unregistered_range(struct amdgpu_device *adev,
>>> -                                               struct kfd_process *p,
>>> -                                               struct mm_struct *mm,
>>> -                                               int64_t addr)
>>> +                                               struct kfd_process *p,
>>> +                                               struct mm_struct *mm,
>>> +                                               struct vm_area_struct *vma,
>>> +                                               int64_t addr)
>>>   {
>>>       struct svm_range *prange = NULL;
>>>       unsigned long start, last;
>>> @@ -2449,7 +2443,7 @@ svm_range *svm_range_create_unregistered_range(struct amdgpu_device *adev,
>>>       uint64_t bo_l = 0;
>>>       int r;
>>>
>>> -    if (svm_range_get_range_boundaries(p, addr, &start, &last,
>>> +    if (svm_range_get_range_boundaries(p, vma, addr, &start, &last,
>>>                                          &is_heap_stack))
>>>           return NULL;
>>>
>>> @@ -2552,20 +2546,13 @@ svm_range_count_fault(struct amdgpu_device *adev, struct kfd_process *p,
>>>   }
>>>
>>>   static bool
>>> -svm_fault_allowed(struct mm_struct *mm, uint64_t addr, bool write_fault)
>>> +svm_fault_allowed(struct vm_area_struct *vma, bool write_fault)
>>>   {
>>>       unsigned long requested = VM_READ;
>>> -    struct vm_area_struct *vma;
>>>
>>>       if (write_fault)
>>>           requested |= VM_WRITE;
>>>
>>> -    vma = find_vma(mm, addr << PAGE_SHIFT);
>>> -    if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) {
>>> -        pr_debug("address 0x%llx VMA is removed\n", addr);
>>> -        return true;
>>> -    }
>>> -
>>>       pr_debug("requested 0x%lx, vma permission flags 0x%lx\n", requested,
>>>                vma->vm_flags);
>>>       return (vma->vm_flags & requested) == requested;
>>> @@ -2582,7 +2569,7 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
>>>       uint64_t timestamp;
>>>       int32_t best_loc;
>>>       int32_t gpuidx = MAX_GPU_INSTANCE;
>>> -    bool write_locked = false;
>>> +    struct vm_area_struct *vma = NULL;
>>>       int r = 0;
>>>
>>>       if (!KFD_IS_SVM_API_SUPPORTED(adev->kfd.dev)) {
>>> @@ -2606,26 +2593,22 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
>>>       /* mm is available because kfd_process_notifier_release drain fault */
>>>       mm = p->mm;
>>> +    mmap_write_lock(mm);
>>
>> Always taking the write lock is unnecessary.
>> I think we can keep the old strategy of retrying with the write lock
>> only when necessary. I think this should work correctly as long as
>> you look up the VMA every time after taking either the mmap read or
>> write lock. The VMA you looked up should be valid as long as you hold
>> that lock.
>>
>> As I pointed out above, if svm_range_from_addr finds a prange but the
>> VMA is missing, we can treat that as a special case and return
>> success (just draining a stale fault on a VMA that's being unmapped).
>
> OK, I will change svm_fault_allowed to return success if the VMA is
> missing. It is simpler to handle this special race case without
> taking the mmap write lock.
>
> Regards,
>
> Philip
>
>>
>> Regards,
>>   Felix
>>
>>
>>> +
>>> +    vma = find_vma(p->mm, addr << PAGE_SHIFT);
>>> +    if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) {
>>> +        pr_debug("VMA not found for address 0x%llx\n", addr);
>>> +        mmap_write_downgrade(mm);
>>> +        r = -EFAULT;
>>> +        goto out_unlock_mm;
>>> +    }
>>>
>>> -    mmap_read_lock(mm);
>>> -retry_write_locked:
>>>       mutex_lock(&svms->lock);
>>>       prange = svm_range_from_addr(svms, addr, NULL);
>>>       if (!prange) {
>>>           pr_debug("failed to find prange svms 0x%p address [0x%llx]\n",
>>>                    svms, addr);
>>> -        if (!write_locked) {
>>> -            /* Need the write lock to create new range with MMU notifier.
>>> -             * Also flush pending deferred work to make sure the interval
>>> -             * tree is up to date before we add a new range
>>> -             */
>>> -            mutex_unlock(&svms->lock);
>>> -            mmap_read_unlock(mm);
>>> -            mmap_write_lock(mm);
>>> -            write_locked = true;
>>> -            goto retry_write_locked;
>>> -        }
>>> -        prange = svm_range_create_unregistered_range(adev, p, mm, addr);
>>> +        prange = svm_range_create_unregistered_range(adev, p, mm, vma, addr);
>>>           if (!prange) {
>>>               pr_debug("failed to create unregistered range svms 0x%p address [0x%llx]\n",
>>>                        svms, addr);
>>> @@ -2634,8 +2617,8 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
>>>               goto out_unlock_svms;
>>>           }
>>>       }
>>> -    if (write_locked)
>>> -        mmap_write_downgrade(mm);
>>> +
>>> +    mmap_write_downgrade(mm);
>>>
>>>       mutex_lock(&prange->migrate_mutex);
>>>
>>> @@ -2652,7 +2635,7 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
>>>           goto out_unlock_range;
>>>       }
>>>
>>> -    if (!svm_fault_allowed(mm, addr, write_fault)) {
>>> +    if (!svm_fault_allowed(vma, write_fault)) {
>>>           pr_debug("fault addr 0x%llx no %s permission\n", addr,
>>>                    write_fault ? "write" : "read");
>>>           r = -EPERM;
>>> @@ -2704,10 +2687,10 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
>>>       mutex_unlock(&prange->migrate_mutex);
>>>   out_unlock_svms:
>>>       mutex_unlock(&svms->lock);
>>> +out_unlock_mm:
>>>       mmap_read_unlock(mm);
>>>
>>>       svm_range_count_fault(adev, p, gpuidx);
>>> -
>>>   out:
>>>       kfd_unref_process(p);
>>>
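
For reference, here is a minimal sketch of the direction agreed on above: keep svm_fault_allowed looking up the VMA itself (as in the code before this patch) and return success when the VMA is already gone, treating that as a stale fault on a range that is in the middle of being unmapped. This is only an illustration assembled from the removed lines in the quoted diff plus the special case discussed; it is not the actual v2 patch, and the final signature may differ.

static bool
svm_fault_allowed(struct mm_struct *mm, uint64_t addr, bool write_fault)
{
    unsigned long requested = VM_READ;
    struct vm_area_struct *vma;

    if (write_fault)
        requested |= VM_WRITE;

    /* Called with the mmap read lock held, so the VMA cannot change under us */
    vma = find_vma(mm, addr << PAGE_SHIFT);
    if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) {
        /*
         * The prange was found but the VMA is already gone: this is a
         * stale fault on a range being unmapped. Report success instead
         * of failing; the deferred worker will drain the remaining faults.
         */
        pr_debug("address 0x%llx VMA is removed\n", addr);
        return true;
    }

    pr_debug("requested 0x%lx, vma permission flags 0x%lx\n", requested,
             vma->vm_flags);
    return (vma->vm_flags & requested) == requested;
}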