On 2021-07-06 9:08 p.m., Felix Kuehling
wrote:
Am 2021-07-06 um 11:36 a.m. schrieb Colin Ian King:Hi, Static analysis with Coverity on linux-next has found a potential null pointer dereference in function svm_range_restore_pages in drivers/gpu/drm/amd/amdkfd/kfd_svm.c from the following commit: commit d4ebc2007040a0aff01bfe1b194085d3867328fd Author: Philip Yang <Philip.Yang@xxxxxxx> Date: Tue Jun 22 00:12:32 2021 -0400 drm/amdkfd: implement counters for vm fault and migration The analysis is as follows:Thanks. Philip, see inline ...2397 int 2398 svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, 2399 uint64_t addr) 2400{ 2401 struct mm_struct *mm = NULL; 2402 struct svm_range_list *svms; 2403 struct svm_range *prange; 2404 struct kfd_process *p; 2405 uint64_t timestamp; 2406 int32_t best_loc; 2407 int32_t gpuidx = MAX_GPU_INSTANCE; 2408 bool write_locked = false; 2409 int r = 0; 2410 1. Condition !(adev->kfd.dev->pgmap.type != 0), taking false branch. 2411 if (!KFD_IS_SVM_API_SUPPORTED(adev->kfd.dev)) { 2412 pr_debug("device does not support SVM\n"); 2413 return -EFAULT; 2414 } 2415 2416 p = kfd_lookup_process_by_pasid(pasid); 2. Condition !p, taking false branch. 2417 if (!p) { 2418 pr_debug("kfd process not founded pasid 0x%x\n", pasid); 2419 return -ESRCH; 2420 } 3. Condition !p->xnack_enabled, taking false branch. 2421 if (!p->xnack_enabled) { 2422 pr_debug("XNACK not enabled for pasid 0x%x\n", pasid); 2423 return -EFAULT; 2424 } 2425 svms = &p->svms; 2426 4. Condition 0 /* __builtin_types_compatible_p() */, taking false branch. 5. Condition 1 /* __builtin_types_compatible_p() */, taking true branch. 6. Falling through to end of if statement. 7. Condition !!branch, taking false branch. 8. Condition ({...; !!branch;}), taking false branch. 2427 pr_debug("restoring svms 0x%p fault address 0x%llx\n", svms, addr); 2428 2429 mm = get_task_mm(p->lead_thread); 9. Condition !mm, taking false branch. 2430 if (!mm) { 2431 pr_debug("svms 0x%p failed to get mm\n", svms); 2432 r = -ESRCH; 2433 goto out; 2434 } 2435 2436 mmap_read_lock(mm); 2437retry_write_locked: 2438 mutex_lock(&svms->lock); 2439 prange = svm_range_from_addr(svms, addr, NULL); 10. Condition !prange, taking true branch. 18. Condition !prange, taking true branch. 2440 if (!prange) { 11. Condition 0 /* __builtin_types_compatible_p() */, taking false branch. 12. Condition 1 /* __builtin_types_compatible_p() */, taking true branch. 13. Falling through to end of if statement. 14. Condition !!branch, taking false branch. 15. Condition ({...; !!branch;}), taking false branch. 19. Condition 0 /* __builtin_types_compatible_p() */, taking false branch. 20. Condition 1 /* __builtin_types_compatible_p() */, taking true branch. 21. Falling through to end of if statement. 22. Condition !!branch, taking false branch. 23. Condition ({...; !!branch;}), taking false branch. 2441 pr_debug("failed to find prange svms 0x%p address [0x%llx]\n", 2442 svms, addr); 16. Condition !write_locked, taking true branch. 24. Condition !write_locked, taking false branch. 2443 if (!write_locked) { 2444 /* Need the write lock to create new range with MMU notifier. 2445 * Also flush pending deferred work to make sure the interval 2446 * tree is up to date before we add a new range 2447 */ 2448 mutex_unlock(&svms->lock); 2449 mmap_read_unlock(mm); 2450 mmap_write_lock(mm); 2451 write_locked = true; 17. Jumping to label retry_write_locked. 2452 goto retry_write_locked; 2453 } 2454 prange = svm_range_create_unregistered_range(adev, p, mm, addr); 25. Condition !prange, taking true branch. 26. var_compare_op: Comparing prange to null implies that prange might be null. 2455 if (!prange) { 27. Condition 0 /* __builtin_types_compatible_p() */, taking false branch. 28. Condition 1 /* __builtin_types_compatible_p() */, taking true branch. 29. Falling through to end of if statement. 30. Condition !!branch, taking false branch. 31. Condition ({...; !!branch;}), taking false branch. 2456 pr_debug("failed to create unregistered range svms 0x%p address [0x%llx]\n", 2457 svms, addr); 2458 mmap_write_downgrade(mm); 2459 r = -EFAULT; 32. Jumping to label out_unlock_svms. 2460 goto out_unlock_svms; 2461 } 2462 } 2463 if (write_locked) 2464 mmap_write_downgrade(mm); 2465 2466 mutex_lock(&prange->migrate_mutex); 2467 2468 if (svm_range_skip_recover(prange)) { 2469 amdgpu_gmc_filter_faults_remove(adev, addr, pasid); 2470 goto out_unlock_range; 2471 } 2472 2473 timestamp = ktime_to_us(ktime_get()) - prange->validate_timestamp; 2474 /* skip duplicate vm fault on different pages of same range */ 2475 if (timestamp < AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING) { 2476 pr_debug("svms 0x%p [0x%lx %lx] already restored\n", 2477 svms, prange->start, prange->last); 2478 goto out_unlock_range; 2479 } 2480 2481 best_loc = svm_range_best_restore_location(prange, adev, &gpuidx); 2482 if (best_loc == -1) { 2483 pr_debug("svms %p failed get best restore loc [0x%lx 0x%lx]\n", 2484 svms, prange->start, prange->last); 2485 r = -EACCES; 2486 goto out_unlock_range; 2487 } 2488 2489 pr_debug("svms %p [0x%lx 0x%lx] best restore 0x%x, actual loc 0x%x\n", 2490 svms, prange->start, prange->last, best_loc, 2491 prange->actual_loc); 2492 2493 if (prange->actual_loc != best_loc) { 2494 if (best_loc) { 2495 r = svm_migrate_to_vram(prange, best_loc, mm); 2496 if (r) { 2497 pr_debug("svm_migrate_to_vram failed (%d) at %llx, falling back to system memory\n", 2498 r, addr); 2499 /* Fallback to system memory if migration to 2500 * VRAM failed 2501 */ 2502 if (prange->actual_loc) 2503 r = svm_migrate_vram_to_ram(prange, mm); 2504 else 2505 r = 0; 2506 } 2507 } else { 2508 r = svm_migrate_vram_to_ram(prange, mm); 2509 } 2510 if (r) { 2511 pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n", 2512 r, svms, prange->start, prange->last); 2513 goto out_unlock_range; 2514 } 2515 } 2516 2517 r = svm_range_validate_and_map(mm, prange, gpuidx, false, false); 2518 if (r) 2519 pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n", 2520 r, svms, prange->start, prange->last); 2521 2522out_unlock_range: 2523 mutex_unlock(&prange->migrate_mutex); 2524out_unlock_svms: 2525 mutex_unlock(&svms->lock); 2526 mmap_read_unlock(mm); 2527 Dereference after null check (FORWARD_NULL) 33. var_deref_model: Passing null pointer prange to svm_range_count_fault, which dereferences it. 2528 svm_range_count_fault(adev, p, prange, gpuidx);Looks like you need to add a NULL check for prange here.
yes, prange will be NULL if fault is on invalid virtual address, we still need report and count the fault. I will send out patch to fix this case.
Thanks,
Philip
Regards, FelixThe jump in line 2460 to out_unlock_svms will occur if prange is null, however, calling svm_range_count_fault with a null prange will cause a null pointer deference when it calls svm_range_get_pdd_by_adev and dereferences the pointer as follows: p = container_of(prange->svms, struct kfd_process, svms); Colin