Next step towards HMM support. For now just silence the retry fault and optionally redirect the request to the dummy page. v2: make sure the VM is not destroyed while we handle the fault. v3: fix VM destroy check, cleanup comments Signed-off-by: Christian König <christian.koenig@xxxxxxx> Reviewed-by: Felix Kuehling <Felix.Kuehling@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 73 ++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 + drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 4 ++ 3 files changed, 79 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index db13ddd70a8d..48349e4f0701 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -3125,3 +3125,76 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm) } } } + +/** + * amdgpu_vm_handle_fault - graceful handling of VM faults. + * @adev: amdgpu device pointer + * @pasid: PASID of the VM + * @addr: Address of the fault + * + * Try to gracefully handle a VM fault. Return true if the fault was handled and + * shouldn't be reported any more. + */ +bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, unsigned int pasid, + uint64_t addr) +{ + struct amdgpu_bo *root; + uint64_t value, flags; + struct amdgpu_vm *vm; + long r; + + spin_lock(&adev->vm_manager.pasid_lock); + vm = idr_find(&adev->vm_manager.pasid_idr, pasid); + if (vm) + root = amdgpu_bo_ref(vm->root.base.bo); + else + root = NULL; + spin_unlock(&adev->vm_manager.pasid_lock); + + if (!root) + return false; + + r = amdgpu_bo_reserve(root, true); + if (r) + goto error_unref; + + /* Double check that the VM still exists */ + spin_lock(&adev->vm_manager.pasid_lock); + vm = idr_find(&adev->vm_manager.pasid_idr, pasid); + if (vm && vm->root.base.bo != root) + vm = NULL; + spin_unlock(&adev->vm_manager.pasid_lock); + if (!vm) + goto error_unlock; + + addr /= AMDGPU_GPU_PAGE_SIZE; + flags = AMDGPU_PTE_VALID | AMDGPU_PTE_SNOOPED | + AMDGPU_PTE_SYSTEM; + + if (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_NEVER) { + /* Redirect the access to the dummy page */ + value = adev->dummy_page_addr; + flags |= AMDGPU_PTE_EXECUTABLE | AMDGPU_PTE_READABLE | + AMDGPU_PTE_WRITEABLE; + } else { + /* Let the hw retry silently on the PTE */ + value = 0; + } + + r = amdgpu_vm_bo_update_mapping(adev, vm, true, NULL, addr, addr + 1, + flags, value, NULL, NULL); + if (r) + goto error_unlock; + + r = amdgpu_vm_update_pdes(adev, vm, true); + +error_unlock: + amdgpu_bo_unreserve(root); + if (r < 0) + DRM_ERROR("Can't handle page fault (%ld)\n", r); + +error_unref: + amdgpu_bo_unref(&root); + + return false; +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 0a97dc839f3b..4dbbe1b6b413 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -413,6 +413,8 @@ void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev); void amdgpu_vm_get_task_info(struct amdgpu_device *adev, unsigned int pasid, struct amdgpu_task_info *task_info); +bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, unsigned int pasid, + uint64_t addr); void amdgpu_vm_set_task_info(struct amdgpu_vm *vm); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 0d06c7901d26..ad19c8194a73 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -375,6 +375,10 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, } /* If it's the first fault for this address, process it normally */ + if (retry_fault && !in_interrupt() && + amdgpu_vm_handle_fault(adev, entry->pasid, addr)) + return 1; /* This also prevents sending it to KFD */ + if (!amdgpu_sriov_vf(adev)) { /* * Issue a dummy read to wait for the status register to -- 2.17.1 _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx