If vm_context_cntl set xnack on, then GPU vm fault has retry_fault bit set, but the driver select xnack on or off path depending on per process xnack setting which is also used to set qpd mem_config xnack on or off if KFD_SUPPORT_XNACK_PER_PROCESS. If process is xnack on, then GPU page fault show retry page fault message, otherwise show no-retry page fault message, to avoid misleading when debugging application page fault issue. The process lookup from pasid is done inside retry fault handler svm_range_restore_pages, add xnack_on parameter to pass process xnack setting back to amdgpu_vm_handle_fault and then to gmc interrupt handler to show vm fault message. Signed-off-by: Philip Yang <Philip.Yang@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 5 +++-- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 7 ++++--- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 4 +++- drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 2 +- 6 files changed, 13 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 64ddc87f7fb6..58f7ab193027 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2757,13 +2757,14 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) * GFX 9.4.3. * @addr: Address of the fault * @write_fault: true is write fault, false is read fault + * @xnack_on: return value, true if the process sets xnack on * * Try to gracefully handle a VM fault. Return true if the fault was handled and * shouldn't be reported any more. */ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, u32 vmid, u32 node_id, uint64_t addr, - bool write_fault) + bool write_fault, bool *xnack_on) { bool is_compute_context = false; struct amdgpu_bo *root; @@ -2788,7 +2789,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, addr /= AMDGPU_GPU_PAGE_SIZE; if (is_compute_context && !svm_range_restore_pages(adev, pasid, vmid, - node_id, addr, write_fault)) { + node_id, addr, write_fault, xnack_on)) { amdgpu_bo_unref(&root); return true; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index bc71b44387b2..7f364f0b9a60 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -549,7 +549,7 @@ void amdgpu_vm_put_task_info(struct amdgpu_task_info *task_info); bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, u32 vmid, u32 node_id, uint64_t addr, - bool write_fault); + bool write_fault, bool *xnack_on); void amdgpu_vm_set_task_info(struct amdgpu_vm *vm); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index d933e19e0cf5..2f0752376236 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -132,7 +132,7 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev, /* Try to handle the recoverable page faults by filling page * tables */ - if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, write_fault)) + if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, write_fault, NULL)) return 1; } diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 671a6766df5b..3db0f2304b6a 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -558,6 +558,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, uint32_t cam_index = 0; int ret, xcc_id = 0; uint32_t node_id; + bool xnack_on = false; node_id = entry->node_id; @@ -595,7 +596,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, cam_index = entry->src_data[2] & 0x3ff; ret = amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id, - addr, write_fault); + addr, write_fault, &xnack_on); WDOORBELL32(adev->irq.retry_cam_doorbell_index, cam_index); if (ret) return 1; @@ -618,7 +619,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, * tables */ if (amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id, - addr, write_fault)) + addr, write_fault, &xnack_on)) return 1; } } @@ -628,7 +629,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, dev_err(adev->dev, "[%s] %s page fault (src_id:%u ring:%u vmid:%u pasid:%u)\n", hub_name, - retry_fault ? "retry" : "no-retry", + (retry_fault && xnack_on) ? "retry" : "no-retry", entry->src_id, entry->ring_id, entry->vmid, entry->pasid); task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 234ea0fbfa0c..9d44a52bc4b2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -2887,7 +2887,7 @@ svm_fault_allowed(struct vm_area_struct *vma, bool write_fault) int svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, uint32_t vmid, uint32_t node_id, - uint64_t addr, bool write_fault) + uint64_t addr, bool write_fault, bool *xnack_on) { unsigned long start, last, size; struct mm_struct *mm = NULL; @@ -2923,6 +2923,8 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, goto out; } + if (xnack_on) + *xnack_on = p->xnack_enabled; if (!p->xnack_enabled) { pr_debug("XNACK not enabled for pasid 0x%x\n", pasid); r = -EFAULT; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h index 22f22b06a2f4..402f6fbb6452 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h @@ -182,7 +182,7 @@ int svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange, void svm_range_vram_node_free(struct svm_range *prange); int svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, uint32_t vmid, uint32_t node_id, uint64_t addr, - bool write_fault); + bool write_fault, bool *xnack_on); int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence); void svm_range_add_list_work(struct svm_range_list *svms, struct svm_range *prange, struct mm_struct *mm, -- 2.43.2