Walk the page table for the faulting address and dump PDEs and PTEs at all levels. Also flag discrepancies where a PDE points to a different address than the next-level PDB or PTB BO.

v2:
* Fix address shift for GFXv8
* Redo PDB/PTB address checking to work on all generations

Signed-off-by: Felix Kuehling <Felix.Kuehling@xxxxxxx>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 5 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 87 ++++++++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 7 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c | 6 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 5 +-
 6 files changed, 103 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index bbbf069efb77..78440748c87f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1505,9 +1505,8 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo, * This is used to access VRAM that backs a buffer object via MMIO * access for debugging purposes. 
*/ -static int amdgpu_ttm_access_memory(struct ttm_buffer_object *bo, - unsigned long offset, - void *buf, int len, int write) +int amdgpu_ttm_access_memory(struct ttm_buffer_object *bo, unsigned long offset, + void *buf, int len, int write) { struct amdgpu_bo *abo = ttm_to_amdgpu_bo(bo); struct amdgpu_device *adev = amdgpu_ttm_adev(abo->tbo.bdev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h index bccb8c49e597..cffbafffa9d7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h @@ -83,6 +83,8 @@ void amdgpu_ttm_fini(struct amdgpu_device *adev); void amdgpu_ttm_set_buffer_funcs_status(struct amdgpu_device *adev, bool enable); +int amdgpu_ttm_access_memory(struct ttm_buffer_object *bo, unsigned long offset, + void *buf, int len, int write); int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t src_offset, uint64_t dst_offset, uint32_t byte_count, struct reservation_object *resv, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 1951f2abbdbc..38a2f66cf095 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -544,6 +544,86 @@ static void amdgpu_vm_pt_next_dfs(struct amdgpu_device *adev, amdgpu_vm_pt_continue_dfs((start), (entry)); \ (entry) = (cursor).entry, amdgpu_vm_pt_next_dfs((adev), &(cursor))) +/** + * amdgpu_vm_dump_pte - dump PTEs along a page table walk + * + * @adev: amdgpu device pointer + * @vm: VM address space + * @addr: virtual address + * + * Walks the page table of @vm at the given @addr and prints the PDEs + * and PTEs along the way on a single line. 
+ */ +void amdgpu_vm_dump_pte(struct amdgpu_device *adev, struct amdgpu_vm *vm, + uint64_t addr) +{ + static const char *level_entry[4] = {"PDE2", "PDE1", "PDE0", "PTE"}; + static const char *level_block[4] = {"PDB2", "PDB1", "PDB0", "PTB"}; + struct amdgpu_vm_pt_cursor cursor; + uint64_t last_pde; + char buf[128]; + int i = 0; + + { + uint64_t last_flags = AMDGPU_PTE_VALID; + uint64_t last_addr = vm->root.base.bo->tbo.offset; + + amdgpu_gmc_get_vm_pde(adev, adev->vm_manager.root_level, + &last_addr, &last_flags); + last_pde = last_addr | last_flags; + } + + amdgpu_vm_pt_start(adev, vm, addr >> PAGE_SHIFT, &cursor); + + do { + unsigned int mask, shift, idx; + struct amdgpu_bo *bo; + uint64_t pte; + + mask = amdgpu_vm_entries_mask(adev, cursor.level); + shift = amdgpu_vm_level_shift(adev, cursor.level); + idx = (cursor.pfn >> shift) & mask; + + bo = cursor.entry->base.bo; + if (bo) { + uint64_t this_flags = AMDGPU_PTE_VALID; + uint64_t this_addr = bo->tbo.offset; + + /* Flag discrepancy between previous level PDE + * and the actual address of this PTB or PDB. + */ + amdgpu_gmc_get_vm_pde(adev, cursor.level, + &this_addr, &this_flags); + if ((this_addr | this_flags) != last_pde) + i += snprintf(buf + i, sizeof(buf) - i, "!"); + + amdgpu_ttm_access_memory(&bo->tbo, idx * sizeof(pte), + &pte, sizeof(pte), false); + i += snprintf(buf + i, sizeof(buf) - i, + "%s[%d]=0x%llx ", + level_entry[cursor.level], idx, pte); + last_pde = pte; + } else { + /* Flag discrepancy if previous level PDE had + * a valid entry but there is no PTB or PDB BO. 
+ */ + if ((last_pde & AMDGPU_PTE_VALID) && + !(last_pde & AMDGPU_PDE_PTE)) + i += snprintf(buf + i, sizeof(buf) - i, "!"); + i += snprintf(buf + i, sizeof(buf) - i, + "no %s ", level_block[cursor.level]); + last_pde = 0; + } + + ++cursor.level; + cursor.parent = cursor.entry; + if (!cursor.entry->entries) + break; + cursor.entry = &cursor.entry->entries[idx]; + } while (cursor.entry); + dev_err(adev->dev, "%s", buf); +} + /** * amdgpu_vm_get_pd_bo - add the VM PD to a validation list * @@ -3081,8 +3161,9 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) * @pasid: PASID identifier for VM * @task_info: task_info to fill. */ -void amdgpu_vm_get_task_info(struct amdgpu_device *adev, unsigned int pasid, - struct amdgpu_task_info *task_info) +struct amdgpu_vm *amdgpu_vm_get_task_info(struct amdgpu_device *adev, + unsigned int pasid, + struct amdgpu_task_info *task_info) { struct amdgpu_vm *vm; unsigned long flags; @@ -3094,6 +3175,8 @@ void amdgpu_vm_get_task_info(struct amdgpu_device *adev, unsigned int pasid, *task_info = vm->task_info; spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags); + + return vm; } /** diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 489a162ca620..6a8b833d180e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -348,6 +348,8 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm, unsigned int pasid); void amdgpu_vm_release_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm); void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm); +void amdgpu_vm_dump_pte(struct amdgpu_device *adev, struct amdgpu_vm *vm, + uint64_t addr); void amdgpu_vm_get_pd_bo(struct amdgpu_vm *vm, struct list_head *validated, struct amdgpu_bo_list_entry *entry); @@ -401,8 +403,9 @@ bool amdgpu_vm_need_pipeline_sync(struct 
amdgpu_ring *ring, struct amdgpu_job *job); void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev); -void amdgpu_vm_get_task_info(struct amdgpu_device *adev, unsigned int pasid, - struct amdgpu_task_info *task_info); +struct amdgpu_vm *amdgpu_vm_get_task_info(struct amdgpu_device *adev, + unsigned int pasid, + struct amdgpu_task_info *task_info); void amdgpu_vm_set_task_info(struct amdgpu_vm *vm); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c index 8bf2ba310fd9..18207ecfd85c 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c @@ -1448,9 +1448,10 @@ static int gmc_v8_0_process_interrupt(struct amdgpu_device *adev, if (printk_ratelimit()) { struct amdgpu_task_info task_info; + struct amdgpu_vm *vm; memset(&task_info, 0, sizeof(struct amdgpu_task_info)); - amdgpu_vm_get_task_info(adev, entry->pasid, &task_info); + vm = amdgpu_vm_get_task_info(adev, entry->pasid, &task_info); dev_err(adev->dev, "GPU fault detected: %d 0x%08x for process %s pid %d thread %s pid %d\n", entry->src_id, entry->src_data[0], task_info.process_name, @@ -1461,6 +1462,9 @@ static int gmc_v8_0_process_interrupt(struct amdgpu_device *adev, status); gmc_v8_0_vm_decode_fault(adev, status, addr, mc_client, entry->pasid); + if (vm) + amdgpu_vm_dump_pte(adev, vm, (uint64_t)addr + << AMDGPU_GPU_PAGE_SHIFT); } vmid = REG_GET_FIELD(status, VM_CONTEXT1_PROTECTION_FAULT_STATUS, diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index bd5d36944481..f27e88af4016 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -331,9 +331,10 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, if (printk_ratelimit()) { struct amdgpu_task_info task_info; + struct amdgpu_vm *vm; memset(&task_info, 0, sizeof(struct amdgpu_task_info)); - amdgpu_vm_get_task_info(adev, entry->pasid, &task_info); + vm = amdgpu_vm_get_task_info(adev, 
entry->pasid, &task_info); dev_err(adev->dev, "[%s] %s page fault (src_id:%u ring:%u vmid:%u " @@ -349,6 +350,8 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, dev_err(adev->dev, "VM_L2_PROTECTION_FAULT_STATUS:0x%08X\n", status); + if (vm) + amdgpu_vm_dump_pte(adev, vm, addr); } return 0; -- 2.17.1 _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx