When debugging VMC page fault and ring hang issues, detailed process information is quite helpful, especially when the issue can only be reproduced after running for a very long time. With this information, running only the specific sub-testcase may also reproduce the issue, which can save a lot of debugging time. With this patch, the process information is printed in a form similar to the following. When a VMC page fault occurs: [ 142.978417] amdgpu 0000:00:08.0: [gfxhub] VMC page fault (src_id:0 ring:171 vmid:2 pasid:32769), [ 142.978542] amdgpu 0000:00:08.0: for process ocltst pid 1354 thread ocltst pid 1354, args:./ocltst -m oclperf.so -t OCLPerfDeviceEnqueueEvent, [ 142.978693] amdgpu 0000:00:08.0: in page starting at address 0x0000000000000000 from 27 When a ring hang occurs: [ 1740.047122] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring comp_1.0.0 timeout, signaled seq=91571, emitted seq=91572 [ 1740.050167] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* The job's process information is as below: [ 1740.053160] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process SDMA pid 2098 thread SDMA pid 2098, cmd line:SDMA --mode goldimage_compare --offscreen --n-swapchain-images 3 --gpu 0 --frontend test_executor --n-test-threads 4 Signed-off-by: Trigger Huang <Trigger.Huang@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 3 --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 8 ++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 11 +++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 1 + drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 11 ++++++----- 5 files changed, 26 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index 1c49b82..1a2d0c9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -235,9 +235,6 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs p->job->uf_addr = uf_offset; kfree(chunk_array); - /* Use this opportunity to fill 
in task info for the vm */ - amdgpu_vm_set_task_info(vm); - return 0; free_all_kdata: diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index e0af44f..c75ecb3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -43,6 +43,14 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job) job->base.sched->name, atomic_read(&ring->fence_drv.last_seq), ring->fence_drv.sync_seq); + if (job->vm) { + struct amdgpu_task_info *ti = &(job->vm->task_info); + + DRM_ERROR("The job's process information is as below:\n"); + DRM_ERROR("Process %s, thread %s, cmd line:%s\n", + ti->process_name, ti->task_name, ti->cmd_line); + } + if (amdgpu_device_should_recover_gpu(ring->adev)) amdgpu_device_gpu_recover(ring->adev, job); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index e73d152..24f3cbd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -30,6 +30,7 @@ #include <linux/idr.h> #include <drm/drmP.h> #include <drm/amdgpu_drm.h> +#include <linux/string_helpers.h> #include "amdgpu.h" #include "amdgpu_trace.h" #include "amdgpu_amdkfd.h" @@ -3045,6 +3046,8 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, goto error_free_root; vm->pasid = pasid; + + amdgpu_vm_set_task_info(vm); } vm->fault_hash = init_fault_hash(); @@ -3223,6 +3226,9 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm) spin_lock_irqsave(&adev->vm_manager.pasid_lock, flags); idr_remove(&adev->vm_manager.pasid_idr, vm->pasid); spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags); + + kfree(vm->task_info.cmd_line); + vm->task_info.cmd_line = NULL; } kfree(vm->fault_hash); @@ -3391,6 +3397,11 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm) vm->task_info.tgid = current->group_leader->pid; get_task_comm(vm->task_info.process_name, current->group_leader); } + + vm->task_info.cmd_line 
= + kstrdup_quotable_cmdline(current, GFP_KERNEL); + if (!vm->task_info.cmd_line) + DRM_DEBUG_DRIVER("Failed to get cmdline!\n"); } } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index e8dcfd5..9fab787 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -175,6 +175,7 @@ struct amdgpu_vm_pte_funcs { struct amdgpu_task_info { char process_name[TASK_COMM_LEN]; char task_name[TASK_COMM_LEN]; + char *cmd_line; pid_t pid; pid_t tgid; }; diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index bacdaef..c3e3558 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -325,11 +325,12 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, amdgpu_vm_get_task_info(adev, entry->pasid, &task_info); dev_err(adev->dev, - "[%s] VMC page fault (src_id:%u ring:%u vmid:%u pasid:%u, for process %s pid %d thread %s pid %d)\n", - entry->vmid_src ? "mmhub" : "gfxhub", - entry->src_id, entry->ring_id, entry->vmid, - entry->pasid, task_info.process_name, task_info.tgid, - task_info.task_name, task_info.pid); + "[%s] VMC page fault (src_id:%u ring:%u vmid:%u pasid:%u), ", + entry->vmid_src ? "mmhub" : "gfxhub", entry->src_id, + entry->ring_id, entry->vmid, entry->pasid); + dev_err(adev->dev, "for process %s pid %d thread %s pid %d, args:%s,", + task_info.process_name, task_info.tgid, + task_info.task_name, task_info.pid, task_info.cmd_line); dev_err(adev->dev, " in page starting at address 0x%016llx from %d\n", addr, entry->client_id); if (!amdgpu_sriov_vf(adev)) -- 2.7.4 _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx