+ return -ENOMEM;
+ }
+
+ kref_init(&vm->task_info->refcount);
+ return 0;
}
/**
@@ -2536,17 +2642,23 @@ void amdgpu_vm_get_task_info(struct
amdgpu_device *adev, u32 pasid,
*/
void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
{
- if (vm->task_info.pid)
+ if (!vm->task_info) {
+ if (amdgpu_vm_create_task_info(vm))
+ /* OOM */
+ return;
+ }
+
+ if (vm->task_info->pid == current->pid)
return;
- vm->task_info.pid = current->pid;
- get_task_comm(vm->task_info.task_name, current);
+ vm->task_info->pid = current->pid;
+ get_task_comm(vm->task_info->task_name, current);
if (current->group_leader->mm != current->mm)
return;
- vm->task_info.tgid = current->group_leader->pid;
- get_task_comm(vm->task_info.process_name, current->group_leader);
+ vm->task_info->tgid = current->group_leader->pid;
+ get_task_comm(vm->task_info->process_name, current->group_leader);
}
/**
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 2cd86d2bf73f..c693772f8942 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -190,10 +190,11 @@ struct amdgpu_vm_pte_funcs {
};
struct amdgpu_task_info {
- char process_name[TASK_COMM_LEN];
- char task_name[TASK_COMM_LEN];
- pid_t pid;
- pid_t tgid;
+ char process_name[TASK_COMM_LEN];
+ char task_name[TASK_COMM_LEN];
+ pid_t pid;
+ pid_t tgid;
+ struct kref refcount;
};
/**
@@ -356,7 +357,7 @@ struct amdgpu_vm {
uint64_t pd_phys_addr;
/* Some basic info about the task */
- struct amdgpu_task_info task_info;
+ struct amdgpu_task_info *task_info;
/* Store positions of group of BOs */
struct ttm_lru_bulk_move lru_bulk_move;
@@ -492,8 +493,19 @@ bool amdgpu_vm_need_pipeline_sync(struct
amdgpu_ring *ring,
struct amdgpu_job *job);
void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev);
-void amdgpu_vm_get_task_info(struct amdgpu_device *adev, u32 pasid,
- struct amdgpu_task_info *task_info);
+struct amdgpu_task_info *
+amdgpu_vm_get_task_info_pasid(struct amdgpu_device *adev, u32 pasid);
+
+void amdgpu_vm_put_task_info_pasid(struct amdgpu_device *adev,
+ struct amdgpu_task_info *task_info,
+ u32 pasid);
+
+struct amdgpu_task_info *
+amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm);
+
+void amdgpu_vm_put_task_info_vm(struct amdgpu_task_info *task_info,
+ struct amdgpu_vm *vm);
+
bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
u32 vmid, u32 node_id, uint64_t addr,
bool write_fault);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
index a160265ddc07..d9e895cb0c10 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
@@ -1027,7 +1027,7 @@ int amdgpu_vm_ptes_update(struct
amdgpu_vm_update_params *params,
trace_amdgpu_vm_update_ptes(params, frag_start, upd_end,
min(nptes, 32u), dst, incr,
upd_flags,
- vm->task_info.tgid,
+ vm->task_info ? vm->task_info->tgid : 0,
vm->immediate.fence_context);
amdgpu_vm_pte_update_flags(params, to_amdgpu_bo_vm(pt),
cursor.level, pe_start, dst,
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index a5a05c16c10d..e00b2a53b192 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -105,7 +105,7 @@ static int gmc_v10_0_process_interrupt(struct
amdgpu_device *adev,
struct amdgpu_vmhub *hub = &adev->vmhub[vmhub_index];
bool retry_fault = !!(entry->src_data[1] & 0x80);
bool write_fault = !!(entry->src_data[1] & 0x20);
- struct amdgpu_task_info task_info;
+ struct amdgpu_task_info *task_info;
uint32_t status = 0;
u64 addr;
@@ -157,18 +157,26 @@ static int gmc_v10_0_process_interrupt(struct
amdgpu_device *adev,
if (!printk_ratelimit())
return 0;
- memset(&task_info, 0, sizeof(struct amdgpu_task_info));
- amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
+ task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
+ if (task_info) {
+ dev_err(adev->dev,
+ "[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u,
for process %s pid %d thread %s pid %d)\n",
+ entry->vmid_src ? "mmhub" : "gfxhub",
+ entry->src_id, entry->ring_id, entry->vmid,
+ entry->pasid, task_info->process_name, task_info->tgid,
+ task_info->task_name, task_info->pid);
+ amdgpu_vm_put_task_info_pasid(adev, task_info, entry->pasid);
+ } else {
+ dev_err(adev->dev,
+ "[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u, no
process info)\n",
+ entry->vmid_src ? "mmhub" : "gfxhub",
+ entry->src_id, entry->ring_id, entry->vmid,
+ entry->pasid);
Can we avoid the duplication here? It's too easy for them to get out
of sync. I think it's OK to change the message format so that the
process into is printed on a separate line. E.g.:
dev_err(adev->dev,
- "[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u, for
process %s pid %d thread %s pid %d)\n",
+ "[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u)\n",
entry->vmid_src ? "mmhub" : "gfxhub",
entry->src_id, entry->ring_id, entry->vmid,
- entry->pasid, task_info.process_name, task_info.tgid,
- task_info.task_name, task_info.pid);
+ entry->pasid);
+ if (task_info) {
+ dev_err(adev->dev, " in process %s pid %d thread %s pid %d\n",
+ task_info.process_name, task_info.tgid,
+ task_info.task_name, task_info.pid);
+ }
+ }
- dev_err(adev->dev,
- "[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u, for
process %s pid %d thread %s pid %d)\n",
- entry->vmid_src ? "mmhub" : "gfxhub",
- entry->src_id, entry->ring_id, entry->vmid,
- entry->pasid, task_info.process_name, task_info.tgid,
- task_info.task_name, task_info.pid);
dev_err(adev->dev, " in page starting at address 0x%016llx
from client 0x%x (%s)\n",
- addr, entry->client_id,
- soc15_ih_clientid_name[entry->client_id]);
+ addr, entry->client_id,
+ soc15_ih_clientid_name[entry->client_id]);
if (!amdgpu_sriov_vf(adev))
hub->vmhub_funcs->print_l2_protection_fault_status(adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
index 23d7b548d13f..3dda6c310729 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -126,19 +126,28 @@ static int gmc_v11_0_process_interrupt(struct
amdgpu_device *adev,
}
if (printk_ratelimit()) {
- struct amdgpu_task_info task_info;
-
- memset(&task_info, 0, sizeof(struct amdgpu_task_info));
- amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
+ struct amdgpu_task_info *task_info;
+
+ task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
+ if (task_info) {
+ dev_err(adev->dev,
+ "[%s] page fault (src_id:%u ring:%u vmid:%u
pasid:%u, for process %s pid %d thread %s pid %d)\n",
+ entry->vmid_src ? "mmhub" : "gfxhub",
+ entry->src_id, entry->ring_id, entry->vmid,
+ entry->pasid, task_info->process_name, task_info->tgid,
+ task_info->task_name, task_info->pid);
+ amdgpu_vm_put_task_info_pasid(adev, task_info,
entry->pasid);
+ } else {
+ dev_err(adev->dev,
+ "[%s] page fault (src_id:%u ring:%u vmid:%u
pasid:%u, no process info)\n",
+ entry->vmid_src ? "mmhub" : "gfxhub",
+ entry->src_id, entry->ring_id, entry->vmid,
+ entry->pasid);
+ }
Same as above.
- dev_err(adev->dev,
- "[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u,
for process %s pid %d thread %s pid %d)\n",
- entry->vmid_src ? "mmhub" : "gfxhub",
- entry->src_id, entry->ring_id, entry->vmid,
- entry->pasid, task_info.process_name, task_info.tgid,
- task_info.task_name, task_info.pid);
dev_err(adev->dev, " in page starting at address 0x%016llx
from client %d\n",
- addr, entry->client_id);
+ addr, entry->client_id);
+
if (!amdgpu_sriov_vf(adev))
hub->vmhub_funcs->print_l2_protection_fault_status(adev, status);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
index ff4ae73d27ec..aa3887c3bb35 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
@@ -1444,18 +1444,24 @@ static int gmc_v8_0_process_interrupt(struct
amdgpu_device *adev,
gmc_v8_0_set_fault_enable_default(adev, false);
if (printk_ratelimit()) {
- struct amdgpu_task_info task_info;
-
- memset(&task_info, 0, sizeof(struct amdgpu_task_info));
- amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
+ struct amdgpu_task_info *task_info;
+
+ task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
+ if (task_info) {
+ dev_err(adev->dev, "GPU fault detected: %d 0x%08x for
process %s pid %d thread %s pid %d\n",
+ entry->src_id, entry->src_data[0],
task_info->process_name,
+ task_info->tgid, task_info->task_name, task_info->pid);
+ amdgpu_vm_put_task_info_pasid(adev, task_info,
entry->pasid);
+ } else {
+ dev_err(adev->dev, "GPU fault detected: %d 0x%08x (no
process info)\n",
+ entry->src_id, entry->src_data[0]);
+ }
Same as above.
- dev_err(adev->dev, "GPU fault detected: %d 0x%08x for
process %s pid %d thread %s pid %d\n",
- entry->src_id, entry->src_data[0], task_info.process_name,
- task_info.tgid, task_info.task_name, task_info.pid);
dev_err(adev->dev, " VM_CONTEXT1_PROTECTION_FAULT_ADDR
0x%08X\n",
- addr);
+ addr);
dev_err(adev->dev, " VM_CONTEXT1_PROTECTION_FAULT_STATUS
0x%08X\n",
status);
+
gmc_v8_0_vm_decode_fault(adev, status, addr, mc_client,
entry->pasid);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 2ac5820e9c92..075d633109e3 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -549,7 +549,7 @@ static int gmc_v9_0_process_interrupt(struct
amdgpu_device *adev,
bool retry_fault = !!(entry->src_data[1] & 0x80);
bool write_fault = !!(entry->src_data[1] & 0x20);
uint32_t status = 0, cid = 0, rw = 0;
- struct amdgpu_task_info task_info;
+ struct amdgpu_task_info *task_info;
struct amdgpu_vmhub *hub;
const char *mmhub_cid;
const char *hub_name;
@@ -626,15 +626,23 @@ static int gmc_v9_0_process_interrupt(struct
amdgpu_device *adev,
if (!printk_ratelimit())
return 0;
- memset(&task_info, 0, sizeof(struct amdgpu_task_info));
- amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
+ task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
+ if (task_info) {
+ dev_err(adev->dev,
+ "[%s] %s page fault (src_id:%u ring:%u vmid:%u pasid:%u,
for process %s pid %d thread %s pid %d)\n",
+ hub_name, retry_fault ? "retry" : "no-retry",
+ entry->src_id, entry->ring_id, entry->vmid,
+ entry->pasid, task_info->process_name, task_info->tgid,
+ task_info->task_name, task_info->pid);
+ amdgpu_vm_put_task_info_pasid(adev, task_info, entry->pasid);
+ } else {
+ dev_err(adev->dev,
+ "[%s] %s page fault (src_id:%u ring:%u vmid:%u pasid:%u,
no process info)\n",
+ hub_name, retry_fault ? "retry" : "no-retry",
+ entry->src_id, entry->ring_id, entry->vmid,
+ entry->pasid);
+ }
Same as above.
- dev_err(adev->dev,
- "[%s] %s page fault (src_id:%u ring:%u vmid:%u pasid:%u, for
process %s pid %d thread %s pid %d)\n",
- hub_name, retry_fault ? "retry" : "no-retry",
- entry->src_id, entry->ring_id, entry->vmid,
- entry->pasid, task_info.process_name, task_info.tgid,
- task_info.task_name, task_info.pid);
dev_err(adev->dev, " in page starting at address 0x%016llx
from IH client 0x%x (%s)\n",
addr, entry->client_id,
soc15_ih_clientid_name[entry->client_id]);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index 3d68dd5523c6..515d1a1ff141 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -2104,7 +2104,7 @@ static int sdma_v4_0_print_iv_entry(struct
amdgpu_device *adev,
struct amdgpu_iv_entry *entry)
{
int instance;
- struct amdgpu_task_info task_info;
+ struct amdgpu_task_info *task_info;
u64 addr;
instance = sdma_v4_0_irq_id_to_seq(entry->client_id);
@@ -2116,15 +2116,23 @@ static int sdma_v4_0_print_iv_entry(struct
amdgpu_device *adev,
addr = (u64)entry->src_data[0] << 12;
addr |= ((u64)entry->src_data[1] & 0xf) << 44;
- memset(&task_info, 0, sizeof(struct amdgpu_task_info));
- amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
+ task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
+ if (task_info) {
+ dev_dbg_ratelimited(adev->dev,
+ "[sdma%d] address:0x%016llx src_id:%u ring:%u vmid:%u "
+ "pasid:%u, for process %s pid %d thread %s pid %d\n",
+ instance, addr, entry->src_id, entry->ring_id, entry->vmid,
+ entry->pasid, task_info->process_name, task_info->tgid,
+ task_info->task_name, task_info->pid);
+ amdgpu_vm_put_task_info_pasid(adev, task_info, entry->pasid);
+ } else {
+ dev_dbg_ratelimited(adev->dev,
+ "[sdma%d] address:0x%016llx src_id:%u ring:%u vmid:%u "
+ "pasid:%u, no process info\n",
+ instance, addr, entry->src_id, entry->ring_id, entry->vmid,
+ entry->pasid);
+ }
Same as above.
- dev_dbg_ratelimited(adev->dev,
- "[sdma%d] address:0x%016llx src_id:%u ring:%u vmid:%u "
- "pasid:%u, for process %s pid %d thread %s pid %d\n",
- instance, addr, entry->src_id, entry->ring_id, entry->vmid,
- entry->pasid, task_info.process_name, task_info.tgid,
- task_info.task_name, task_info.pid);
return 0;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 0f24af6f2810..d7e23bd90f7f 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -1642,7 +1642,7 @@ static int sdma_v4_4_2_print_iv_entry(struct
amdgpu_device *adev,
struct amdgpu_iv_entry *entry)
{
int instance;
- struct amdgpu_task_info task_info;
+ struct amdgpu_task_info *task_info;
u64 addr;
instance = sdma_v4_4_2_irq_id_to_seq(entry->client_id);
@@ -1654,15 +1654,23 @@ static int sdma_v4_4_2_print_iv_entry(struct
amdgpu_device *adev,
addr = (u64)entry->src_data[0] << 12;
addr |= ((u64)entry->src_data[1] & 0xf) << 44;
- memset(&task_info, 0, sizeof(struct amdgpu_task_info));
- amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
+ task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
+ if (task_info) {
+ dev_dbg_ratelimited(adev->dev,
+ "[sdma%d] address:0x%016llx src_id:%u ring:%u vmid:%u "
+ "pasid:%u, for process %s pid %d thread %s pid %d\n",
+ instance, addr, entry->src_id, entry->ring_id, entry->vmid,
+ entry->pasid, task_info->process_name, task_info->tgid,
+ task_info->task_name, task_info->pid);
+ amdgpu_vm_put_task_info_pasid(adev, task_info, entry->pasid);
+ } else {
+ dev_dbg_ratelimited(adev->dev,
+ "[sdma%d] address:0x%016llx src_id:%u ring:%u vmid:%u "
+ "pasid:%u (no process info)\n",
+ instance, addr, entry->src_id, entry->ring_id, entry->vmid,
+ entry->pasid);
+ }
Same as above.
Regards,
Felix
- dev_dbg_ratelimited(adev->dev,
- "[sdma%d] address:0x%016llx src_id:%u ring:%u vmid:%u "
- "pasid:%u, for process %s pid %d thread %s pid %d\n",
- instance, addr, entry->src_id, entry->ring_id, entry->vmid,
- entry->pasid, task_info.process_name, task_info.tgid,
- task_info.task_name, task_info.pid);
return 0;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index d9953c2b2661..0dfe4b3bd18a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -238,16 +238,16 @@ void
kfd_smi_event_update_thermal_throttling(struct kfd_node *dev,
void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t
pasid)
{
- struct amdgpu_task_info task_info;
-
- memset(&task_info, 0, sizeof(struct amdgpu_task_info));
- amdgpu_vm_get_task_info(dev->adev, pasid, &task_info);
- /* Report VM faults from user applications, not retry from
kernel */
- if (!task_info.pid)
- return;
-
- kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, "%x:%s\n",
- task_info.pid, task_info.task_name);
+ struct amdgpu_task_info *task_info;
+
+ task_info = amdgpu_vm_get_task_info_pasid(dev->adev, pasid);
+ if (task_info) {
+ /* Report VM faults from user applications, not retry from
kernel */
+ if (task_info->pid)
+ kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, "%x:%s\n",
+ task_info->pid, task_info->task_name);
+ amdgpu_vm_put_task_info_pasid(dev->adev, task_info, pasid);
+ }
}
void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t
pid,