drm/amdgpu: change vm->task_info handling
This patch changes the handling and lifecycle of vm->task_info object.
The major changes are:
- vm->task_info is a dynamically allocated ptr now, and its usage is
reference counted.
- introducing two new helper funcs for task_info lifecycle management
- amdgpu_vm_get_task_info: reference counts up task_info before
returning this info
- amdgpu_vm_put_task_info: reference counts down task_info
- the last put on task_info frees it and clears the pointer in the vm.
This patch also does logistical changes required for existing usage
of vm->task_info.
Cc: Christian Koenig <christian.koenig@xxxxxxx>
Cc: Alex Deucher <alexander.deucher@xxxxxxx>
Cc: Felix Kuehling <Felix.Kuehling@xxxxxxx>
Signed-off-by: Shashank Sharma <shashank.sharma@xxxxxxx>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 7 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 15 ++-
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 17 ++-
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 142 +++++++++++++++++---
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 24 +++-
drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 2 +-
drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 27 ++--
drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 28 ++--
drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c | 26 ++--
drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 28 ++--
drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 20 +--
drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 19 +--
drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 17 +--
13 files changed, 259 insertions(+), 113 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index a4faea4fa0b5..111f8afb03a1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1763,9 +1763,12 @@ static int amdgpu_debugfs_vm_info_show(struct
seq_file *m, void *unused)
list_for_each_entry(file, &dev->filelist, lhead) {
struct amdgpu_fpriv *fpriv = file->driver_priv;
struct amdgpu_vm *vm = &fpriv->vm;
+ struct amdgpu_task_info *ti;
+
+ ti = amdgpu_vm_get_task_info_vm(vm);
+ seq_printf(m, "pid:%d\tProcess:%s ----------\n", ti->pid,
ti->process_name);
+ amdgpu_vm_put_task_info_vm(ti, vm);
- seq_printf(m, "pid:%d\tProcess:%s ----------\n",
- vm->task_info.pid, vm->task_info.process_name);
r = amdgpu_bo_reserve(vm->root.bo, true);
if (r)
break;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 2b8356699f23..00516fa178b5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4952,10 +4952,17 @@ int amdgpu_do_asic_reset(struct list_head
*device_list_handle,
tmp_adev->reset_vram_lost = vram_lost;
memset(&tmp_adev->reset_task_info, 0,
sizeof(tmp_adev->reset_task_info));
- if (reset_context->job && reset_context->job->vm)
- tmp_adev->reset_task_info =
- reset_context->job->vm->task_info;
- amdgpu_reset_capture_coredumpm(tmp_adev);
+ if (reset_context->job && reset_context->job->vm) {
+ struct amdgpu_task_info *ti;
+ struct amdgpu_vm *vm = reset_context->job->vm;
+
+ ti = amdgpu_vm_get_task_info_vm(vm);
+ if (ti) {
+ tmp_adev->reset_task_info = *ti;
+ amdgpu_reset_capture_coredumpm(tmp_adev);
+ amdgpu_vm_put_task_info_vm(ti, vm);
+ }
+ }
#endif
if (vram_lost) {
DRM_INFO("VRAM is lost due to GPU reset!\n");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 78476bc75b4e..b89ee6ab7db9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -35,7 +35,7 @@ static enum drm_gpu_sched_stat
amdgpu_job_timedout(struct drm_sched_job *s_job)
{
struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
struct amdgpu_job *job = to_amdgpu_job(s_job);
- struct amdgpu_task_info ti;
+ struct amdgpu_task_info *ti;
struct amdgpu_device *adev = ring->adev;
int idx;
int r;
@@ -58,12 +58,15 @@ static enum drm_gpu_sched_stat
amdgpu_job_timedout(struct drm_sched_job *s_job)
goto exit;
}
- amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
- DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
- job->base.sched->name,
atomic_read(&ring->fence_drv.last_seq),
- ring->fence_drv.sync_seq);
- DRM_ERROR("Process information: process %s pid %d thread %s pid
%d\n",
- ti.process_name, ti.tgid, ti.task_name, ti.pid);
+ ti = amdgpu_vm_get_task_info_pasid(ring->adev, job->pasid);
+ if (ti) {
+ DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
+ job->base.sched->name,
atomic_read(&ring->fence_drv.last_seq),
+ ring->fence_drv.sync_seq);
+ DRM_ERROR("Process information: process %s pid %d thread %s
pid %d\n",
+ ti->process_name, ti->tgid, ti->task_name, ti->pid);
+ amdgpu_vm_put_task_info_pasid(ring->adev, ti, job->pasid);
+ }
dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 82f25996ff5e..0d655bffad7f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2332,6 +2332,9 @@ void amdgpu_vm_fini(struct amdgpu_device *adev,
struct amdgpu_vm *vm)
amdgpu_amdkfd_gpuvm_destroy_cb(adev, vm);
+ if (vm->task_info)
+ amdgpu_vm_put_task_info_vm(vm->task_info, vm);
+
flush_work(&vm->pt_free_work);
root = amdgpu_bo_ref(vm->root.bo);
@@ -2492,26 +2495,129 @@ int amdgpu_vm_ioctl(struct drm_device *dev,
void *data, struct drm_file *filp)
return 0;
}
+/* kref release callback: frees the amdgpu_task_info that embeds @kref. */
+static void amdgpu_vm_destroy_task_info(struct kref *kref)
+{
+	kfree(container_of(kref, struct amdgpu_task_info, refcount));
+}
+
+/*
+ * Look up the entry stored for @pasid in adev->vm_manager.pasids while
+ * holding the xarray lock with interrupts disabled, and return it.
+ *
+ * NOTE(review): the lock is dropped before returning and no reference is
+ * taken here, so nothing visible in this function keeps the VM alive for
+ * the caller - confirm callers cannot race with VM teardown.
+ */
+static inline struct amdgpu_vm *
+amdgpu_vm_get_vm_from_pasid(struct amdgpu_device *adev, u32 pasid)
+{
+	unsigned long irqflags;
+	struct amdgpu_vm *found;
+
+	xa_lock_irqsave(&adev->vm_manager.pasids, irqflags);
+	found = xa_load(&adev->vm_manager.pasids, pasid);
+	xa_unlock_irqrestore(&adev->vm_manager.pasids, irqflags);
+
+	return found;
+}
+
+/**
+ * amdgpu_vm_put_task_info_pasid - reference down the vm task_info ptr
+ * frees the vm task_info ptr at the last put
+ *
+ * @adev: drm device pointer
+ * @task_info: task_info to release; NULL is accepted and ignored
+ * @pasid: pasid of the VM which contains task_info
+ *
+ * If this call drops the last reference, the VM is looked up by @pasid and
+ * its task_info pointer is cleared; a WARN is raised if no VM is found.
+ */
+void amdgpu_vm_put_task_info_pasid(struct amdgpu_device *adev,
+				   struct amdgpu_task_info *task_info,
+				   u32 pasid)
+{
+	struct amdgpu_vm *vm;
+
+	/* The getters may return NULL; make put(NULL) a harmless no-op. */
+	if (!task_info)
+		return;
+
+	/* kref_put() returns 1 only when this call released the object. */
+	if (!kref_put(&task_info->refcount, amdgpu_vm_destroy_task_info))
+		return;
+
+	/* task_info has been freed; drop the VM's now stale pointer to it. */
+	vm = amdgpu_vm_get_vm_from_pasid(adev, pasid);
+	if (!vm) {
+		WARN(1, "Invalid PASID %u to put task info\n", pasid);
+		return;
+	}
+
+	vm->task_info = NULL;
+}
+
+/**
+ * amdgpu_vm_put_task_info_vm - reference down the vm task_info ptr
+ * frees the vm task_info ptr at the last refdown
+ *
+ * @task_info: task_info to release; NULL is accepted and ignored
+ * @vm: VM which contains task_info
+ *
+ * If this call drops the last reference, @vm->task_info is cleared.
+ */
+void amdgpu_vm_put_task_info_vm(struct amdgpu_task_info *task_info,
+				struct amdgpu_vm *vm)
+{
+	/* The getters may return NULL; make put(NULL) a harmless no-op. */
+	if (!task_info)
+		return;
+
+	/* kref_put() returns 1 only when this call released the object. */
+	if (kref_put(&task_info->refcount, amdgpu_vm_destroy_task_info))
+		vm->task_info = NULL;
+}
+
/**
- * amdgpu_vm_get_task_info - Extracts task info for a PASID.
+ * amdgpu_vm_get_task_info_pasid - Extracts task info for a PASID.
*
* @adev: drm device pointer
* @pasid: PASID identifier for VM
- * @task_info: task_info to fill.
+ *
+ * Returns the task_info of the VM found for @pasid with its reference
+ * count raised, or NULL if no VM is found for @pasid or that VM has no
+ * task_info. A non-NULL result must be referenced down with
+ * amdgpu_vm_put_task_info_pasid().
*/
-void amdgpu_vm_get_task_info(struct amdgpu_device *adev, u32 pasid,
- struct amdgpu_task_info *task_info)
+struct amdgpu_task_info *
+amdgpu_vm_get_task_info_pasid(struct amdgpu_device *adev, u32 pasid)
{
struct amdgpu_vm *vm;
- unsigned long flags;
+	struct amdgpu_task_info *ti = NULL;
- xa_lock_irqsave(&adev->vm_manager.pasids, flags);
+	vm = amdgpu_vm_get_vm_from_pasid(adev, pasid);
+	/* task_info is allocated lazily and may be absent: never kref_get(NULL) */
+	if (vm && vm->task_info) {
+		ti = vm->task_info;
+		kref_get(&ti->refcount);
+	}
- vm = xa_load(&adev->vm_manager.pasids, pasid);
- if (vm)
- *task_info = vm->task_info;
+	return ti;
+}
- xa_unlock_irqrestore(&adev->vm_manager.pasids, flags);
+/**
+ * amdgpu_vm_get_task_info_vm - Extracts task info for a vm.
+ *
+ * @vm: VM to get info from; may be NULL
+ *
+ * Returns @vm's task_info with its reference count raised, or NULL if @vm
+ * is NULL or has no task_info. A non-NULL result must be referenced down
+ * with amdgpu_vm_put_task_info_vm().
+ */
+struct amdgpu_task_info *
+amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm)
+{
+	struct amdgpu_task_info *ti = NULL;
+
+	/* task_info is allocated lazily and may be absent: never kref_get(NULL) */
+	if (vm && vm->task_info) {
+		ti = vm->task_info;
+		kref_get(&ti->refcount);
+	}
+
+	return ti;
+}
+
+/*
+ * Allocate a zeroed amdgpu_task_info, store it in @vm->task_info and
+ * initialise its refcount. Returns 0 on success, -ENOMEM if the
+ * allocation fails (in which case @vm->task_info is left NULL).
+ */
+static int amdgpu_vm_create_task_info(struct amdgpu_vm *vm)
+{
+	struct amdgpu_task_info *ti;
+
+	ti = kzalloc(sizeof(struct amdgpu_task_info), GFP_KERNEL);
+	vm->task_info = ti;
+	if (!ti) {
+		DRM_ERROR("OOM while creating task_info space\n");
+		return -ENOMEM;
+	}
+
+	kref_init(&ti->refcount);
+	return 0;
}
/**
@@ -2521,17 +2627,23 @@ void amdgpu_vm_get_task_info(struct
amdgpu_device *adev, u32 pasid,
*/
void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
{
- if (vm->task_info.pid)
+	struct amdgpu_task_info *ti;
+
+	/* Allocate the record on first use; on OOM leave the VM without one. */
+	if (!vm->task_info && amdgpu_vm_create_task_info(vm))
+		return;
+
+	ti = vm->task_info;
+	/* Nothing to do when the calling task is already recorded. */
+	if (ti->pid == current->pid)
return;
- vm->task_info.pid = current->pid;
- get_task_comm(vm->task_info.task_name, current);
+	ti->pid = current->pid;
+	get_task_comm(ti->task_name, current);
if (current->group_leader->mm != current->mm)
return;
- vm->task_info.tgid = current->group_leader->pid;
- get_task_comm(vm->task_info.process_name, current->group_leader);
+	ti->tgid = current->group_leader->pid;
+	get_task_comm(ti->process_name, current->group_leader);
}
/**
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 204ab13184ed..c2dce85d4f9e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -183,10 +183,11 @@ struct amdgpu_vm_pte_funcs {
};
struct amdgpu_task_info {
- char process_name[TASK_COMM_LEN];
- char task_name[TASK_COMM_LEN];
- pid_t pid;
- pid_t tgid;
+ char process_name[TASK_COMM_LEN]; /* comm of the thread-group leader */
+ char task_name[TASK_COMM_LEN]; /* comm of the task that set this info */
+ pid_t pid; /* pid of the task that set this info */
+ pid_t tgid; /* pid of that task's thread-group leader */
+ struct kref refcount; /* dropped via amdgpu_vm_put_task_info_pasid()/_vm() */
};
/**
@@ -334,7 +335,7 @@ struct amdgpu_vm {
uint64_t pd_phys_addr;
/* Some basic info about the task */
- struct amdgpu_task_info task_info;
+ struct amdgpu_task_info *task_info;
/* Store positions of group of BOs */
struct ttm_lru_bulk_move lru_bulk_move;
@@ -466,8 +467,17 @@ bool amdgpu_vm_need_pipeline_sync(struct
amdgpu_ring *ring,
struct amdgpu_job *job);
void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev);
-void amdgpu_vm_get_task_info(struct amdgpu_device *adev, u32 pasid,
- struct amdgpu_task_info *task_info);
+struct amdgpu_task_info *
+amdgpu_vm_get_task_info_pasid(struct amdgpu_device *adev, u32 pasid);
+
+void amdgpu_vm_put_task_info_pasid(struct amdgpu_device *adev,
+ struct amdgpu_task_info *task_info,
+ u32 pasid);
+
+struct amdgpu_task_info *amdgpu_vm_get_task_info_vm(struct amdgpu_vm
*vm);
+
+void amdgpu_vm_put_task_info_vm(struct amdgpu_task_info *task_info,
struct amdgpu_vm *vm);
+
bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
u32 vmid, u32 node_id, uint64_t addr,
bool write_fault);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
index 96d601e209b8..f8323957d8bf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
@@ -1032,7 +1032,7 @@ int amdgpu_vm_ptes_update(struct
amdgpu_vm_update_params *params,
trace_amdgpu_vm_update_ptes(params, frag_start, upd_end,
min(nptes, 32u), dst, incr,
upd_flags,
- vm->task_info.tgid,
+ vm->task_info ? vm->task_info->tgid : 0,
vm->immediate.fence_context);
amdgpu_vm_pte_update_flags(params, to_amdgpu_bo_vm(pt),
cursor.level, pe_start, dst,
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index fa87a85e1017..12c624876243 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -107,7 +107,7 @@ static int gmc_v10_0_process_interrupt(struct
amdgpu_device *adev,
struct amdgpu_vmhub *hub = &adev->vmhub[vmhub_index];
bool retry_fault = !!(entry->src_data[1] & 0x80);
bool write_fault = !!(entry->src_data[1] & 0x20);
- struct amdgpu_task_info task_info;
+ struct amdgpu_task_info *task_info;
uint32_t status = 0;
u64 addr;
@@ -155,18 +155,19 @@ static int gmc_v10_0_process_interrupt(struct
amdgpu_device *adev,
if (!printk_ratelimit())
return 0;
- memset(&task_info, 0, sizeof(struct amdgpu_task_info));
- amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
-
- dev_err(adev->dev,
- "[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u, for
process %s pid %d thread %s pid %d)\n",
- entry->vmid_src ? "mmhub" : "gfxhub",
- entry->src_id, entry->ring_id, entry->vmid,
- entry->pasid, task_info.process_name, task_info.tgid,
- task_info.task_name, task_info.pid);
- dev_err(adev->dev, " in page starting at address 0x%016llx from
client 0x%x (%s)\n",
- addr, entry->client_id,
- soc15_ih_clientid_name[entry->client_id]);
+ task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
+ if (task_info) {