[PATCH] drm/amdkfd: fix vm-pasid lookup for multiple partitions

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Currently multiple partitions will incorrectly overwrite the VM lookup
table since the table is indexed by PASID and multiple partitions can
register different VM objects on the same PASID.

This results in loading the wrong VM object on PASID query.

To correct this, setup the lookup table to be per-partition-per-PASID
instead.

Signed-off-by: Jonathan Kim <jonathan.kim@xxxxxxx>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    | 12 ++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       |  4 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c       |  7 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c        | 55 +++++++++++--------
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h        | 11 +++-
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c        |  5 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c        |  5 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c        |  5 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c         |  3 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c         |  5 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c         | 16 ++----
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c        |  2 +-
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c      |  4 +-
 drivers/gpu/drm/amd/amdkfd/kfd_events.c       |  3 +-
 .../gpu/drm/amd/amdkfd/kfd_int_process_v10.c  |  8 +--
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   |  8 +--
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c   |  3 +-
 18 files changed, 92 insertions(+), 65 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index c272461d70a9..28db789610e1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -887,3 +887,15 @@ int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device *adev, u32 doorbell_off,
 
 	return r;
 }
+
+int amdgpu_amdkfd_node_id_to_xcc_id(struct amdgpu_device *adev, uint32_t node_id)
+{
+	if (adev->gfx.funcs->ih_node_to_logical_xcc) {
+		int xcc_id = adev->gfx.funcs->ih_node_to_logical_xcc(adev, node_id);
+
+		if (xcc_id >= 0)
+			return xcc_id;
+	}
+
+	return 0;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 4ed49265c764..bf8bb45d8ab6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -356,6 +356,7 @@ void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
 		uint64_t size, u32 alloc_flag, int8_t xcp_id);
 
 u64 amdgpu_amdkfd_xcp_memory_size(struct amdgpu_device *adev, int xcp_id);
+int amdgpu_amdkfd_node_id_to_xcc_id(struct amdgpu_device *adev, uint32_t node_id);
 
 #define KFD_XCP_MEM_ID(adev, xcp_id) \
 		((adev)->xcp_mgr && (xcp_id) >= 0 ?\
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index c6a1783fc9ef..bf9f8802e18d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -37,7 +37,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 	struct amdgpu_job *job = to_amdgpu_job(s_job);
 	struct amdgpu_task_info *ti;
 	struct amdgpu_device *adev = ring->adev;
-	int idx;
+	int idx, xcp_id = !job->vm ? 0 : job->vm->xcp_id;
 	int r;
 
 	if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
@@ -62,7 +62,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 		job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
 		ring->fence_drv.sync_seq);
 
-	ti = amdgpu_vm_get_task_info_pasid(ring->adev, job->pasid);
+	ti = amdgpu_vm_get_task_info_pasid(ring->adev, job->pasid, xcp_id);
 	if (ti) {
 		dev_err(adev->dev,
 			"Process information: process %s pid %d thread %s pid %d\n",
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index d9fde38f6ee2..e413bf4a3e84 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -1275,17 +1275,20 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
 		struct amdgpu_vm *vm = &fpriv->vm;
 		struct drm_amdgpu_info_gpuvm_fault gpuvm_fault;
 		unsigned long flags;
+		int i;
 
 		if (!vm)
 			return -EINVAL;
 
 		memset(&gpuvm_fault, 0, sizeof(gpuvm_fault));
 
-		xa_lock_irqsave(&adev->vm_manager.pasids, flags);
+		for (i = 0; i < adev->xcp_mgr->num_xcps; i++)
+			xa_lock_irqsave(&adev->vm_manager.pasids[i], flags);
 		gpuvm_fault.addr = vm->fault_info.addr;
 		gpuvm_fault.status = vm->fault_info.status;
 		gpuvm_fault.vmhub = vm->fault_info.vmhub;
-		xa_unlock_irqrestore(&adev->vm_manager.pasids, flags);
+		for (i = 0; i < adev->xcp_mgr->num_xcps; i++)
+			xa_unlock_irqrestore(&adev->vm_manager.pasids[i], flags);
 
 		return copy_to_user(out, &gpuvm_fault,
 				    min((size_t)size, sizeof(gpuvm_fault))) ? -EFAULT : 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index bcb729094521..f43e1c15f423 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -146,7 +146,7 @@ int amdgpu_vm_set_pasid(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 		return 0;
 
 	if (vm->pasid) {
-		r = xa_err(xa_erase_irq(&adev->vm_manager.pasids, vm->pasid));
+		r = xa_err(xa_erase_irq(&adev->vm_manager.pasids[vm->xcp_id], vm->pasid));
 		if (r < 0)
 			return r;
 
@@ -154,7 +154,7 @@ int amdgpu_vm_set_pasid(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 	}
 
 	if (pasid) {
-		r = xa_err(xa_store_irq(&adev->vm_manager.pasids, pasid, vm,
+		r = xa_err(xa_store_irq(&adev->vm_manager.pasids[vm->xcp_id], pasid, vm,
 					GFP_KERNEL));
 		if (r < 0)
 			return r;
@@ -2288,14 +2288,14 @@ static void amdgpu_vm_destroy_task_info(struct kref *kref)
 }
 
 static inline struct amdgpu_vm *
-amdgpu_vm_get_vm_from_pasid(struct amdgpu_device *adev, u32 pasid)
+amdgpu_vm_get_vm_from_pasid(struct amdgpu_device *adev, u32 pasid, u32 xcp_id)
 {
 	struct amdgpu_vm *vm;
 	unsigned long flags;
 
-	xa_lock_irqsave(&adev->vm_manager.pasids, flags);
-	vm = xa_load(&adev->vm_manager.pasids, pasid);
-	xa_unlock_irqrestore(&adev->vm_manager.pasids, flags);
+	xa_lock_irqsave(&adev->vm_manager.pasids[xcp_id], flags);
+	vm = xa_load(&adev->vm_manager.pasids[xcp_id], pasid);
+	xa_unlock_irqrestore(&adev->vm_manager.pasids[xcp_id], flags);
 
 	return vm;
 }
@@ -2343,10 +2343,10 @@ amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm)
  * referenced down with amdgpu_vm_put_task_info.
  */
 struct amdgpu_task_info *
-amdgpu_vm_get_task_info_pasid(struct amdgpu_device *adev, u32 pasid)
+amdgpu_vm_get_task_info_pasid(struct amdgpu_device *adev, u32 pasid, u32 xcp_id)
 {
 	return amdgpu_vm_get_task_info_vm(
-			amdgpu_vm_get_vm_from_pasid(adev, pasid));
+			amdgpu_vm_get_vm_from_pasid(adev, pasid, xcp_id));
 }
 
 static int amdgpu_vm_create_task_info(struct amdgpu_vm *vm)
@@ -2481,6 +2481,8 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 	amdgpu_bo_unreserve(vm->root.bo);
 	amdgpu_bo_unref(&root_bo);
 
+	vm->xcp_id = xcp_id < 0 ? 0 : xcp_id;
+
 	return 0;
 
 error_free_root:
@@ -2695,8 +2697,8 @@ void amdgpu_vm_manager_init(struct amdgpu_device *adev)
 #else
 	adev->vm_manager.vm_update_mode = 0;
 #endif
-
-	xa_init_flags(&adev->vm_manager.pasids, XA_FLAGS_LOCK_IRQ);
+	for (i = 0; i < MAX_XCP; i++)
+		xa_init_flags(&(adev->vm_manager.pasids[i]), XA_FLAGS_LOCK_IRQ);
 }
 
 /**
@@ -2708,10 +2710,15 @@ void amdgpu_vm_manager_init(struct amdgpu_device *adev)
  */
 void amdgpu_vm_manager_fini(struct amdgpu_device *adev)
 {
-	WARN_ON(!xa_empty(&adev->vm_manager.pasids));
-	xa_destroy(&adev->vm_manager.pasids);
+	int i;
+
+	for (i = 0; i < MAX_XCP; i++) {
+		WARN_ON(!xa_empty(&adev->vm_manager.pasids[i]));
+		xa_destroy(&adev->vm_manager.pasids[i]);
+	}
 
 	amdgpu_vmid_mgr_fini(adev);
+
 }
 
 /**
@@ -2778,17 +2785,18 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
 	unsigned long irqflags;
 	uint64_t value, flags;
 	struct amdgpu_vm *vm;
-	int r;
+	int r, xcp_id;
 
-	xa_lock_irqsave(&adev->vm_manager.pasids, irqflags);
-	vm = xa_load(&adev->vm_manager.pasids, pasid);
+	xcp_id = amdgpu_amdkfd_node_id_to_xcc_id(adev, node_id)/adev->gfx.num_xcc_per_xcp;
+	xa_lock_irqsave(&adev->vm_manager.pasids[xcp_id], irqflags);
+	vm = xa_load(&adev->vm_manager.pasids[xcp_id], pasid);
 	if (vm) {
 		root = amdgpu_bo_ref(vm->root.bo);
 		is_compute_context = vm->is_compute_context;
 	} else {
 		root = NULL;
 	}
-	xa_unlock_irqrestore(&adev->vm_manager.pasids, irqflags);
+	xa_unlock_irqrestore(&adev->vm_manager.pasids[xcp_id], irqflags);
 
 	if (!root)
 		return false;
@@ -2806,11 +2814,11 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
 		goto error_unref;
 
 	/* Double check that the VM still exists */
-	xa_lock_irqsave(&adev->vm_manager.pasids, irqflags);
-	vm = xa_load(&adev->vm_manager.pasids, pasid);
+	xa_lock_irqsave(&adev->vm_manager.pasids[xcp_id], irqflags);
+	vm = xa_load(&adev->vm_manager.pasids[xcp_id], pasid);
 	if (vm && vm->root.bo != root)
 		vm = NULL;
-	xa_unlock_irqrestore(&adev->vm_manager.pasids, irqflags);
+	xa_unlock_irqrestore(&adev->vm_manager.pasids[xcp_id], irqflags);
 	if (!vm)
 		goto error_unlock;
 
@@ -2968,14 +2976,15 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,
 				  unsigned int pasid,
 				  uint64_t addr,
 				  uint32_t status,
-				  unsigned int vmhub)
+				  unsigned int vmhub,
+				  uint32_t xcp_id)
 {
 	struct amdgpu_vm *vm;
 	unsigned long flags;
 
-	xa_lock_irqsave(&adev->vm_manager.pasids, flags);
+	xa_lock_irqsave(&adev->vm_manager.pasids[xcp_id], flags);
 
-	vm = xa_load(&adev->vm_manager.pasids, pasid);
+	vm = xa_load(&adev->vm_manager.pasids[xcp_id], pasid);
 	/* Don't update the fault cache if status is 0.  In the multiple
 	 * fault case, subsequent faults will return a 0 status which is
 	 * useless for userspace and replaces the useful fault status, so
@@ -3008,7 +3017,7 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,
 			WARN_ONCE(1, "Invalid vmhub %u\n", vmhub);
 		}
 	}
-	xa_unlock_irqrestore(&adev->vm_manager.pasids, flags);
+	xa_unlock_irqrestore(&adev->vm_manager.pasids[xcp_id], flags);
 }
 
 /**
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 046949c4b695..1499f5f731e9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -35,6 +35,7 @@
 #include "amdgpu_sync.h"
 #include "amdgpu_ring.h"
 #include "amdgpu_ids.h"
+#include "amdgpu_xcp.h"
 
 struct drm_exec;
 
@@ -418,6 +419,9 @@ struct amdgpu_vm {
 
 	/* cached fault info */
 	struct amdgpu_vm_fault_info fault_info;
+
+	/* XCP ID */
+	int xcp_id;
 };
 
 struct amdgpu_vm_manager {
@@ -456,7 +460,7 @@ struct amdgpu_vm_manager {
 	/* PASID to VM mapping, will be used in interrupt context to
 	 * look up VM of a page fault
 	 */
-	struct xarray				pasids;
+	struct xarray				pasids[MAX_XCP];
 	/* Global registration of recent page fault information */
 	struct amdgpu_vm_fault_info	fault_info;
 };
@@ -550,7 +554,7 @@ bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring,
 void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev);
 
 struct amdgpu_task_info *
-amdgpu_vm_get_task_info_pasid(struct amdgpu_device *adev, u32 pasid);
+amdgpu_vm_get_task_info_pasid(struct amdgpu_device *adev, u32 pasid, u32 xcp_id);
 
 struct amdgpu_task_info *
 amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm);
@@ -649,7 +653,8 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,
 				  unsigned int pasid,
 				  uint64_t addr,
 				  uint32_t status,
-				  unsigned int vmhub);
+				  unsigned int vmhub,
+				  uint32_t xcp_id);
 void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev,
 				 struct amdgpu_vm *vm,
 				 struct dma_fence **fence);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index f0ceab3ce5bf..24b042febf5c 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -151,7 +151,8 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
 		WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
 
 		amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status,
-					     entry->vmid_src ? AMDGPU_MMHUB0(0) : AMDGPU_GFXHUB(0));
+					     entry->vmid_src ? AMDGPU_MMHUB0(0) : AMDGPU_GFXHUB(0),
+					     0);
 	}
 
 	if (!printk_ratelimit())
@@ -161,7 +162,7 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
 		"[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u)\n",
 		entry->vmid_src ? "mmhub" : "gfxhub",
 		entry->src_id, entry->ring_id, entry->vmid, entry->pasid);
-	task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
+	task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid, 0);
 	if (task_info) {
 		dev_err(adev->dev,
 			" in process %s pid %d thread %s pid %d\n",
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
index 2797fd84432b..3507046d33e6 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -122,7 +122,8 @@ static int gmc_v11_0_process_interrupt(struct amdgpu_device *adev,
 		WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
 
 		amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status,
-					     entry->vmid_src ? AMDGPU_MMHUB0(0) : AMDGPU_GFXHUB(0));
+					     entry->vmid_src ? AMDGPU_MMHUB0(0) : AMDGPU_GFXHUB(0),
+					     0);
 	}
 
 	if (printk_ratelimit()) {
@@ -132,7 +133,7 @@ static int gmc_v11_0_process_interrupt(struct amdgpu_device *adev,
 			"[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u)\n",
 			entry->vmid_src ? "mmhub" : "gfxhub",
 			entry->src_id, entry->ring_id, entry->vmid, entry->pasid);
-		task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
+		task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid, 0);
 		if (task_info) {
 			dev_err(adev->dev,
 				" in process %s pid %d thread %s pid %d)\n",
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
index 60acf676000b..9844564c6c74 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
@@ -115,7 +115,8 @@ static int gmc_v12_0_process_interrupt(struct amdgpu_device *adev,
 		WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
 
 		amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status,
-					     entry->vmid_src ? AMDGPU_MMHUB0(0) : AMDGPU_GFXHUB(0));
+					     entry->vmid_src ? AMDGPU_MMHUB0(0) : AMDGPU_GFXHUB(0),
+					     0);
 	}
 
 	if (printk_ratelimit()) {
@@ -125,7 +126,7 @@ static int gmc_v12_0_process_interrupt(struct amdgpu_device *adev,
 			"[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u)\n",
 			entry->vmid_src ? "mmhub" : "gfxhub",
 			entry->src_id, entry->ring_id, entry->vmid, entry->pasid);
-		task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
+		task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid, 0);
 		if (task_info) {
 			dev_err(adev->dev,
 				" in process %s pid %d thread %s pid %d)\n",
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
index 994432fb57ea..2cdb0cbb7c4d 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
@@ -1268,7 +1268,8 @@ static int gmc_v7_0_process_interrupt(struct amdgpu_device *adev,
 		return 0;
 
 	amdgpu_vm_update_fault_cache(adev, entry->pasid,
-				     ((u64)addr) << AMDGPU_GPU_PAGE_SHIFT, status, AMDGPU_GFXHUB(0));
+				     ((u64)addr) << AMDGPU_GPU_PAGE_SHIFT, status,
+				     AMDGPU_GFXHUB(0), 0);
 
 	if (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_FIRST)
 		gmc_v7_0_set_fault_enable_default(adev, false);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
index 86488c052f82..6855caeb7f74 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
@@ -1437,7 +1437,8 @@ static int gmc_v8_0_process_interrupt(struct amdgpu_device *adev,
 		return 0;
 
 	amdgpu_vm_update_fault_cache(adev, entry->pasid,
-				     ((u64)addr) << AMDGPU_GPU_PAGE_SHIFT, status, AMDGPU_GFXHUB(0));
+				     ((u64)addr) << AMDGPU_GPU_PAGE_SHIFT, status,
+				     AMDGPU_GFXHUB(0), 0);
 
 	if (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_FIRST)
 		gmc_v8_0_set_fault_enable_default(adev, false);
@@ -1448,7 +1449,7 @@ static int gmc_v8_0_process_interrupt(struct amdgpu_device *adev,
 		dev_err(adev->dev, "GPU fault detected: %d 0x%08x\n",
 			entry->src_id, entry->src_data[0]);
 
-		task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
+		task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid, 0);
 		if (task_info) {
 			dev_err(adev->dev, " for process %s pid %d thread %s pid %d\n",
 				task_info->process_name, task_info->tgid,
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index b73136d390cc..e183e08b2c02 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -556,10 +556,12 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
 	unsigned int vmhub;
 	u64 addr;
 	uint32_t cam_index = 0;
-	int ret, xcc_id = 0;
-	uint32_t node_id;
+	int ret;
+	uint32_t node_id, xcc_id, xcp_id;
 
 	node_id = entry->node_id;
+	xcc_id = amdgpu_amdkfd_node_id_to_xcc_id(adev, node_id);
+	xcp_id = xcc_id/adev->gfx.num_xcc_per_xcp;
 
 	addr = (u64)entry->src_data[0] << 12;
 	addr |= ((u64)entry->src_data[1] & 0xf) << 44;
@@ -572,12 +574,6 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
 		vmhub = AMDGPU_MMHUB1(0);
 	} else {
 		hub_name = "gfxhub0";
-		if (adev->gfx.funcs->ih_node_to_logical_xcc) {
-			xcc_id = adev->gfx.funcs->ih_node_to_logical_xcc(adev,
-				node_id);
-			if (xcc_id < 0)
-				xcc_id = 0;
-		}
 		vmhub = xcc_id;
 	}
 	hub = &adev->vmhub[vmhub];
@@ -631,7 +627,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
 		retry_fault ? "retry" : "no-retry",
 		entry->src_id, entry->ring_id, entry->vmid, entry->pasid);
 
-	task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
+	task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid, xcp_id);
 	if (task_info) {
 		dev_err(adev->dev,
 			" for process %s pid %d thread %s pid %d)\n",
@@ -675,7 +671,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
 	if (!amdgpu_sriov_vf(adev))
 		WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
 
-	amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status, vmhub);
+	amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status, vmhub, xcp_id);
 
 	dev_err(adev->dev,
 		"VM_L2_PROTECTION_FAULT_STATUS:0x%08X\n",
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index 23ef4eb36b40..1ac4224bbe5b 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -2182,7 +2182,7 @@ static int sdma_v4_0_print_iv_entry(struct amdgpu_device *adev,
 			   instance, addr, entry->src_id, entry->ring_id, entry->vmid,
 			   entry->pasid);
 
-	task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
+	task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid, 0);
 	if (task_info) {
 		dev_dbg_ratelimited(adev->dev,
 				    " for process %s pid %d thread %s pid %d\n",
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 57f16c09abfc..c8b5c0302ca7 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -1683,6 +1683,8 @@ static int sdma_v4_4_2_print_iv_entry(struct amdgpu_device *adev,
 	int instance;
 	struct amdgpu_task_info *task_info;
 	u64 addr;
+	uint32_t xcc_id = amdgpu_amdkfd_node_id_to_xcc_id(adev, entry->node_id);
+	uint32_t xcp_id = xcc_id/adev->gfx.num_xcc_per_xcp;
 
 	instance = sdma_v4_4_2_irq_id_to_seq(adev, entry->client_id);
 	if (instance < 0 || instance >= adev->sdma.num_instances) {
@@ -1698,7 +1700,7 @@ static int sdma_v4_4_2_print_iv_entry(struct amdgpu_device *adev,
 			    instance, addr, entry->src_id, entry->ring_id, entry->vmid,
 			    entry->pasid);
 
-	task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
+	task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid, xcp_id);
 	if (task_info) {
 		dev_dbg_ratelimited(adev->dev, " for process %s pid %d thread %s pid %d\n",
 				    task_info->process_name, task_info->tgid,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index ea3792249209..c098fbaf0e1c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -1262,8 +1262,9 @@ void kfd_signal_reset_event(struct kfd_node *dev)
 
 		if (dev->dqm->detect_hang_count) {
 			struct amdgpu_task_info *ti;
+			uint32_t xcp_id = dev->xcp ? dev->xcp->id : 0;
 
-			ti = amdgpu_vm_get_task_info_pasid(dev->adev, p->pasid);
+			ti = amdgpu_vm_get_task_info_pasid(dev->adev, p->pasid, xcp_id);
 			if (ti) {
 				dev_err(dev->adev->dev,
 					"Queues reset on process %s tid %d thread %s pid %d\n",
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
index 8e0d0356e810..d7cbf9525698 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
@@ -377,12 +377,8 @@ static void event_interrupt_wq_v10(struct kfd_node *dev,
 		struct kfd_hsa_memory_exception_data exception_data;
 
 		/* gfxhub */
-		if (!vmid_type && dev->adev->gfx.funcs->ih_node_to_logical_xcc) {
-			hub_inst = dev->adev->gfx.funcs->ih_node_to_logical_xcc(dev->adev,
-				node_id);
-			if (hub_inst < 0)
-				hub_inst = 0;
-		}
+		if (!vmid_type)
+			hub_inst = amdgpu_amdkfd_node_id_to_xcc_id(dev->adev, node_id);
 
 		/* mmhub */
 		if (vmid_type && client_id == SOC15_IH_CLIENTID_VMC)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index a9c3580be8c9..4708b8c811a5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -437,12 +437,8 @@ static void event_interrupt_wq_v9(struct kfd_node *dev,
 		struct kfd_hsa_memory_exception_data exception_data;
 
 		/* gfxhub */
-		if (!vmid_type && dev->adev->gfx.funcs->ih_node_to_logical_xcc) {
-			hub_inst = dev->adev->gfx.funcs->ih_node_to_logical_xcc(dev->adev,
-				node_id);
-			if (hub_inst < 0)
-				hub_inst = 0;
-		}
+		if (!vmid_type)
+			hub_inst = amdgpu_amdkfd_node_id_to_xcc_id(dev->adev, node_id);
 
 		/* mmhub */
 		if (vmid_type && client_id == SOC15_IH_CLIENTID_VMC)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index ea6a8e43bd5b..b5f2f5b1069c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -251,8 +251,9 @@ void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev,
 void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid)
 {
 	struct amdgpu_task_info *task_info;
+	uint32_t xcp_id = dev->xcp ? dev->xcp->id : 0;
 
-	task_info = amdgpu_vm_get_task_info_pasid(dev->adev, pasid);
+	task_info = amdgpu_vm_get_task_info_pasid(dev->adev, pasid, xcp_id);
 	if (task_info) {
 		/* Report VM faults from user applications, not retry from kernel */
 		if (task_info->pid)
-- 
2.34.1




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux