To improve performance on queue preemption, allocate ctx s/r area in
VRAM instead of system memory, and migrate it back to system memory
when VRAM is full.

Signed-off-by: Eric Huang <jinhuieric.huang@xxxxxxx>
Change-Id: If775782027188dbe84b6868260e429373675434c
---
 include/hsakmttypes.h |   1 +
 src/queues.c          | 109 ++++++++++++++++++++++++++++++++++++------
 2 files changed, 96 insertions(+), 14 deletions(-)

diff --git a/include/hsakmttypes.h b/include/hsakmttypes.h
index 9063f85..2c1c7cc 100644
--- a/include/hsakmttypes.h
+++ b/include/hsakmttypes.h
@@ -1329,6 +1329,7 @@ typedef enum _HSA_SVM_FLAGS {
 	HSA_SVM_FLAG_GPU_RO      = 0x00000008, // GPUs only read, allows replication
 	HSA_SVM_FLAG_GPU_EXEC    = 0x00000010, // Allow execution on GPU
 	HSA_SVM_FLAG_GPU_READ_MOSTLY = 0x00000020, // GPUs mostly read, may allow similar optimizations as RO, but writes fault
+	HSA_SVM_FLAG_GPU_ALWAYS_MAPPED = 0x00000040, // Keep GPU memory mapping always valid as if XNACK is disable
 } HSA_SVM_FLAGS;
 
 typedef enum _HSA_SVM_ATTR_TYPE {
diff --git a/src/queues.c b/src/queues.c
index c83dd93..e65103d 100644
--- a/src/queues.c
+++ b/src/queues.c
@@ -68,6 +68,7 @@ struct queue {
 	uint32_t eop_buffer_size;
 	uint32_t gfxv;
 	bool use_ats;
+	bool unified_ctx_save_restore;
 	/* This queue structure is allocated from GPU with page aligned size
 	 * but only small bytes are used. We use the extra space in the end for
 	 * cu_mask bits array.
@@ -383,13 +384,50 @@ static void free_exec_aligned_memory(void *addr, uint32_t size, uint32_t align,
 		munmap(addr, size);
 }
 
+static HSAKMT_STATUS register_exec_svm_range(void *mem, uint32_t size,
+					     uint32_t gpuNode, uint32_t prefetchNode,
+					     uint32_t preferredNode, bool alwaysMapped)
+{
+	HSA_SVM_ATTRIBUTE *attrs;
+	HSAuint64 s_attr;
+	HSAuint32 nattr;
+	HSAuint32 flags;
+
+	flags = HSA_SVM_FLAG_HOST_ACCESS |
+		HSA_SVM_FLAG_GPU_EXEC;
+
+	if (alwaysMapped)
+		flags |= HSA_SVM_FLAG_GPU_ALWAYS_MAPPED;
+
+	nattr = 5;
+	s_attr = sizeof(*attrs) * nattr;
+	attrs = (HSA_SVM_ATTRIBUTE *)alloca(s_attr);
+
+	attrs[0].type = HSA_SVM_ATTR_PREFETCH_LOC;
+	attrs[0].value = prefetchNode;
+	attrs[1].type = HSA_SVM_ATTR_PREFERRED_LOC;
+	attrs[1].value = preferredNode;
+	attrs[2].type = HSA_SVM_ATTR_CLR_FLAGS;
+	attrs[2].value = flags;
+	attrs[3].type = HSA_SVM_ATTR_SET_FLAGS;
+	attrs[3].value = flags;
+	attrs[4].type = HSA_SVM_ATTR_ACCESS;
+	attrs[4].value = gpuNode;
+
+	return hsaKmtSVMSetAttr(mem, size, nattr, attrs);
+}
+
 static void free_queue(struct queue *q)
 {
 	if (q->eop_buffer)
 		free_exec_aligned_memory(q->eop_buffer, q->eop_buffer_size,
 					 PAGE_SIZE, q->use_ats);
-	if (q->ctx_save_restore)
+	if (q->unified_ctx_save_restore)
+		munmap(q->ctx_save_restore,
+		       ALIGN_UP(q->ctx_save_restore_size + q->debug_memory_size,
+				PAGE_SIZE));
+	else if (q->ctx_save_restore)
 		free_exec_aligned_memory(q->ctx_save_restore,
 					 q->ctx_save_restore_size,
 					 PAGE_SIZE, q->use_ats);
 
@@ -425,6 +463,8 @@ static int handle_concrete_asic(struct queue *q,
 	if (ret) {
 		uint32_t total_mem_alloc_size = 0;
 		HsaUserContextSaveAreaHeader *header;
+		HsaNodeProperties node;
+		bool svm_api;
 
 		args->ctx_save_restore_size = q->ctx_save_restore_size;
 		args->ctl_stack_size = q->ctl_stack_size;
@@ -434,22 +474,63 @@ static int handle_concrete_asic(struct queue *q,
 		 */
 		total_mem_alloc_size = q->ctx_save_restore_size +
 				       q->debug_memory_size;
-		q->ctx_save_restore =
-			allocate_exec_aligned_memory(total_mem_alloc_size,
-						     q->use_ats, NodeId, false, false);
-		if (!q->ctx_save_restore)
-			return HSAKMT_STATUS_NO_MEMORY;
+		if (hsaKmtGetNodeProperties(NodeId, &node))
+			svm_api = false;
+		else
+			svm_api = node.Capability.ui32.SVMAPISupported;
 
-		args->ctx_save_restore_address = (uintptr_t)q->ctx_save_restore;
+		/* Allocate unified memory for context save restore
+		 * area on dGPU.
+		 */
+		if (!q->use_ats && svm_api) {
+			uint32_t size = ALIGN_UP(total_mem_alloc_size, PAGE_SIZE);
+			void *addr;
+			HSAKMT_STATUS r = HSAKMT_STATUS_ERROR;
+
+			addr = mmap(0, size, PROT_READ | PROT_WRITE,
+				    MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+
+			if (addr == MAP_FAILED)
+				pr_err("[%s] mmap failed: %s\n", __func__, strerror(errno));
+			else {
+				header = (HsaUserContextSaveAreaHeader *)addr;
+				header->ErrorEventId = 0;
+				if (Event)
+					header->ErrorEventId = Event->EventId;
+				header->ErrorReason = ErrPayload;
+				header->DebugOffset = q->ctx_save_restore_size;
+				header->DebugSize = q->debug_memory_size;
+
+				r = register_exec_svm_range(addr, size,
+						NodeId, NodeId, 0, true);
+
+				if (r == HSAKMT_STATUS_SUCCESS) {
+					q->ctx_save_restore = addr;
+					q->unified_ctx_save_restore = true;
+				} else
+					munmap(addr, size);
+			}
+		}
 
-		header = (HsaUserContextSaveAreaHeader *)q->ctx_save_restore;
-		header->ErrorEventId = 0;
-		if (Event)
-			header->ErrorEventId = Event->EventId;
-		header->ErrorReason = ErrPayload;
-		header->DebugOffset = q->ctx_save_restore_size;
-		header->DebugSize = q->debug_memory_size;
+		if (!q->unified_ctx_save_restore) {
+			q->ctx_save_restore = allocate_exec_aligned_memory(
+					total_mem_alloc_size,
+					q->use_ats, NodeId, false, false);
+
+			if (!q->ctx_save_restore)
+				return HSAKMT_STATUS_NO_MEMORY;
+
+			header = (HsaUserContextSaveAreaHeader *)q->ctx_save_restore;
+			header->ErrorEventId = 0;
+			if (Event)
+				header->ErrorEventId = Event->EventId;
+			header->ErrorReason = ErrPayload;
+			header->DebugOffset = q->ctx_save_restore_size;
+			header->DebugSize = q->debug_memory_size;
+		}
+
+		args->ctx_save_restore_address = (uintptr_t)q->ctx_save_restore;
 	}
 
 	return HSAKMT_STATUS_SUCCESS;
-- 
2.25.1
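
For reviewers, below is a minimal standalone sketch (illustrative only, not part of the patch) of how an application could register one of its own SVM ranges the same way register_exec_svm_range() above does: prefetch into VRAM on a GPU node, keep the GPU mapping always valid via the new HSA_SVM_FLAG_GPU_ALWAYS_MAPPED flag, and leave the preferred location at system memory so the range can migrate back when VRAM is full. It assumes libhsakmt built with this change and an SVM-API-capable node; the node ID and buffer size are placeholders, and real code should query hsaKmtGetNodeProperties() and check Capability.ui32.SVMAPISupported first.

/* Sketch: pin an anonymous mapping into VRAM with an always-valid GPU
 * mapping, mirroring register_exec_svm_range() in this patch.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#include "hsakmt.h"

int main(void)
{
	HSAuint32 node = 1;		/* placeholder GPU node ID */
	HSAuint64 size = 1 << 20;	/* placeholder size, multiple of page size */
	HSA_SVM_ATTRIBUTE attrs[5];
	HSAuint32 flags = HSA_SVM_FLAG_HOST_ACCESS |
			  HSA_SVM_FLAG_GPU_EXEC |
			  HSA_SVM_FLAG_GPU_ALWAYS_MAPPED;
	void *addr;

	if (hsaKmtOpenKFD() != HSAKMT_STATUS_SUCCESS)
		return 1;

	addr = mmap(NULL, size, PROT_READ | PROT_WRITE,
		    MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (addr == MAP_FAILED) {
		fprintf(stderr, "mmap failed: %s\n", strerror(errno));
		hsaKmtCloseKFD();
		return 1;
	}

	/* Same attribute pattern as the helper: prefetch into VRAM on the
	 * target node, prefer system memory (0) so KFD can migrate the range
	 * back to RAM under VRAM pressure, clear then set the flag mask, and
	 * grant the node access.
	 */
	attrs[0].type  = HSA_SVM_ATTR_PREFETCH_LOC;
	attrs[0].value = node;
	attrs[1].type  = HSA_SVM_ATTR_PREFERRED_LOC;
	attrs[1].value = 0;
	attrs[2].type  = HSA_SVM_ATTR_CLR_FLAGS;
	attrs[2].value = flags;
	attrs[3].type  = HSA_SVM_ATTR_SET_FLAGS;
	attrs[3].value = flags;
	attrs[4].type  = HSA_SVM_ATTR_ACCESS;
	attrs[4].value = node;

	if (hsaKmtSVMSetAttr(addr, size, 5, attrs) != HSAKMT_STATUS_SUCCESS)
		fprintf(stderr, "hsaKmtSVMSetAttr failed\n");

	munmap(addr, size);
	hsaKmtCloseKFD();
	return 0;
}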