To improve performance on queue preemption, allocate the context
save/restore (ctx s/r) area in VRAM instead of system memory, and
migrate it back to system memory when VRAM is full.
Signed-off-by: Eric Huang <jinhuieric.huang@xxxxxxx>
Change-Id: If775782027188dbe84b6868260e429373675434c
---
include/hsakmttypes.h | 1 +
src/queues.c | 103 ++++++++++++++++++++++++++++++++++++------
2 files changed, 90 insertions(+), 14 deletions(-)
diff --git a/include/hsakmttypes.h b/include/hsakmttypes.h
index 9063f85..2c1c7cc 100644
--- a/include/hsakmttypes.h
+++ b/include/hsakmttypes.h
@@ -1329,6 +1329,7 @@ typedef enum _HSA_SVM_FLAGS {
HSA_SVM_FLAG_GPU_RO = 0x00000008, // GPUs only read, allows replication
HSA_SVM_FLAG_GPU_EXEC = 0x00000010, // Allow execution on GPU
HSA_SVM_FLAG_GPU_READ_MOSTLY = 0x00000020, // GPUs mostly read, may allow similar optimizations as RO, but writes fault
+ HSA_SVM_FLAG_GPU_ALWAYS_MAPPED = 0x00000040, // Keep GPU memory mapping always valid as if XNACK is disabled
} HSA_SVM_FLAGS;
typedef enum _HSA_SVM_ATTR_TYPE {
diff --git a/src/queues.c b/src/queues.c
index c83dd93..d5109f9 100644
--- a/src/queues.c
+++ b/src/queues.c
@@ -68,6 +68,7 @@ struct queue {
uint32_t eop_buffer_size;
uint32_t gfxv;
bool use_ats;
+ bool unified_ctx_save_restore;
/* This queue structure is allocated from GPU with page aligned size
* but only small bytes are used. We use the extra space in the end for
* cu_mask bits array.
@@ -383,13 +384,47 @@ static void free_exec_aligned_memory(void *addr, uint32_t size, uint32_t align,
munmap(addr, size);
}
+static HSAKMT_STATUS register_svm_range(void *mem, uint32_t size,
+ uint32_t gpuNode, uint32_t prefetchNode,
+ uint32_t preferredNode, bool alwaysMapped)
+{ /* Builds five SVM attributes for [mem, mem + size) and applies them with one hsaKmtSVMSetAttr() call. */
+ HSA_SVM_ATTRIBUTE *attrs;
+ HSAuint64 s_attr;
+ HSAuint32 nattr;
+ HSAuint32 flags;
+
+ flags = HSA_SVM_FLAG_HOST_ACCESS; /* always requested, regardless of alwaysMapped */
+
+ if (alwaysMapped)
+ flags |= HSA_SVM_FLAG_GPU_ALWAYS_MAPPED;
+
+ nattr = 5; /* must match the number of attrs[] entries filled in below */
+ s_attr = sizeof(*attrs) * nattr;
+ attrs = (HSA_SVM_ATTRIBUTE *)alloca(s_attr); /* stack allocation: released automatically on return */
+
+ attrs[0].type = HSA_SVM_ATTR_PREFETCH_LOC;
+ attrs[0].value = prefetchNode;
+ attrs[1].type = HSA_SVM_ATTR_PREFERRED_LOC;
+ attrs[1].value = preferredNode;
+ attrs[2].type = HSA_SVM_ATTR_CLR_FLAGS;
+ attrs[2].value = ~flags; /* clear every flag bit that is not being set below */
+ attrs[3].type = HSA_SVM_ATTR_SET_FLAGS;
+ attrs[3].value = flags;
+ attrs[4].type = HSA_SVM_ATTR_ACCESS;
+ attrs[4].value = gpuNode;
+
+ return hsaKmtSVMSetAttr(mem, size, nattr, attrs); /* status of the call is passed straight back to the caller */
+}
+
static void free_queue(struct queue *q)
{
if (q->eop_buffer)
free_exec_aligned_memory(q->eop_buffer,
q->eop_buffer_size,
PAGE_SIZE, q->use_ats);
- if (q->ctx_save_restore)
+ if (q->unified_ctx_save_restore)
+ free(q->ctx_save_restore);
+ else if (q->ctx_save_restore)
free_exec_aligned_memory(q->ctx_save_restore,
q->ctx_save_restore_size,
PAGE_SIZE, q->use_ats);
@@ -425,6 +460,8 @@ static int handle_concrete_asic(struct queue *q,
if (ret) {
uint32_t total_mem_alloc_size = 0;
HsaUserContextSaveAreaHeader *header;
+ HsaNodeProperties node;
+ bool svm_api;
args->ctx_save_restore_size = q->ctx_save_restore_size;
args->ctl_stack_size = q->ctl_stack_size;
@@ -434,22 +471,60 @@ static int handle_concrete_asic(struct queue *q,
*/
total_mem_alloc_size = q->ctx_save_restore_size +
q->debug_memory_size;
- q->ctx_save_restore =
- allocate_exec_aligned_memory(total_mem_alloc_size,
- q->use_ats, NodeId, false, false);
- if (!q->ctx_save_restore)
- return HSAKMT_STATUS_NO_MEMORY;
+ if (hsaKmtGetNodeProperties(NodeId, &node))
+ svm_api = false;
+ else
+ svm_api = node.Capability.ui32.SVMAPISupported;
- args->ctx_save_restore_address = (uintptr_t)q->ctx_save_restore;
+ /* Allocate unified memory for context save restore
+ * area on dGPU.
+ */
+ if (!q->use_ats && svm_api) {
+ uint32_t size = PAGE_ALIGN_UP(total_mem_alloc_size);
+ void *addr;
+ HSAKMT_STATUS r = HSAKMT_STATUS_ERROR;
+
+ if (posix_memalign(&addr, GPU_HUGE_PAGE_SIZE, size))
+ pr_err("[%s] posix_memalign failed:\n", __func__);
+ else {
+ header = (HsaUserContextSaveAreaHeader *)addr;
+ header->ErrorEventId = 0;
+ if (Event)
+ header->ErrorEventId = Event->EventId;
+ header->ErrorReason = ErrPayload;
+ header->DebugOffset = q->ctx_save_restore_size;
+ header->DebugSize = q->debug_memory_size;
+
+ r = register_svm_range(addr, size,
+ NodeId, NodeId, 0, true);
+
+ if (r == HSAKMT_STATUS_SUCCESS) {
+ q->ctx_save_restore = addr;
+ q->unified_ctx_save_restore = true;
+ } else
+ free(addr);
+ }
+ }
- header = (HsaUserContextSaveAreaHeader *)q->ctx_save_restore;
- header->ErrorEventId = 0;
- if (Event)
- header->ErrorEventId = Event->EventId;
- header->ErrorReason = ErrPayload;
- header->DebugOffset = q->ctx_save_restore_size;
- header->DebugSize = q->debug_memory_size;
+ if (!q->unified_ctx_save_restore) {
+ q->ctx_save_restore = allocate_exec_aligned_memory(
+ total_mem_alloc_size,
+ q->use_ats, NodeId, false, false);
+
+ if (!q->ctx_save_restore)
+ return HSAKMT_STATUS_NO_MEMORY;
+
+ header = (HsaUserContextSaveAreaHeader *)q->ctx_save_restore;
+ header->ErrorEventId = 0;
+ if (Event)
+ header->ErrorEventId = Event->EventId;
+ header->ErrorReason = ErrPayload;
+ header->DebugOffset = q->ctx_save_restore_size;
+ header->DebugSize = q->debug_memory_size;
+ }