To improve queue preemption performance, allocate the context save/restore (ctx s/r) area in VRAM instead of system memory, and migrate it back to system memory when VRAM is full. Signed-off-by: Eric Huang <jinhuieric.huang@xxxxxxx> Change-Id: If775782027188dbe84b6868260e429373675434c --- include/hsakmttypes.h | 1 + src/queues.c | 103 ++++++++++++++++++++++++++++++++++++------ 2 files changed, 90 insertions(+), 14 deletions(-) diff --git a/include/hsakmttypes.h b/include/hsakmttypes.h index 9063f85..2c1c7cc 100644 --- a/include/hsakmttypes.h +++ b/include/hsakmttypes.h @@ -1329,6 +1329,7 @@ typedef enum _HSA_SVM_FLAGS { HSA_SVM_FLAG_GPU_RO = 0x00000008, // GPUs only read, allows replication HSA_SVM_FLAG_GPU_EXEC = 0x00000010, // Allow execution on GPU HSA_SVM_FLAG_GPU_READ_MOSTLY = 0x00000020, // GPUs mostly read, may allow similar optimizations as RO, but writes fault + HSA_SVM_FLAG_GPU_ALWAYS_MAPPED = 0x00000040, // Keep GPU memory mapping always valid as if XNACK is disabled } HSA_SVM_FLAGS; typedef enum _HSA_SVM_ATTR_TYPE { diff --git a/src/queues.c b/src/queues.c index c83dd93..d5109f9 100644 --- a/src/queues.c +++ b/src/queues.c @@ -68,6 +68,7 @@ struct queue { uint32_t eop_buffer_size; uint32_t gfxv; bool use_ats; + bool unified_ctx_save_restore; /* This queue structure is allocated from GPU with page aligned size * but only small bytes are used. We use the extra space in the end for * cu_mask bits array. 
@@ -383,13 +384,47 @@ static void free_exec_aligned_memory(void *addr, uint32_t size, uint32_t align, munmap(addr, size); } +static HSAKMT_STATUS register_svm_range(void *mem, uint32_t size, + uint32_t gpuNode, uint32_t prefetchNode, + uint32_t preferredNode, bool alwaysMapped) +{ + HSA_SVM_ATTRIBUTE *attrs; + HSAuint64 s_attr; + HSAuint32 nattr; + HSAuint32 flags; + + flags = HSA_SVM_FLAG_HOST_ACCESS; + + if (alwaysMapped) + flags |= HSA_SVM_FLAG_GPU_ALWAYS_MAPPED; + + nattr = 5; + s_attr = sizeof(*attrs) * nattr; + attrs = (HSA_SVM_ATTRIBUTE *)alloca(s_attr); + + attrs[0].type = HSA_SVM_ATTR_PREFETCH_LOC; + attrs[0].value = prefetchNode; + attrs[1].type = HSA_SVM_ATTR_PREFERRED_LOC; + attrs[1].value = preferredNode; + attrs[2].type = HSA_SVM_ATTR_CLR_FLAGS; + attrs[2].value = ~flags; + attrs[3].type = HSA_SVM_ATTR_SET_FLAGS; + attrs[3].value = flags; + attrs[4].type = HSA_SVM_ATTR_ACCESS; + attrs[4].value = gpuNode; + + return hsaKmtSVMSetAttr(mem, size, nattr, attrs); +} + static void free_queue(struct queue *q) { if (q->eop_buffer) free_exec_aligned_memory(q->eop_buffer, q->eop_buffer_size, PAGE_SIZE, q->use_ats); - if (q->ctx_save_restore) + if (q->unified_ctx_save_restore) + free(q->ctx_save_restore); + else if (q->ctx_save_restore) free_exec_aligned_memory(q->ctx_save_restore, q->ctx_save_restore_size, PAGE_SIZE, q->use_ats); @@ -425,6 +460,8 @@ static int handle_concrete_asic(struct queue *q, if (ret) { uint32_t total_mem_alloc_size = 0; HsaUserContextSaveAreaHeader *header; + HsaNodeProperties node; + bool svm_api; args->ctx_save_restore_size = q->ctx_save_restore_size; args->ctl_stack_size = q->ctl_stack_size; @@ -434,22 +471,60 @@ static int handle_concrete_asic(struct queue *q, */ total_mem_alloc_size = q->ctx_save_restore_size + q->debug_memory_size; - q->ctx_save_restore = - allocate_exec_aligned_memory(total_mem_alloc_size, - q->use_ats, NodeId, false, false); - if (!q->ctx_save_restore) - return HSAKMT_STATUS_NO_MEMORY; + if 
(hsaKmtGetNodeProperties(NodeId, &node)) + svm_api = false; + else + svm_api = node.Capability.ui32.SVMAPISupported; - args->ctx_save_restore_address = (uintptr_t)q->ctx_save_restore; + /* Allocate unified memory for context save restore + * area on dGPU. + */ + if (!q->use_ats && svm_api) { + uint32_t size = PAGE_ALIGN_UP(total_mem_alloc_size); + void *addr; + HSAKMT_STATUS r = HSAKMT_STATUS_ERROR; + + if (posix_memalign(&addr, GPU_HUGE_PAGE_SIZE, size)) + pr_err("[%s] posix_memalign failed:\n", __func__); + else { + header = (HsaUserContextSaveAreaHeader *)addr; + header->ErrorEventId = 0; + if (Event) + header->ErrorEventId = Event->EventId; + header->ErrorReason = ErrPayload; + header->DebugOffset = q->ctx_save_restore_size; + header->DebugSize = q->debug_memory_size; + + r = register_svm_range(addr, size, + NodeId, NodeId, 0, true); + + if (r == HSAKMT_STATUS_SUCCESS) { + q->ctx_save_restore = addr; + q->unified_ctx_save_restore = true; + } else + free(addr); + } + } - header = (HsaUserContextSaveAreaHeader *)q->ctx_save_restore; - header->ErrorEventId = 0; - if (Event) - header->ErrorEventId = Event->EventId; - header->ErrorReason = ErrPayload; - header->DebugOffset = q->ctx_save_restore_size; - header->DebugSize = q->debug_memory_size; + if (!q->unified_ctx_save_restore) { + q->ctx_save_restore = allocate_exec_aligned_memory( + total_mem_alloc_size, + q->use_ats, NodeId, false, false); + + if (!q->ctx_save_restore) + return HSAKMT_STATUS_NO_MEMORY; + + header = (HsaUserContextSaveAreaHeader *)q->ctx_save_restore; + header->ErrorEventId = 0; + if (Event) + header->ErrorEventId = Event->EventId; + header->ErrorReason = ErrPayload; + header->DebugOffset = q->ctx_save_restore_size; + header->DebugSize = q->debug_memory_size; + } + + args->ctx_save_restore_address = (uintptr_t)q->ctx_save_restore; } return HSAKMT_STATUS_SUCCESS; -- 2.25.1