These flags (for GEM and SVM allocations) allocate memory that allows for system-scope atomic semantics. On GFX943 these flags cause caches to be avoided on non-local memory. On all other ASICs they are identical in functionality to the equivalent COHERENT flags. Corresponding Thunk patch is at https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/pull/88 v3: changed name of flag v4: added checks for invalid flag combinations Reviewed-by: David Yat Sin <David.YatSin@xxxxxxx>w Signed-off-by: David Francis <David.Francis@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 7 +++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c | 1 + drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 1 + drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 1 + drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 5 ++++- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 10 +++++++++- include/uapi/drm/amdgpu_drm.h | 10 +++++++++- include/uapi/linux/kfd_ioctl.h | 3 +++ 8 files changed, 35 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index b5b940485059..ec9e9e95e5f4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1738,9 +1738,16 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( if (flags & KFD_IOC_ALLOC_MEM_FLAGS_COHERENT) alloc_flags |= AMDGPU_GEM_CREATE_COHERENT; + if (flags & KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT) + alloc_flags |= AMDGPU_GEM_CREATE_EXT_COHERENT; if (flags & KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED) alloc_flags |= AMDGPU_GEM_CREATE_UNCACHED; + if (((flags & KFD_IOC_ALLOC_MEM_FLAGS_COHERENT) && (flags & KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT)) || + ((flags & KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED) && (flags & KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT)) || + ((flags & KFD_IOC_ALLOC_MEM_FLAGS_COHERENT) && (flags & KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED))) + return -EINVAL; + *mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL); if (!*mem) { ret = -ENOMEM; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c index 12210598e5b8..76b618735dc0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c @@ -331,6 +331,7 @@ amdgpu_dma_buf_create_obj(struct drm_device *dev, struct dma_buf *dma_buf) flags |= other->flags & (AMDGPU_GEM_CREATE_CPU_GTT_USWC | AMDGPU_GEM_CREATE_COHERENT | + AMDGPU_GEM_CREATE_EXT_COHERENT | AMDGPU_GEM_CREATE_UNCACHED); } diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index fa87a85e1017..c45fde9f1887 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -634,6 +634,7 @@ static void gmc_v10_0_get_vm_pte(struct amdgpu_device *adev, } if (bo && bo->flags & (AMDGPU_GEM_CREATE_COHERENT | + AMDGPU_GEM_CREATE_EXT_COHERENT | AMDGPU_GEM_CREATE_UNCACHED)) *flags = (*flags & ~AMDGPU_PTE_MTYPE_NV10_MASK) | AMDGPU_PTE_MTYPE_NV10(MTYPE_UC); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c index 671e288c7575..dd135d8b25f7 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c @@ -543,6 +543,7 @@ static void gmc_v11_0_get_vm_pte(struct amdgpu_device *adev, } if (bo && bo->flags & (AMDGPU_GEM_CREATE_COHERENT | + AMDGPU_GEM_CREATE_EXT_COHERENT | AMDGPU_GEM_CREATE_UNCACHED)) *flags = (*flags & ~AMDGPU_PTE_MTYPE_NV10_MASK) | AMDGPU_PTE_MTYPE_NV10(MTYPE_UC); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 3d13d0bba7b1..e63d9c39fc35 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -1185,7 +1185,8 @@ static void gmc_v9_0_get_coherence_flags(struct amdgpu_device *adev, { struct amdgpu_device *bo_adev = amdgpu_ttm_adev(bo->tbo.bdev); bool is_vram = bo->tbo.resource->mem_type == TTM_PL_VRAM; - bool coherent = bo->flags & AMDGPU_GEM_CREATE_COHERENT; + bool coherent = bo->flags & (AMDGPU_GEM_CREATE_COHERENT | AMDGPU_GEM_CREATE_EXT_COHERENT); + bool ext_coherent = bo->flags & AMDGPU_GEM_CREATE_EXT_COHERENT; bool uncached = bo->flags & AMDGPU_GEM_CREATE_UNCACHED; struct amdgpu_vm *vm = mapping->bo_va->base.vm; unsigned int mtype_local, mtype; @@ -1253,6 +1254,8 @@ static void gmc_v9_0_get_coherence_flags(struct amdgpu_device *adev, snoop = true; if (uncached) { mtype = MTYPE_UC; + } else if (ext_coherent) { + mtype = is_local ? MTYPE_CC : MTYPE_UC; } else if (adev->flags & AMD_IS_APU) { mtype = is_local ? mtype_local : MTYPE_NC; } else { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 841ba6102bbb..25de243ef0a8 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -1187,7 +1187,8 @@ svm_range_get_pte_flags(struct kfd_node *node, uint32_t mapping_flags = 0; uint64_t pte_flags; bool snoop = (domain != SVM_RANGE_VRAM_DOMAIN); - bool coherent = flags & KFD_IOCTL_SVM_FLAG_COHERENT; + bool coherent = flags & (KFD_IOCTL_SVM_FLAG_COHERENT | KFD_IOCTL_SVM_FLAG_EXT_COHERENT); + bool ext_coherent = flags & KFD_IOCTL_SVM_FLAG_EXT_COHERENT; bool uncached = false; /*flags & KFD_IOCTL_SVM_FLAG_UNCACHED;*/ unsigned int mtype_local; @@ -1235,6 +1236,13 @@ svm_range_get_pte_flags(struct kfd_node *node, snoop = true; if (uncached) { mapping_flags |= AMDGPU_VM_MTYPE_UC; + } else if (ext_coherent) { + /* local HBM region close to partition */ + if (bo_node->adev == node->adev && + (!bo_node->xcp || !node->xcp || bo_node->xcp->mem_id == node->xcp->mem_id)) + mapping_flags |= AMDGPU_VM_MTYPE_CC; + else + mapping_flags |= AMDGPU_VM_MTYPE_UC; } else if (domain == SVM_RANGE_VRAM_DOMAIN) { /* local HBM region close to partition */ if (bo_node->adev == node->adev && diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index ec437120b083..984fc16577ca 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -150,7 +150,7 @@ extern "C" { */ #define AMDGPU_GEM_CREATE_DISCARDABLE (1 << 12) /* Flag that BO is shared coherently between multiple devices or CPU threads. - * May depend on GPU instructions to flush caches explicitly + * May depend on GPU instructions to flush caches to system scope explicitly. * * This influences the choice of MTYPE in the PTEs on GFXv9 and later GPUs and * may override the MTYPE selected in AMDGPU_VA_OP_MAP. @@ -163,6 +163,14 @@ extern "C" { * may override the MTYPE selected in AMDGPU_VA_OP_MAP. */ #define AMDGPU_GEM_CREATE_UNCACHED (1 << 14) +/* Flag that BO should be coherent across devices when using device-level + * atomics. May depend on GPU instructions to flush caches to device scope + * explicitly, promoting them to system scope automatically. + * + * This influences the choice of MTYPE in the PTEs on GFXv9 and later GPUs and + * may override the MTYPE selected in AMDGPU_VA_OP_MAP. + */ +#define AMDGPU_GEM_CREATE_EXT_COHERENT (1 << 15) struct drm_amdgpu_gem_create_in { /** the requested memory size */ diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index eeb2fdcbdcb7..f0ed68974c54 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -405,6 +405,7 @@ struct kfd_ioctl_acquire_vm_args { #define KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM (1 << 27) #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT (1 << 26) #define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED (1 << 25) +#define KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT (1 << 24) /* Allocate memory for later SVM (shared virtual memory) mapping. * @@ -659,6 +660,8 @@ enum kfd_mmio_remap { #define KFD_IOCTL_SVM_FLAG_GPU_READ_MOSTLY 0x00000020 /* Keep GPU memory mapping always valid as if XNACK is disable */ #define KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED 0x00000040 +/* Fine grained coherency between all devices using device-scope atomics */ +#define KFD_IOCTL_SVM_FLAG_EXT_COHERENT 0x00000080 /** * kfd_ioctl_svm_op - SVM ioctl operations -- 2.34.1