On Wed, Jul 26, 2023 at 9:21 AM David Francis <David.Francis@xxxxxxx> wrote: > > These flags (for GEM and SVM allocations) allocate > memory that allows for system-scope atomic semantics. > > On GFX943 these flags cause caches to be avoided on > non-local memory. > > On all other ASICs they are identical in functionality to the > equivalent COHERENT flags. Please include a link to the proposed userspace which uses these new flags. Alex > > Signed-off-by: David Francis <David.Francis@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 2 ++ > drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 1 + > drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 1 + > drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 5 ++++- > drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 10 +++++++++- > include/uapi/drm/amdgpu_drm.h | 7 +++++++ > include/uapi/linux/kfd_ioctl.h | 3 +++ > 7 files changed, 27 insertions(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > index d34c3ef8f3ed..7f23bc0ee592 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > @@ -1738,6 +1738,8 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( > > if (flags & KFD_IOC_ALLOC_MEM_FLAGS_COHERENT) > alloc_flags |= AMDGPU_GEM_CREATE_COHERENT; > + if (flags & KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENCE) > + alloc_flags |= AMDGPU_GEM_CREATE_EXT_COHERENCE; > if (flags & KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED) > alloc_flags |= AMDGPU_GEM_CREATE_UNCACHED; > > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c > index 6b430e10d38e..8e951688668b 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c > @@ -632,6 +632,7 @@ static void gmc_v10_0_get_vm_pte(struct amdgpu_device *adev, > } > > if (bo && bo->flags & (AMDGPU_GEM_CREATE_COHERENT | > + AMDGPU_GEM_CREATE_EXT_COHERENCE | > AMDGPU_GEM_CREATE_UNCACHED)) > *flags = (*flags & ~AMDGPU_PTE_MTYPE_NV10_MASK) | > AMDGPU_PTE_MTYPE_NV10(MTYPE_UC); > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c > index a6ee0220db56..ff330c7c0232 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c > @@ -540,6 +540,7 @@ static void gmc_v11_0_get_vm_pte(struct amdgpu_device *adev, > } > > if (bo && bo->flags & (AMDGPU_GEM_CREATE_COHERENT | > + AMDGPU_GEM_CREATE_EXT_COHERENCE | > AMDGPU_GEM_CREATE_UNCACHED)) > *flags = (*flags & ~AMDGPU_PTE_MTYPE_NV10_MASK) | > AMDGPU_PTE_MTYPE_NV10(MTYPE_UC); > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > index 880460cd3239..e40fcfc1a3f3 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > @@ -1183,7 +1183,8 @@ static void gmc_v9_0_get_coherence_flags(struct amdgpu_device *adev, > { > struct amdgpu_device *bo_adev = amdgpu_ttm_adev(bo->tbo.bdev); > bool is_vram = bo->tbo.resource->mem_type == TTM_PL_VRAM; > - bool coherent = bo->flags & AMDGPU_GEM_CREATE_COHERENT; > + bool coherent = bo->flags & (AMDGPU_GEM_CREATE_COHERENT | AMDGPU_GEM_CREATE_EXT_COHERENCE); > + bool ext_coherence = bo->flags & AMDGPU_GEM_CREATE_EXT_COHERENCE; > bool uncached = bo->flags & AMDGPU_GEM_CREATE_UNCACHED; > struct amdgpu_vm *vm = mapping->bo_va->base.vm; > unsigned int mtype_local, mtype; > @@ -1251,6 +1252,8 @@ static void gmc_v9_0_get_coherence_flags(struct amdgpu_device *adev, > snoop = true; > if (uncached) { > mtype = MTYPE_UC; > + } else if (ext_coherence) { > + mtype = is_local ? MTYPE_CC : MTYPE_UC; > } else if (adev->flags & AMD_IS_APU) { > mtype = is_local ? mtype_local : MTYPE_NC; > } else { > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c > index 1b50eae051a4..28304b93a990 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c > @@ -1155,7 +1155,8 @@ svm_range_get_pte_flags(struct kfd_node *node, > uint32_t mapping_flags = 0; > uint64_t pte_flags; > bool snoop = (domain != SVM_RANGE_VRAM_DOMAIN); > - bool coherent = flags & KFD_IOCTL_SVM_FLAG_COHERENT; > + bool coherent = flags & (KFD_IOCTL_SVM_FLAG_COHERENT | KFD_IOCTL_SVM_FLAG_EXT_COHERENCE); > + bool ext_coherence = flags & KFD_IOCTL_SVM_FLAG_EXT_COHERENCE; > bool uncached = false; /*flags & KFD_IOCTL_SVM_FLAG_UNCACHED;*/ > unsigned int mtype_local; > > @@ -1203,6 +1204,13 @@ svm_range_get_pte_flags(struct kfd_node *node, > snoop = true; > if (uncached) { > mapping_flags |= AMDGPU_VM_MTYPE_UC; > + } else if (ext_coherence) { > + /* local HBM region close to partition */ > + if (bo_node->adev == node->adev && > + (!bo_node->xcp || !node->xcp || bo_node->xcp->mem_id == node->xcp->mem_id)) > + mapping_flags |= AMDGPU_VM_MTYPE_CC; > + else > + mapping_flags |= AMDGPU_VM_MTYPE_UC; > } else if (domain == SVM_RANGE_VRAM_DOMAIN) { > /* local HBM region close to partition */ > if (bo_node->adev == node->adev && > diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h > index 79b14828d542..d67102dc214a 100644 > --- a/include/uapi/drm/amdgpu_drm.h > +++ b/include/uapi/drm/amdgpu_drm.h > @@ -158,6 +158,13 @@ extern "C" { > * may override the MTYPE selected in AMDGPU_VA_OP_MAP. > */ > #define AMDGPU_GEM_CREATE_UNCACHED (1 << 14) > +/* Flag that BO should be coherent across devices when using device-level > + * atomics. May depend on GPU instructions to flush caches explicitly. > + * > + * This influences the choice of MTYPE in the PTEs on GFXv9 and later GPUs and > + * may override the MTYPE selected in AMDGPU_VA_OP_MAP. > + */ > +#define AMDGPU_GEM_CREATE_EXT_COHERENCE (1 << 15) > > struct drm_amdgpu_gem_create_in { > /** the requested memory size */ > diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h > index eeb2fdcbdcb7..dd8f0dba7631 100644 > --- a/include/uapi/linux/kfd_ioctl.h > +++ b/include/uapi/linux/kfd_ioctl.h > @@ -405,6 +405,7 @@ struct kfd_ioctl_acquire_vm_args { > #define KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM (1 << 27) > #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT (1 << 26) > #define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED (1 << 25) > +#define KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENCE (1 << 24) > > /* Allocate memory for later SVM (shared virtual memory) mapping. > * > @@ -659,6 +660,8 @@ enum kfd_mmio_remap { > #define KFD_IOCTL_SVM_FLAG_GPU_READ_MOSTLY 0x00000020 > /* Keep GPU memory mapping always valid as if XNACK is disable */ > #define KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED 0x00000040 > +/* Fine grained coherency between all devices using device-scope atomics */ > +#define KFD_IOCTL_SVM_FLAG_EXT_COHERENCE 0x00000080 > > /** > * kfd_ioctl_svm_op - SVM ioctl operations > -- > 2.34.1 >