[AMD Official Use Only - General] Reviewed-by: David Yat Sin <David.YatSin@xxxxxxx> > -----Original Message----- > From: Kuehling, Felix <Felix.Kuehling@xxxxxxx> > Sent: Friday, July 28, 2023 4:00 PM > To: Francis, David <David.Francis@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx; > Yat Sin, David <David.YatSin@xxxxxxx> > Subject: Re: [PATCH v3] drm/amdgpu: Add EXT_COHERENT memory allocation > flags > > On 2023-07-28 15:39, David Francis wrote: > > These flags (for GEM and SVM allocations) allocate memory that allows > > for system-scope atomic semantics. > > > > On GFX943 these flags cause caches to be avoided on non-local memory. > > > > On all other ASICs they are identical in functionality to the > > equivalent COHERENT flags. > > > > Corresponding Thunk patch is at > > https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/pull/88 > > > > v3: changed name of flag > > > > Signed-off-by: David Francis <David.Francis@xxxxxxx> > > I made one comment on the user mode patch regarding the explicit handling > of invalid combinations of Uncached, Coherent, ExtCoherent flags. I'm not > sure what we agreed on any more. But I don't think we want to just leave it up > to chance. Other than that, this patch looks good to me. > > Regards, > Felix > > > > --- > > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 2 ++ > > drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c | 1 + > > drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 1 + > > drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 1 + > > drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 5 ++++- > > drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 10 +++++++++- > > include/uapi/drm/amdgpu_drm.h | 10 +++++++++- > > include/uapi/linux/kfd_ioctl.h | 3 +++ > > 8 files changed, 30 insertions(+), 3 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > > index d34c3ef8f3ed..a1ce261f2d06 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > > @@ -1738,6 +1738,8 @@ int > amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( > > > > if (flags & KFD_IOC_ALLOC_MEM_FLAGS_COHERENT) > > alloc_flags |= AMDGPU_GEM_CREATE_COHERENT; > > + if (flags & KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT) > > + alloc_flags |= AMDGPU_GEM_CREATE_EXT_COHERENT; > > if (flags & KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED) > > alloc_flags |= AMDGPU_GEM_CREATE_UNCACHED; > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c > > index 12210598e5b8..76b618735dc0 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c > > @@ -331,6 +331,7 @@ amdgpu_dma_buf_create_obj(struct drm_device > *dev, > > struct dma_buf *dma_buf) > > > > flags |= other->flags & > (AMDGPU_GEM_CREATE_CPU_GTT_USWC | > > AMDGPU_GEM_CREATE_COHERENT > | > > + > AMDGPU_GEM_CREATE_EXT_COHERENT | > > > AMDGPU_GEM_CREATE_UNCACHED); > > } > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c > > b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c > > index 6b430e10d38e..301ffe30824f 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c > > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c > > @@ -632,6 +632,7 @@ static void gmc_v10_0_get_vm_pte(struct > amdgpu_device *adev, > > } > > > > if (bo && bo->flags & (AMDGPU_GEM_CREATE_COHERENT | > > + AMDGPU_GEM_CREATE_EXT_COHERENT | > > AMDGPU_GEM_CREATE_UNCACHED)) > > *flags = (*flags & ~AMDGPU_PTE_MTYPE_NV10_MASK) | > > AMDGPU_PTE_MTYPE_NV10(MTYPE_UC); diff --git > > a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c > > b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c > > index a6ee0220db56..846894e212e7 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c > > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c > > @@ -540,6 +540,7 @@ static void gmc_v11_0_get_vm_pte(struct > amdgpu_device *adev, > > } > > > > if (bo && bo->flags & (AMDGPU_GEM_CREATE_COHERENT | > > + AMDGPU_GEM_CREATE_EXT_COHERENT | > > AMDGPU_GEM_CREATE_UNCACHED)) > > *flags = (*flags & ~AMDGPU_PTE_MTYPE_NV10_MASK) | > > AMDGPU_PTE_MTYPE_NV10(MTYPE_UC); diff --git > > a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > > b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > > index 880460cd3239..92a623e130d9 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > > @@ -1183,7 +1183,8 @@ static void gmc_v9_0_get_coherence_flags(struct > amdgpu_device *adev, > > { > > struct amdgpu_device *bo_adev = amdgpu_ttm_adev(bo->tbo.bdev); > > bool is_vram = bo->tbo.resource->mem_type == TTM_PL_VRAM; > > - bool coherent = bo->flags & AMDGPU_GEM_CREATE_COHERENT; > > + bool coherent = bo->flags & (AMDGPU_GEM_CREATE_COHERENT | > AMDGPU_GEM_CREATE_EXT_COHERENT); > > + bool ext_coherent = bo->flags & > AMDGPU_GEM_CREATE_EXT_COHERENT; > > bool uncached = bo->flags & AMDGPU_GEM_CREATE_UNCACHED; > > struct amdgpu_vm *vm = mapping->bo_va->base.vm; > > unsigned int mtype_local, mtype; > > @@ -1251,6 +1252,8 @@ static void gmc_v9_0_get_coherence_flags(struct > amdgpu_device *adev, > > snoop = true; > > if (uncached) { > > mtype = MTYPE_UC; > > + } else if (ext_coherent) { > > + mtype = is_local ? MTYPE_CC : MTYPE_UC; > > } else if (adev->flags & AMD_IS_APU) { > > mtype = is_local ? mtype_local : MTYPE_NC; > > } else { > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c > > b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c > > index 1b50eae051a4..e9ffcfc5dedc 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c > > @@ -1155,7 +1155,8 @@ svm_range_get_pte_flags(struct kfd_node > *node, > > uint32_t mapping_flags = 0; > > uint64_t pte_flags; > > bool snoop = (domain != SVM_RANGE_VRAM_DOMAIN); > > - bool coherent = flags & KFD_IOCTL_SVM_FLAG_COHERENT; > > + bool coherent = flags & (KFD_IOCTL_SVM_FLAG_COHERENT | > KFD_IOCTL_SVM_FLAG_EXT_COHERENT); > > + bool ext_coherent = flags & KFD_IOCTL_SVM_FLAG_EXT_COHERENT; > > bool uncached = false; /*flags & > KFD_IOCTL_SVM_FLAG_UNCACHED;*/ > > unsigned int mtype_local; > > > > @@ -1203,6 +1204,13 @@ svm_range_get_pte_flags(struct kfd_node > *node, > > snoop = true; > > if (uncached) { > > mapping_flags |= AMDGPU_VM_MTYPE_UC; > > + } else if (ext_coherent) { > > + /* local HBM region close to partition */ > > + if (bo_node->adev == node->adev && > > + (!bo_node->xcp || !node->xcp || bo_node->xcp- > >mem_id == node->xcp->mem_id)) > > + mapping_flags |= AMDGPU_VM_MTYPE_CC; > > + else > > + mapping_flags |= AMDGPU_VM_MTYPE_UC; > > } else if (domain == SVM_RANGE_VRAM_DOMAIN) { > > /* local HBM region close to partition */ > > if (bo_node->adev == node->adev && diff --git > > a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h > index > > 79b14828d542..629860dbc9ec 100644 > > --- a/include/uapi/drm/amdgpu_drm.h > > +++ b/include/uapi/drm/amdgpu_drm.h > > @@ -145,7 +145,7 @@ extern "C" { > > */ > > #define AMDGPU_GEM_CREATE_DISCARDABLE (1 << 12) > > /* Flag that BO is shared coherently between multiple devices or CPU > threads. > > - * May depend on GPU instructions to flush caches explicitly > > + * May depend on GPU instructions to flush caches to system scope > explicitly. > > * > > * This influences the choice of MTYPE in the PTEs on GFXv9 and later GPUs > and > > * may override the MTYPE selected in AMDGPU_VA_OP_MAP. > > @@ -158,6 +158,14 @@ extern "C" { > > * may override the MTYPE selected in AMDGPU_VA_OP_MAP. > > */ > > #define AMDGPU_GEM_CREATE_UNCACHED (1 << 14) > > +/* Flag that BO should be coherent across devices when using > > +device-level > > + * atomics. May depend on GPU instructions to flush caches to device > > +scope > > + * explicitly, promoting them to system scope automatically. > > + * > > + * This influences the choice of MTYPE in the PTEs on GFXv9 and later > > +GPUs and > > + * may override the MTYPE selected in AMDGPU_VA_OP_MAP. > > + */ > > +#define AMDGPU_GEM_CREATE_EXT_COHERENT (1 << 15) > > > > struct drm_amdgpu_gem_create_in { > > /** the requested memory size */ > > diff --git a/include/uapi/linux/kfd_ioctl.h > > b/include/uapi/linux/kfd_ioctl.h index eeb2fdcbdcb7..f0ed68974c54 > > 100644 > > --- a/include/uapi/linux/kfd_ioctl.h > > +++ b/include/uapi/linux/kfd_ioctl.h > > @@ -405,6 +405,7 @@ struct kfd_ioctl_acquire_vm_args { > > #define KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM (1 << 27) > > #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT (1 << 26) > > #define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED (1 << 25) > > +#define KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT (1 << 24) > > > > /* Allocate memory for later SVM (shared virtual memory) mapping. > > * > > @@ -659,6 +660,8 @@ enum kfd_mmio_remap { > > #define KFD_IOCTL_SVM_FLAG_GPU_READ_MOSTLY 0x00000020 > > /* Keep GPU memory mapping always valid as if XNACK is disable */ > > #define KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED 0x00000040 > > +/* Fine grained coherency between all devices using device-scope atomics > */ > > +#define KFD_IOCTL_SVM_FLAG_EXT_COHERENT 0x00000080 > > > > /** > > * kfd_ioctl_svm_op - SVM ioctl operations