On gfx943 APU, EXT_COHERENT should give MTYPE_CC for local and MTYPE_UC for nonlocal memory. On NUMA systems, local memory gets the local mtype, set by an override callback. If EXT_COHERENT is set, memory will be set as MTYPE_UC by default, with local memory MTYPE_CC. Add an option in the override function for this case, and add a check to ensure it is not used on UNCACHED memory. V2: Combined APU and NUMA code into one patch V3: Fixed a potential nullptr in amdgpu_vm_bo_update Signed-off-by: David Francis <David.Francis@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 17 +++++++----- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 8 +++++- drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 33 +++++++++++++++-------- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 8 +++--- 5 files changed, 45 insertions(+), 23 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index d72daf15662f..155c04589753 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -761,6 +761,7 @@ static void amdgpu_vm_tlb_seq_cb(struct dma_fence *fence, * @immediate: immediate submission in a page fault * @unlocked: unlocked invalidation during MM callback * @flush_tlb: trigger tlb invalidation after update completed + * @allow_override: change MTYPE for local NUMA nodes * @resv: fences we need to sync to * @start: start of mapped range * @last: last mapped entry @@ -777,7 +778,7 @@ static void amdgpu_vm_tlb_seq_cb(struct dma_fence *fence, * 0 for success, negative erro code for failure. */ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm, - bool immediate, bool unlocked, bool flush_tlb, + bool immediate, bool unlocked, bool flush_tlb, bool allow_override, struct dma_resv *resv, uint64_t start, uint64_t last, uint64_t flags, uint64_t offset, uint64_t vram_base, struct ttm_resource *res, dma_addr_t *pages_addr, @@ -815,6 +816,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm, params.immediate = immediate; params.pages_addr = pages_addr; params.unlocked = unlocked; + params.allow_override = allow_override; /* Implicitly sync to command submissions in the same VM before * unmapping. Sync to moving fences before mapping. @@ -990,6 +992,7 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va, struct ttm_resource *mem; struct dma_fence **last_update; bool flush_tlb = clear; + bool uncached; struct dma_resv *resv; uint64_t vram_base; uint64_t flags; @@ -1027,9 +1030,11 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va, bo_adev = amdgpu_ttm_adev(bo->tbo.bdev); vram_base = bo_adev->vm_manager.vram_base_offset; + uncached = (bo->flags & AMDGPU_GEM_CREATE_UNCACHED) != 0; } else { flags = 0x0; vram_base = 0; + uncached = false; } if (clear || (bo && bo->tbo.base.resv == @@ -1063,7 +1068,7 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va, trace_amdgpu_vm_bo_update(mapping); r = amdgpu_vm_update_range(adev, vm, false, false, flush_tlb, - resv, mapping->start, mapping->last, + !uncached, resv, mapping->start, mapping->last, update_flags, mapping->offset, vram_base, mem, pages_addr, last_update); @@ -1258,8 +1263,8 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev, mapping->start < AMDGPU_GMC_HOLE_START) init_pte_value = AMDGPU_PTE_DEFAULT_ATC; - r = amdgpu_vm_update_range(adev, vm, false, false, true, resv, - mapping->start, mapping->last, + r = amdgpu_vm_update_range(adev, vm, false, false, true, false, + resv, mapping->start, mapping->last, init_pte_value, 0, 0, NULL, NULL, &f); amdgpu_vm_free_mapping(adev, vm, mapping, f); @@ -2547,8 +2552,8 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, goto error_unlock; } - r = amdgpu_vm_update_range(adev, vm, true, false, false, NULL, addr, - addr, flags, value, 0, NULL, NULL, NULL); + r = amdgpu_vm_update_range(adev, vm, true, false, false, false, + NULL, addr, addr, flags, value, 0, NULL, NULL, NULL); if (r) goto error_unlock; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 6e71978db13f..9ea8b5b644e3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -244,6 +244,12 @@ struct amdgpu_vm_update_params { * @table_freed: return true if page table is freed when updating */ bool table_freed; + + /** + * @allow_override: true for memory that is not uncached: allows MTYPE + * to be overridden for NUMA local memory. + */ + bool allow_override; }; struct amdgpu_vm_update_funcs { @@ -436,7 +442,7 @@ int amdgpu_vm_handle_moved(struct amdgpu_device *adev, void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base, struct amdgpu_vm *vm, struct amdgpu_bo *bo); int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm, - bool immediate, bool unlocked, bool flush_tlb, + bool immediate, bool unlocked, bool flush_tlb, bool allow_override, struct dma_resv *resv, uint64_t start, uint64_t last, uint64_t flags, uint64_t offset, uint64_t vram_base, struct ttm_resource *res, dma_addr_t *pages_addr, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c index 9b025fd17b84..a2287bb25223 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c @@ -843,7 +843,7 @@ static void amdgpu_vm_pte_update_flags(struct amdgpu_vm_update_params *params, */ if ((flags & AMDGPU_PTE_SYSTEM) && (adev->flags & AMD_IS_APU) && adev->gmc.gmc_funcs->override_vm_pte_flags && - num_possible_nodes() > 1 && !params->pages_addr) + num_possible_nodes() > 1 && !params->pages_addr && params->allow_override) amdgpu_gmc_override_vm_pte_flags(adev, params->vm, addr, &flags); params->vm->update_funcs->update(params, pt, pe, addr, count, incr, diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index fee3141bb607..b66c5f7e1c56 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -1251,12 +1251,15 @@ static void gmc_v9_0_override_vm_pte_flags(struct amdgpu_device *adev, return; } - /* Only override mappings with MTYPE_NC, which is the safe default for - * cacheable memory. + /* MTYPE_NC is the same default and can be overridden. + * MTYPE_UC will be present if the memory is extended-coherent + * and can also be overridden. */ if ((*flags & AMDGPU_PTE_MTYPE_VG10_MASK) != - AMDGPU_PTE_MTYPE_VG10(MTYPE_NC)) { - dev_dbg_ratelimited(adev->dev, "MTYPE is not NC\n"); + AMDGPU_PTE_MTYPE_VG10(MTYPE_NC) && + (*flags & AMDGPU_PTE_MTYPE_VG10_MASK) != + AMDGPU_PTE_MTYPE_VG10(MTYPE_UC)) { + dev_dbg_ratelimited(adev->dev, "MTYPE is not NC or UC\n"); return; } @@ -1283,15 +1286,23 @@ static void gmc_v9_0_override_vm_pte_flags(struct amdgpu_device *adev, vm->mem_id, local_node, nid); if (nid == local_node) { uint64_t old_flags = *flags; - unsigned int mtype_local = MTYPE_RW; + if ((*flags & AMDGPU_PTE_MTYPE_VG10_MASK) == + AMDGPU_PTE_MTYPE_VG10(MTYPE_NC)) { + unsigned int mtype_local = MTYPE_RW; - if (amdgpu_mtype_local == 1) - mtype_local = MTYPE_NC; - else if (amdgpu_mtype_local == 2) - mtype_local = MTYPE_CC; + if (amdgpu_mtype_local == 1) + mtype_local = MTYPE_NC; + else if (amdgpu_mtype_local == 2) + mtype_local = MTYPE_CC; + + *flags = (*flags & ~AMDGPU_PTE_MTYPE_VG10_MASK) | + AMDGPU_PTE_MTYPE_VG10(mtype_local); + } else { + /* MTYPE_UC case */ + *flags = (*flags & ~AMDGPU_PTE_MTYPE_VG10_MASK) | + AMDGPU_PTE_MTYPE_VG10(MTYPE_CC); + } - *flags = (*flags & ~AMDGPU_PTE_MTYPE_VG10_MASK) | - AMDGPU_PTE_MTYPE_VG10(mtype_local); dev_dbg_ratelimited(adev->dev, "flags updated from %llx to %llx\n", old_flags, *flags); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 640ada95e2c3..fe937670c927 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -1280,7 +1280,7 @@ svm_range_get_pte_flags(struct kfd_node *node, if (num_possible_nodes() <= 1) mapping_flags |= mtype_local; else - mapping_flags |= AMDGPU_VM_MTYPE_NC; + mapping_flags |= ext_coherent ? AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; /* system memory accessed by the dGPU */ } else { mapping_flags |= AMDGPU_VM_MTYPE_UC; @@ -1315,7 +1315,7 @@ svm_range_unmap_from_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm, pr_debug("[0x%llx 0x%llx]\n", start, last); - return amdgpu_vm_update_range(adev, vm, false, true, true, NULL, start, + return amdgpu_vm_update_range(adev, vm, false, true, true, false, NULL, start, last, init_pte_value, 0, 0, NULL, NULL, fence); } @@ -1422,8 +1422,8 @@ svm_range_map_to_gpu(struct kfd_process_device *pdd, struct svm_range *prange, * different memory partition based on fpfn/lpfn, we should use * same vm_manager.vram_base_offset regardless memory partition. */ - r = amdgpu_vm_update_range(adev, vm, false, false, flush_tlb, NULL, - last_start, prange->start + i, + r = amdgpu_vm_update_range(adev, vm, false, false, flush_tlb, true, + NULL, last_start, prange->start + i, pte_flags, (last_start - prange->start) << PAGE_SHIFT, bo_adev ? bo_adev->vm_manager.vram_base_offset : 0, -- 2.34.1