From: Ankit Agrawal <ankita@xxxxxxxxxx> Currently KVM determines if a VMA is pointing at IO memory by checking pfn_is_map_memory(). However, the MM already gives us a way to tell what kind of memory it is by inspecting the VMA. This patch solves the problems where it is possible for the kernel to have VMAs pointing at cacheable memory without causing pfn_is_map_memory() to be true, e.g. DAX memremap cases and CXL/pre-CXL devices. This memory is now properly marked as cacheable in KVM. The pfn_is_map_memory() is restrictive and allows only for the memory that is added to the kernel to be marked as cacheable. In most cases the code needs to know if there is a struct page, or if the memory is in the kernel map and pfn_valid() is an appropriate API for this. Extend the umbrella with pfn_valid() to include memory with no struct pages for consideration to be mapped cacheable in stage 2. A !pfn_valid() implies that the memory is unsafe to be mapped as cacheable. Moreover, take into account the mapping type in the VMA to make a decision on the mapping. The VMA's pgprot is tested to determine the memory type with the following mapping:
pgprot_noncached MT_DEVICE_nGnRnE device (or Normal_NC)
pgprot_writecombine MT_NORMAL_NC device (or Normal_NC)
pgprot_device MT_DEVICE_nGnRE device (or Normal_NC)
pgprot_tagged MT_NORMAL_TAGGED RAM / Normal
- MT_NORMAL RAM / Normal
Also take care of the following two cases that prevent the memory from being safely mapped as cacheable:
1. The VMA has VM_IO set along with an MT_NORMAL or MT_NORMAL_TAGGED pgprot. Although unexpected and wrong, presence of such configuration cannot be ruled out.
2. Configurations where VM_MTE_ALLOWED is not set and KVM_CAP_ARM_MTE is enabled. Otherwise a malicious guest can enable MTE at stage 1 without the hypervisor being able to tell. This could cause external aborts.
Introduce a new variable noncacheable to represent whether the memory should not be mapped as cacheable.
A noncacheable value of false implies the memory is safe to be mapped cacheable. Use this to handle the aforementioned potentially unsafe cases for cacheable mapping. Note when FWB is not enabled, the kernel expects to trivially do cache management by flushing the memory by linearly converting a kvm_pte to phys_addr to a KVA, see kvm_flush_dcache_to_poc(). This is only possible for struct page backed memory. Do not allow non-struct page memory to be cacheable without FWB. The device memory such as on the Grace Hopper systems is interchangeable with DDR memory and retains its properties. Allow executable faults on the memory determined as Normal cacheable. Signed-off-by: Ankit Agrawal <ankita@xxxxxxxxxx> Suggested-by: Catalin Marinas <catalin.marinas@xxxxxxx> Suggested-by: Jason Gunthorpe <jgg@xxxxxxxxxx> --- arch/arm64/include/asm/kvm_pgtable.h | 8 +++ arch/arm64/kvm/hyp/pgtable.c | 2 +- arch/arm64/kvm/mmu.c | 101 +++++++++++++++++++++------ 3 files changed, 87 insertions(+), 24 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index aab04097b505..c41ba415c5e4 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -502,6 +502,14 @@ u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size); */ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift); +/** + * stage2_has_fwb() - Determine whether FWB is supported + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*() + * + * Return: True if FWB is supported. + */ +bool stage2_has_fwb(struct kvm_pgtable *pgt); + /** * kvm_pgtable_stage2_pgd_size() - Helper to compute size of a stage-2 PGD * @vtcr: Content of the VTCR register.
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 40bd55966540..4417651e2b1c 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -642,7 +642,7 @@ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift) return vtcr; } -static bool stage2_has_fwb(struct kvm_pgtable *pgt) +bool stage2_has_fwb(struct kvm_pgtable *pgt) { if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) return false; diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index c9d46ad57e52..9bf5c3e89d6d 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -180,11 +180,6 @@ int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, return 0; } -static bool kvm_is_device_pfn(unsigned long pfn) -{ - return !pfn_is_map_memory(pfn); -} - static void *stage2_memcache_zalloc_page(void *arg) { struct kvm_mmu_memory_cache *mc = arg; @@ -1430,6 +1425,23 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma) return vma->vm_flags & VM_MTE_ALLOWED; } +/* + * Determine the memory region cacheability from VMA's pgprot. This + * is used to set the stage 2 PTEs. + */ +static unsigned long mapping_type(pgprot_t page_prot) +{ + return FIELD_GET(PTE_ATTRINDX_MASK, pgprot_val(page_prot)); +} + +/* + * Determine if the mapping type is normal cacheable. 
+ */ +static bool mapping_type_normal_cacheable(unsigned long mt) +{ + return (mt == MT_NORMAL || mt == MT_NORMAL_TAGGED); +} + static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_s2_trans *nested, struct kvm_memory_slot *memslot, unsigned long hva, @@ -1438,8 +1450,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, int ret = 0; bool write_fault, writable, force_pte = false; bool exec_fault, mte_allowed; - bool device = false, vfio_allow_any_uc = false; + bool noncacheable = false, vfio_allow_any_uc = false; unsigned long mmu_seq; + unsigned long mt; phys_addr_t ipa = fault_ipa; struct kvm *kvm = vcpu->kvm; struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; @@ -1568,6 +1581,17 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED; + mt = mapping_type(vma->vm_page_prot); + + /* + * Check for potentially ineligible or unsafe conditions for + * cacheable mappings. + */ + if (vma->vm_flags & VM_IO) + noncacheable = true; + else if (!mte_allowed && kvm_has_mte(kvm)) + noncacheable = true; + /* Don't use the VMA after the unlock -- it may have vanished */ vma = NULL; @@ -1591,19 +1615,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (is_error_noslot_pfn(pfn)) return -EFAULT; - if (kvm_is_device_pfn(pfn)) { - /* - * If the page was identified as device early by looking at - * the VMA flags, vma_pagesize is already representing the - * largest quantity we can map. If instead it was mapped - * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE - * and must not be upgraded. - * - * In both cases, we don't let transparent_hugepage_adjust() - * change things at the last minute. - */ - device = true; - } else if (logging_active && !write_fault) { + /* + * pfn_valid() indicates to the code if there is a struct page, or + * if the memory is in the kernel map. 
Any memory region otherwise + * is unsafe to be cacheable. + */ + if (!pfn_valid(pfn)) + noncacheable = true; + + if (!noncacheable && logging_active && !write_fault) { /* * Only actually map the page as writable if this was a write * fault. @@ -1611,7 +1631,11 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, writable = false; } - if (exec_fault && device) + /* + * Do not allow exec fault; unless the memory is determined safely + * to be Normal cacheable. + */ + if (exec_fault && (noncacheable || !mapping_type_normal_cacheable(mt))) return -ENOEXEC; /* @@ -1641,10 +1665,19 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } /* + * If the page was identified as device early by looking at + * the VMA flags, vma_pagesize is already representing the + * largest quantity we can map. If instead it was mapped + * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE + * and must not be upgraded. + * + * In both cases, we don't let transparent_hugepage_adjust() + * change things at the last minute. + * * If we are not forced to use page mapping, check if we are * backed by a THP and thus use block mapping if possible.
*/ - if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) { + if (vma_pagesize == PAGE_SIZE && !(force_pte || noncacheable)) { if (fault_is_perm && fault_granule > PAGE_SIZE) vma_pagesize = fault_granule; else @@ -1658,7 +1691,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } } - if (!fault_is_perm && !device && kvm_has_mte(kvm)) { + if (!fault_is_perm && !noncacheable && kvm_has_mte(kvm)) { /* Check the VMM hasn't introduced a new disallowed VMA */ if (mte_allowed) { sanitise_mte_tags(kvm, pfn, vma_pagesize); @@ -1674,7 +1707,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (exec_fault) prot |= KVM_PGTABLE_PROT_X; - if (device) { + /* + * If any of the following pgprot modifiers are applied on the pgprot, + * consider as device memory and map in Stage 2 as device or + * Normal noncached: + * pgprot_noncached + * pgprot_writecombine + * pgprot_device + */ + if (!mapping_type_normal_cacheable(mt)) { if (vfio_allow_any_uc) prot |= KVM_PGTABLE_PROT_NORMAL_NC; else @@ -1684,6 +1725,20 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, prot |= KVM_PGTABLE_PROT_X; } + /* + * When FWB is unsupported KVM needs to do cache flushes + * (via dcache_clean_inval_poc()) of the underlying memory. This is + * only possible if the memory is already mapped into the kernel map + * at the usual spot. + * + * Validate that there is a struct page for the PFN which maps + * to the KVA that the flushing code expects. + */ + if (!stage2_has_fwb(pgt) && !(pfn_valid(pfn))) { + ret = -EINVAL; + goto out_unlock; + } + /* * Under the premise of getting a FSC_PERM fault, we just need to relax * permissions only if vma_pagesize equals fault_granule. Otherwise, -- 2.34.1