[PATCH v2 1/1] KVM: arm64: Allow cacheable stage 2 mapping using VMA flags

<ankita@xxxxxxxxxx> · Mon, 18 Nov 2024 13:19:58 +0000

From: Ankit Agrawal <ankita@xxxxxxxxxx>

Currently KVM determines if a VMA is pointing at IO memory by checking
pfn_is_map_memory(). However, the MM already gives us a way to tell what
kind of memory it is by inspecting the VMA.

This patch solves the problems where it is possible for the kernel to
have VMAs pointing at cachable memory without causing
pfn_is_map_memory() to be true, eg DAX memremap cases and CXL/pre-CXL
devices. This memory is now properly marked as cachable in KVM.

The pfn_is_map_memory() is restrictive and allows only for the memory
that is added to the kernel to be marked as cacheable. In most cases
the code needs to know if there is a struct page, or if the memory is
in the kernel map and pfn_valid() is an appropriate API for this.
Extend the umbrella with pfn_valid() to include memory with no struct
pages for consideration to be mapped cacheable in stage 2. A !pfn_valid()
implies that the memory is unsafe to be mapped as cacheable.

Moreover take account of the mapping type in the VMA to make a decision
on the mapping. The VMA's pgprot is tested to determine the memory type
with the following mapping:
 pgprot_noncached    MT_DEVICE_nGnRnE   device (or Normal_NC)
 pgprot_writecombine MT_NORMAL_NC       device (or Normal_NC)
 pgprot_device       MT_DEVICE_nGnRE    device (or Normal_NC)
 pgprot_tagged       MT_NORMAL_TAGGED   RAM / Normal
 -                   MT_NORMAL          RAM / Normal

Also take care of the following two cases that prevents the memory to
be safely mapped as cacheable:
1. The VMA pgprot have VM_IO set alongwith MT_NORMAL or
   MT_NORMAL_TAGGED. Although unexpected and wrong, presence of such
   configuration cannot be ruled out.
2. Configurations where VM_MTE_ALLOWED is not set and KVM_CAP_ARM_MTE
   is enabled. Otherwise a malicious guest can enable MTE at stage 1
   without the hypervisor being able to tell. This could cause external
   aborts.

Introduce a new variable noncacheable to represent whether the memory
should not be mapped as cacheable. The noncacheable as false implies
the memory is safe to be mapped cacheable. Use this to handle the
aforementioned potentially unsafe cases for cacheable mapping.

Note when FWB is not enabled, the kernel expects to trivially do
cache management by flushing the memory by linearly converting a
kvm_pte to phys_addr to a KVA, see kvm_flush_dcache_to_poc(). This is
only possibile for struct page backed memory. Do not allow non-struct
page memory to be cachable without FWB.

The device memory such as on the Grace Hopper systems is interchangeable
with DDR memory and retains its properties. Allow executable faults
on the memory determined as Normal cacheable.

Signed-off-by: Ankit Agrawal <ankita@xxxxxxxxxx>
Suggested-by: Catalin Marinas <catalin.marinas@xxxxxxx>
Suggested-by: Jason Gunthorpe <jgg@xxxxxxxxxx>
---
 arch/arm64/include/asm/kvm_pgtable.h |   8 +++
 arch/arm64/kvm/hyp/pgtable.c         |   2 +-
 arch/arm64/kvm/mmu.c                 | 101 +++++++++++++++++++++------
 3 files changed, 87 insertions(+), 24 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index aab04097b505..c41ba415c5e4 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -502,6 +502,14 @@ u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size);
  */
 u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift);
 
+/**
+ * stage2_has_fwb() - Determine whether FWB is supported
+ * @pgt:    Page-table structure initialised by kvm_pgtable_stage2_init*()
+ *
+ * Return: True if FWB is supported.
+ */
+bool stage2_has_fwb(struct kvm_pgtable *pgt);
+
 /**
  * kvm_pgtable_stage2_pgd_size() - Helper to compute size of a stage-2 PGD
  * @vtcr:	Content of the VTCR register.
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 40bd55966540..4417651e2b1c 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -642,7 +642,7 @@ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
 	return vtcr;
 }
 
-static bool stage2_has_fwb(struct kvm_pgtable *pgt)
+bool stage2_has_fwb(struct kvm_pgtable *pgt)
 {
 	if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
 		return false;
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index c9d46ad57e52..9bf5c3e89d6d 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -180,11 +180,6 @@ int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm,
 	return 0;
 }
 
-static bool kvm_is_device_pfn(unsigned long pfn)
-{
-	return !pfn_is_map_memory(pfn);
-}
-
 static void *stage2_memcache_zalloc_page(void *arg)
 {
 	struct kvm_mmu_memory_cache *mc = arg;
@@ -1430,6 +1425,23 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
 	return vma->vm_flags & VM_MTE_ALLOWED;
 }
 
+/*
+ * Determine the memory region cacheability from VMA's pgprot. This
+ * is used to set the stage 2 PTEs.
+ */
+static unsigned long mapping_type(pgprot_t page_prot)
+{
+	return FIELD_GET(PTE_ATTRINDX_MASK, pgprot_val(page_prot));
+}
+
+/*
+ * Determine if the mapping type is normal cacheable.
+ */
+static bool mapping_type_normal_cacheable(unsigned long mt)
+{
+	return (mt == MT_NORMAL || mt == MT_NORMAL_TAGGED);
+}
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			  struct kvm_s2_trans *nested,
 			  struct kvm_memory_slot *memslot, unsigned long hva,
@@ -1438,8 +1450,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	int ret = 0;
 	bool write_fault, writable, force_pte = false;
 	bool exec_fault, mte_allowed;
-	bool device = false, vfio_allow_any_uc = false;
+	bool noncacheable = false, vfio_allow_any_uc = false;
 	unsigned long mmu_seq;
+	unsigned long mt;
 	phys_addr_t ipa = fault_ipa;
 	struct kvm *kvm = vcpu->kvm;
 	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
@@ -1568,6 +1581,17 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 
 	vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
 
+	mt = mapping_type(vma->vm_page_prot);
+
+	/*
+	 * Check for potentially ineligible or unsafe conditions for
+	 * cacheable mappings.
+	 */
+	if (vma->vm_flags & VM_IO)
+		noncacheable = true;
+	else if (!mte_allowed && kvm_has_mte(kvm))
+		noncacheable = true;
+
 	/* Don't use the VMA after the unlock -- it may have vanished */
 	vma = NULL;
 
@@ -1591,19 +1615,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (is_error_noslot_pfn(pfn))
 		return -EFAULT;
 
-	if (kvm_is_device_pfn(pfn)) {
-		/*
-		 * If the page was identified as device early by looking at
-		 * the VMA flags, vma_pagesize is already representing the
-		 * largest quantity we can map.  If instead it was mapped
-		 * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE
-		 * and must not be upgraded.
-		 *
-		 * In both cases, we don't let transparent_hugepage_adjust()
-		 * change things at the last minute.
-		 */
-		device = true;
-	} else if (logging_active && !write_fault) {
+	/*
+	 * pfn_valid() indicates to the code if there is a struct page, or
+	 * if the memory is in the kernel map. Any memory region otherwise
+	 * is unsafe to be cacheable.
+	 */
+	if (!pfn_valid(pfn))
+		noncacheable = true;
+
+	if (!noncacheable && logging_active && !write_fault) {
 		/*
 		 * Only actually map the page as writable if this was a write
 		 * fault.
@@ -1611,7 +1631,11 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		writable = false;
 	}
 
-	if (exec_fault && device)
+	/*
+	 * Do not allow exec fault; unless the memory is determined safely
+	 * to be Normal cacheable.
+	 */
+	if (exec_fault && (noncacheable || !mapping_type_normal_cacheable(mt)))
 		return -ENOEXEC;
 
 	/*
@@ -1641,10 +1665,19 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	}
 
 	/*
+	 * If the page was identified as device early by looking at
+	 * the VMA flags, vma_pagesize is already representing the
+	 * largest quantity we can map.  If instead it was mapped
+	 * via gfn_to_pfn_prot(), vma_pagesize is set to PAGE_SIZE
+	 * and must not be upgraded.
+	 *
+	 * In both cases, we don't let transparent_hugepage_adjust()
+	 * change things at the last minute.
+	 *
 	 * If we are not forced to use page mapping, check if we are
 	 * backed by a THP and thus use block mapping if possible.
 	 */
-	if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) {
+	if (vma_pagesize == PAGE_SIZE && !(force_pte || noncacheable)) {
 		if (fault_is_perm && fault_granule > PAGE_SIZE)
 			vma_pagesize = fault_granule;
 		else
@@ -1658,7 +1691,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		}
 	}
 
-	if (!fault_is_perm && !device && kvm_has_mte(kvm)) {
+	if (!fault_is_perm && !noncacheable && kvm_has_mte(kvm)) {
 		/* Check the VMM hasn't introduced a new disallowed VMA */
 		if (mte_allowed) {
 			sanitise_mte_tags(kvm, pfn, vma_pagesize);
@@ -1674,7 +1707,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (exec_fault)
 		prot |= KVM_PGTABLE_PROT_X;
 
-	if (device) {
+	/*
+	 * If any of the following pgprot modifiers are applied on the pgprot,
+	 * consider as device memory and map in Stage 2 as device or
+	 * Normal noncached:
+	 * pgprot_noncached
+	 * pgprot_writecombine
+	 * pgprot_device
+	 */
+	if (!mapping_type_normal_cacheable(mt)) {
 		if (vfio_allow_any_uc)
 			prot |= KVM_PGTABLE_PROT_NORMAL_NC;
 		else
@@ -1684,6 +1725,20 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		prot |= KVM_PGTABLE_PROT_X;
 	}
 
+	/*
+	 *  When FWB is unsupported KVM needs to do cache flushes
+	 *  (via dcache_clean_inval_poc()) of the underlying memory. This is
+	 *  only possible if the memory is already mapped into the kernel map
+	 *  at the usual spot.
+	 *
+	 *  Validate that there is a struct page for the PFN which maps
+	 *  to the KVA that the flushing code expects.
+	 */
+	if (!stage2_has_fwb(pgt) && !(pfn_valid(pfn))) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
 	/*
 	 * Under the premise of getting a FSC_PERM fault, we just need to relax
 	 * permissions only if vma_pagesize equals fault_granule. Otherwise,
-- 
2.34.1