Handles the guest faults in KVM by mapping in corresponding user pages in the 2nd stage page tables. Introduces new ARM-specific kernel memory types, PAGE_KVM_GUEST and pgprot_guest variables used to map 2nd stage memory for KVM guests. Leverages MMU notifiers on KVM/ARM by supporting the kvm_unmap_hva() and kvm_set_spte_hva() operations. All other KVM MMU notifier hooks are NOPs. Signed-off-by: Marc Zyngier <marc.zyngier@xxxxxxx> Signed-off-by: Christoffer Dall <c.dall@xxxxxxxxxxxxxxxxxxxxxx> --- arch/arm/include/asm/kvm_arm.h | 9 + arch/arm/include/asm/kvm_asm.h | 3 arch/arm/include/asm/kvm_host.h | 16 ++ arch/arm/include/asm/pgtable-3level.h | 9 + arch/arm/include/asm/pgtable.h | 4 + arch/arm/kvm/Kconfig | 1 arch/arm/kvm/exports.c | 1 arch/arm/kvm/interrupts.S | 37 ++++++ arch/arm/kvm/mmu.c | 218 +++++++++++++++++++++++++++++++++ arch/arm/mm/mmu.c | 3 10 files changed, 300 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h index 0d1e895..7f6cad4 100644 --- a/arch/arm/include/asm/kvm_arm.h +++ b/arch/arm/include/asm/kvm_arm.h @@ -149,6 +149,15 @@ #define HSR_ISS (HSR_IL - 1) #define HSR_ISV_SHIFT (24) #define HSR_ISV (1U << HSR_ISV_SHIFT) +#define HSR_FSC (0x3f) +#define HSR_FSC_TYPE (0x3c) +#define HSR_WNR (1 << 6) + +#define FSC_FAULT (0x04) +#define FSC_PERM (0x0c) + +/* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */ +#define HPFAR_MASK (~0xf) #define HSR_EC_UNKNOWN (0x00) #define HSR_EC_WFI (0x01) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 58d51e3..e01dfab 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -34,6 +34,7 @@ #define SMCHYP_HVBAR_W 0xfffffff0 #ifndef __ASSEMBLY__ +struct kvm; struct kvm_vcpu; extern char __kvm_hyp_init[]; @@ -47,6 +48,8 @@ extern char __kvm_hyp_vector[]; extern char __kvm_hyp_code_start[]; extern char __kvm_hyp_code_end[]; +extern void __kvm_tlb_flush_vmid(struct kvm *kvm); + extern void 
__kvm_flush_vm_context(void); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index c58865b..0c7e782 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -140,4 +140,20 @@ struct kvm_vcpu_stat { u32 halt_wakeup; }; +#define KVM_ARCH_WANT_MMU_NOTIFIER +struct kvm; +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); + +/* We do not have shadow page tables, hence the empty hooks */ +static inline int kvm_age_hva(struct kvm *kvm, unsigned long hva) +{ + return 0; +} + +static inline int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + return 0; +} + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 1169a8a..7351eee 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -102,6 +102,15 @@ */ #define L_PGD_SWAPPER (_AT(pgdval_t, 1) << 55) /* swapper_pg_dir entry */ +/* + * 2-nd stage PTE definitions for LPAE. 
+ */ +#define L_PTE2_SHARED L_PTE_SHARED +#define L_PTE2_READ (_AT(pteval_t, 1) << 6) /* HAP[0] */ +#define L_PTE2_WRITE (_AT(pteval_t, 1) << 7) /* HAP[1] */ +#define L_PTE2_NORM_WB (_AT(pteval_t, 3) << 4) /* MemAttr[3:2] */ +#define L_PTE2_INNER_WB (_AT(pteval_t, 3) << 2) /* MemAttr[1:0] */ + #ifndef __ASSEMBLY__ #define pud_none(pud) (!pud_val(pud)) diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index bc83540..a31d0e9 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -70,6 +70,7 @@ extern void __pgd_error(const char *file, int line, pgd_t); extern pgprot_t pgprot_user; extern pgprot_t pgprot_kernel; +extern pgprot_t pgprot_guest; #define _MOD_PROT(p, b) __pgprot(pgprot_val(p) | (b)) @@ -83,6 +84,9 @@ extern pgprot_t pgprot_kernel; #define PAGE_KERNEL _MOD_PROT(pgprot_kernel, L_PTE_XN) #define PAGE_KERNEL_EXEC pgprot_kernel #define PAGE_HYP _MOD_PROT(pgprot_kernel, L_PTE_USER) +#define PAGE_KVM_GUEST _MOD_PROT(pgprot_guest, L_PTE2_READ | \ + L_PTE2_NORM_WB | L_PTE2_INNER_WB | \ + L_PTE2_SHARED) #define __PAGE_NONE __pgprot(_L_PTE_DEFAULT | L_PTE_RDONLY | L_PTE_XN) #define __PAGE_SHARED __pgprot(_L_PTE_DEFAULT | L_PTE_USER | L_PTE_XN) diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index 83abbe0..7fa50d3 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -36,6 +36,7 @@ config KVM_ARM_HOST depends on KVM depends on MMU depends on CPU_V7 && ARM_VIRT_EXT + select MMU_NOTIFIER ---help--- Provides host support for ARM processors. 
diff --git a/arch/arm/kvm/exports.c b/arch/arm/kvm/exports.c index 8ebdf07..f39f823 100644 --- a/arch/arm/kvm/exports.c +++ b/arch/arm/kvm/exports.c @@ -33,5 +33,6 @@ EXPORT_SYMBOL_GPL(__kvm_hyp_code_end); EXPORT_SYMBOL_GPL(__kvm_vcpu_run); EXPORT_SYMBOL_GPL(__kvm_flush_vm_context); +EXPORT_SYMBOL_GPL(__kvm_tlb_flush_vmid); EXPORT_SYMBOL_GPL(smp_send_reschedule); diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index a0e370b..fd7331c 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -36,9 +36,46 @@ __kvm_hyp_code_start: .globl __kvm_hyp_code_start @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +@ Flush per-VMID TLBs +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + +/* + * void __kvm_tlb_flush_vmid(struct kvm *kvm); + * + * We rely on the hardware to broadcast the TLB invalidation to all CPUs + * inside the inner-shareable domain (which is the case for all v7 + * implementations). If we come across a non-IS SMP implementation, we'll + * have to use an IPI based mechanism. Until then, we stick to the simple + * hardware assisted version. 
+ */ +ENTRY(__kvm_tlb_flush_vmid) + hvc #0 @ Switch to Hyp mode + push {r2, r3} + + add r0, r0, #KVM_VTTBR + ldrd r2, r3, [r0] + mcrr p15, 6, r2, r3, c2 @ Write VTTBR + isb + mcr p15, 0, r0, c8, c3, 0 @ TLBIALLIS (rt ignored) + dsb + isb + mov r2, #0 + mov r3, #0 + mcrr p15, 6, r2, r3, c2 @ Back to VMID #0 + isb + + pop {r2, r3} + hvc #0 @ Back to SVC + bx lr +ENDPROC(__kvm_tlb_flush_vmid) + +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ @ Flush TLBs and instruction caches of current CPU for all VMIDs @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +/* + * void __kvm_flush_vm_context(void); + */ ENTRY(__kvm_flush_vm_context) hvc #0 @ switch to hyp-mode diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index ddfb3df..f3b0048 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -22,6 +22,7 @@ #include <asm/pgalloc.h> #include <asm/kvm_arm.h> #include <asm/kvm_mmu.h> +#include <asm/kvm_asm.h> static DEFINE_MUTEX(kvm_hyp_pgd_mutex); @@ -169,6 +170,9 @@ out: * Allocates the 1st level table only of size defined by PGD2_ORDER (can * support either full 40-bit input addresses or limited to 32-bit input * addresses). Clears the allocated pages. + * + * Note we don't need locking here as this is only called when the VM is + * created, which can only be done once. */ int kvm_alloc_stage2_pgd(struct kvm *kvm) { @@ -229,6 +233,9 @@ static void free_stage2_ptes(pmd_t *pmd, unsigned long addr) * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all * underlying level-2 and level-3 tables before freeing the actual level-1 table * and setting the struct pointer to NULL. + * + * Note we don't need locking here as this is only called when the VM is + * destroyed, which can only be done once. 
*/ void kvm_free_stage2_pgd(struct kvm *kvm) { @@ -264,7 +271,216 @@ void kvm_free_stage2_pgd(struct kvm *kvm) kvm->arch.pgd = NULL; } +static const pte_t null_pte; + +static int stage2_set_pte(struct kvm *kvm, phys_addr_t addr, + const pte_t *new_pte) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + /* Create 2nd stage page table mapping - Level 1 */ + pgd = kvm->arch.pgd + pgd_index(addr); + pud = pud_offset(pgd, addr); + if (pud_none(*pud)) { + BUG_ON(new_pte == &null_pte); + pmd = pmd_alloc_one(NULL, addr); + if (!pmd) { + kvm_err("Cannot allocate 2nd stage pmd\n"); + return -ENOMEM; + } + pud_populate(NULL, pud, pmd); + pmd += pmd_index(addr); + } else + pmd = pmd_offset(pud, addr); + + /* Create 2nd stage page table mapping - Level 2 */ + if (pmd_none(*pmd)) { + BUG_ON(new_pte == &null_pte); + pte = pte_alloc_one_kernel(NULL, addr); + if (!pte) { + kvm_err("Cannot allocate 2nd stage pte\n"); + return -ENOMEM; + } + pmd_populate_kernel(NULL, pmd, pte); + pte += pte_index(addr); + } else + pte = pte_offset_kernel(pmd, addr); + + /* Create 2nd stage page table mapping - Level 3 */ + set_pte_ext(pte, *new_pte, 0); + + return 0; +} + +static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + gfn_t gfn, struct kvm_memory_slot *memslot, + bool is_iabt) +{ + pte_t new_pte; + pfn_t pfn; + int ret; + bool write_fault, writable; + + /* TODO: Use instr. 
decoding for non-ISV to determine r/w fault */ + if (is_iabt) + write_fault = false; + else if ((vcpu->arch.hsr & HSR_ISV) && !(vcpu->arch.hsr & HSR_WNR)) + write_fault = false; + else + write_fault = true; + + if ((vcpu->arch.hsr & HSR_FSC_TYPE) == FSC_PERM && !write_fault) { + kvm_err("Unexpected L2 read permission error\n"); + return -EFAULT; + } + + /* preemption disabled for handle_exit, gfn_to_pfn may sleep */ + preempt_enable(); + pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write_fault, &writable); + preempt_disable(); + + if (is_error_pfn(pfn)) { + put_page(pfn_to_page(pfn)); + kvm_err("No host mapping: gfn %u (0x%08x)\n", + (unsigned int)gfn, + (unsigned int)gfn << PAGE_SHIFT); + return -EFAULT; + } + + mutex_lock(&vcpu->kvm->arch.pgd_mutex); + new_pte = pfn_pte(pfn, PAGE_KVM_GUEST); + if (writable) + new_pte |= L_PTE2_WRITE; + ret = stage2_set_pte(vcpu->kvm, fault_ipa, &new_pte); + if (ret) + put_page(pfn_to_page(pfn)); + mutex_unlock(&vcpu->kvm->arch.pgd_mutex); + + return ret; +} + +/** + * kvm_handle_guest_abort - handles all 2nd stage aborts + * @vcpu: the VCPU pointer + * @run: the kvm_run structure + * + * Any abort that gets to the host is almost guaranteed to be caused by a + * missing second stage translation table entry, which can mean that either the + * guest simply needs more memory and we must allocate an appropriate page or it + * can mean that the guest tried to access I/O memory, which is emulated by user + * space. The distinction is based on the IPA causing the fault and whether this + * memory region has been registered as standard RAM by user space. 
+ */ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) { - return -EINVAL; + unsigned long hsr_ec; + unsigned long fault_status; + phys_addr_t fault_ipa; + struct kvm_memory_slot *memslot = NULL; + bool is_iabt; + gfn_t gfn; + + hsr_ec = vcpu->arch.hsr >> HSR_EC_SHIFT; + is_iabt = (hsr_ec == HSR_EC_IABT); + + /* Check that the second stage fault is a translation fault */ + fault_status = (vcpu->arch.hsr & HSR_FSC_TYPE); + if (fault_status != FSC_FAULT && fault_status != FSC_PERM) { + kvm_err("Unsupported fault status: EC=%#lx DFCS=%#lx\n", + hsr_ec, fault_status); + return -EFAULT; + } + + fault_ipa = ((phys_addr_t)vcpu->arch.hpfar & HPFAR_MASK) << 8; + + gfn = fault_ipa >> PAGE_SHIFT; + if (!kvm_is_visible_gfn(vcpu->kvm, gfn)) { + if (is_iabt) { + kvm_err("Inst. abort on I/O address %08lx\n", + (unsigned long)fault_ipa); + return -EFAULT; + } + + kvm_pr_unimpl("I/O address abort..."); + return 0; + } + + memslot = gfn_to_memslot(vcpu->kvm, gfn); + if (!memslot->user_alloc) { + kvm_err("non user-alloc memslots not supported\n"); + return -EINVAL; + } + + return user_mem_abort(vcpu, fault_ipa, gfn, memslot, is_iabt); +} + +static bool hva_to_gpa(struct kvm *kvm, unsigned long hva, gpa_t *gpa) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + bool found = false; + + mutex_lock(&kvm->slots_lock); + slots = kvm_memslots(kvm); + + /* we only care about the pages that the guest sees */ + kvm_for_each_memslot(memslot, slots) { + unsigned long start = memslot->userspace_addr; + unsigned long end; + + end = start + (memslot->npages << PAGE_SHIFT); + if (hva >= start && hva < end) { + gpa_t gpa_offset = hva - start; + *gpa = (memslot->base_gfn << PAGE_SHIFT) + gpa_offset; + found = true; + /* no overlapping memslots allowed: break */ + break; + } + } + + mutex_unlock(&kvm->slots_lock); + return found; +} + +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +{ + bool found; + gpa_t gpa; + + if (!kvm->arch.pgd) + return 0; + + 
mutex_lock(&kvm->arch.pgd_mutex); + found = hva_to_gpa(kvm, hva, &gpa); + if (found) { + stage2_set_pte(kvm, gpa, &null_pte); + __kvm_tlb_flush_vmid(kvm); + } + mutex_unlock(&kvm->arch.pgd_mutex); + return 0; +} + +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +{ + gpa_t gpa; + bool found; + + if (!kvm->arch.pgd) + return; + + mutex_lock(&kvm->arch.pgd_mutex); + found = hva_to_gpa(kvm, hva, &gpa); + if (found) { + stage2_set_pte(kvm, gpa, &pte); + /* + * Ignore return code from stage2_set_pte, since -ENOMEM would + * indicate this IPA is not mapped and there is no harm + * that the PTE changed. + */ + __kvm_tlb_flush_vmid(kvm); + } + mutex_unlock(&kvm->arch.pgd_mutex); } diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index f7439e7..7dd4b54 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -56,9 +56,11 @@ static unsigned int cachepolicy __initdata = CPOLICY_WRITEBACK; static unsigned int ecc_mask __initdata = 0; pgprot_t pgprot_user; pgprot_t pgprot_kernel; +pgprot_t pgprot_guest; EXPORT_SYMBOL(pgprot_user); EXPORT_SYMBOL(pgprot_kernel); +EXPORT_SYMBOL(pgprot_guest); struct cachepolicy { const char policy[16]; @@ -520,6 +522,7 @@ static void __init build_mem_type_table(void) pgprot_user = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | user_pgprot); pgprot_kernel = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY | kern_pgprot); + pgprot_guest = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG); mem_types[MT_LOW_VECTORS].prot_l1 |= ecc_mask; mem_types[MT_HIGH_VECTORS].prot_l1 |= ecc_mask; -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html