From: Christoffer Dall <c.dall@xxxxxxxxxxxxxxxxxxxxxx>

Handles guest faults in KVM by mapping in the corresponding user pages
in the 2nd stage page tables.

Introduces a new ARM-specific kernel memory type, PAGE_KVM_GUEST, and
the pgprot_guest variable, both used to map 2nd stage memory for KVM
guests.

Signed-off-by: Christoffer Dall <c.dall@xxxxxxxxxxxxxxxxxxxxxx>
---
Three small standalone sketches, illustrating the stage 2 attribute
encoding, the fault decoding, and the user space side of memory
registration, follow after the patch; none of them are part of the
patch itself.

 arch/arm/include/asm/pgtable-3level.h |    8 ++
 arch/arm/include/asm/pgtable.h        |    4 +
 arch/arm/kvm/mmu.c                    |  127 +++++++++++++++++++++++++++++++++
 arch/arm/mm/mmu.c                     |    3 +
 4 files changed, 141 insertions(+), 1 deletions(-)

diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index edc3cb9..6dc5331 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -104,6 +104,14 @@
  */
 #define L_PGD_SWAPPER		(_AT(pgdval_t, 1) << 55)	/* swapper_pg_dir entry */
 
+/*
+ * 2nd stage PTE definitions for LPAE.
+ */
+#define L_PTE2_READ		(_AT(pteval_t, 1) << 6)	/* HAP[0] */
+#define L_PTE2_WRITE		(_AT(pteval_t, 1) << 7)	/* HAP[1] */
+#define L_PTE2_NORM_WB		(_AT(pteval_t, 3) << 4)	/* MemAttr[3:2] */
+#define L_PTE2_INNER_WB		(_AT(pteval_t, 3) << 2)	/* MemAttr[1:0] */
+
 #ifndef __ASSEMBLY__
 
 #define pud_none(pud)		(!pud_val(pud))
diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h
index 4b72287..2561a8b 100644
--- a/arch/arm/include/asm/pgtable.h
+++ b/arch/arm/include/asm/pgtable.h
@@ -70,6 +70,7 @@ extern void __pgd_error(const char *file, int line, pgd_t);
 
 extern pgprot_t		pgprot_user;
 extern pgprot_t		pgprot_kernel;
+extern pgprot_t		pgprot_guest;
 
 #define _MOD_PROT(p, b)	__pgprot(pgprot_val(p) | (b))
 
@@ -83,6 +84,9 @@ extern pgprot_t		pgprot_kernel;
 #define PAGE_KERNEL		_MOD_PROT(pgprot_kernel, L_PTE_XN)
 #define PAGE_KERNEL_EXEC	pgprot_kernel
 #define PAGE_HYP		_MOD_PROT(pgprot_kernel, L_PTE_USER)
+#define PAGE_KVM_GUEST		_MOD_PROT(pgprot_guest, L_PTE2_READ | \
+					  L_PTE2_WRITE | L_PTE2_NORM_WB | \
+					  L_PTE2_INNER_WB)
 
 #define __PAGE_NONE		__pgprot(_L_PTE_DEFAULT | L_PTE_RDONLY | L_PTE_XN)
 #define __PAGE_SHARED		__pgprot(_L_PTE_DEFAULT | L_PTE_USER | L_PTE_XN)
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 94411a6..eb91da2 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -169,6 +169,9 @@ out:
  * Allocates the 1st level table only of size defined by PGD2_ORDER (can
  * support either full 40-bit input addresses or limited to 32-bit input
  * addresses). Clears the allocated pages.
+ *
+ * Note we don't need locking here as this is only called when the VM is
+ * created, which can only be done once.
  */
 int kvm_alloc_stage2_pgd(struct kvm *kvm)
 {
@@ -230,6 +233,9 @@ static void free_stage2_ptes(pmd_t *pmd, unsigned long addr)
  * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all
  * underlying level-2 and level-3 tables before freeing the actual level-1 table
  * and setting the struct pointer to NULL.
+ *
+ * Note we don't need locking here as this is only called when the VM is
+ * destroyed, which can only be done once.
  */
 void kvm_free_stage2_pgd(struct kvm *kvm)
 {
@@ -265,7 +271,126 @@ void kvm_free_stage2_pgd(struct kvm *kvm)
 	kvm->arch.pgd = NULL;
 }
 
+static int __user_mem_abort(struct kvm *kvm, phys_addr_t addr, pfn_t pfn)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte, new_pte;
+
+	/* Create 2nd stage page table mapping - Level 1 */
+	pgd = kvm->arch.pgd + pgd_index(addr);
+	pud = pud_offset(pgd, addr);
+	if (pud_none(*pud)) {
+		pmd = pmd_alloc_one(NULL, addr);
+		if (!pmd) {
+			put_page(pfn_to_page(pfn));
+			kvm_err("Cannot allocate 2nd stage pmd\n");
+			return -ENOMEM;
+		}
+		pud_populate(NULL, pud, pmd);
+		pmd += pmd_index(addr);
+	} else
+		pmd = pmd_offset(pud, addr);
+
+	/* Create 2nd stage page table mapping - Level 2 */
+	if (pmd_none(*pmd)) {
+		pte = pte_alloc_one_kernel(NULL, addr);
+		if (!pte) {
+			put_page(pfn_to_page(pfn));
+			kvm_err("Cannot allocate 2nd stage pte\n");
+			return -ENOMEM;
+		}
+		pmd_populate_kernel(NULL, pmd, pte);
+		pte += pte_index(addr);
+	} else
+		pte = pte_offset_kernel(pmd, addr);
+
+	/* Create 2nd stage page table mapping - Level 3 */
+	new_pte = pfn_pte(pfn, PAGE_KVM_GUEST);
+	set_pte_ext(pte, new_pte, 0);
+
+	return 0;
+}
+
+static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
+			  gfn_t gfn, struct kvm_memory_slot *memslot)
+{
+	pfn_t pfn;
+	int ret;
+
+	pfn = gfn_to_pfn(vcpu->kvm, gfn);
+
+	if (is_error_pfn(pfn)) {
+		put_page(pfn_to_page(pfn));
+		kvm_err("Guest gfn %u (0x%08x) does not have\n"
+			"corresponding host mapping",
+			(unsigned int)gfn,
+			(unsigned int)gfn << PAGE_SHIFT);
+		return -EFAULT;
+	}
+
+	mutex_lock(&vcpu->kvm->arch.pgd_mutex);
+	ret = __user_mem_abort(vcpu->kvm, fault_ipa, pfn);
+	mutex_unlock(&vcpu->kvm->arch.pgd_mutex);
+
+	return ret;
+}
+
+#define HSR_ABT_FS	(0x3f)
+#define HPFAR_MASK	(~0xf)
+
+/**
+ * kvm_handle_guest_abort - handles all 2nd stage aborts
+ * @vcpu:	the VCPU pointer
+ * @run:	the kvm_run structure
+ *
+ * Any abort that gets to the host is almost guaranteed to be caused by a
+ * missing second stage translation table entry, which can mean that either the
+ * guest simply needs more memory and we must allocate an appropriate page or it
+ * can mean that the guest tried to access I/O memory, which is emulated by user
+ * space. The distinction is based on the IPA causing the fault and whether this
+ * memory region has been registered as standard RAM by user space.
+ */
 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
-	return -EINVAL;
+	unsigned long hsr_ec;
+	unsigned long fault_status;
+	phys_addr_t fault_ipa;
+	struct kvm_memory_slot *memslot = NULL;
+	bool is_iabt;
+	gfn_t gfn;
+
+	hsr_ec = vcpu->arch.hsr >> HSR_EC_SHIFT;
+	is_iabt = (hsr_ec == HSR_EC_IABT);
+
+	/* Check that the second stage fault is a translation fault */
+	fault_status = vcpu->arch.hsr & HSR_ABT_FS;
+	if ((fault_status & 0x3c) != 0x4) {
+		kvm_err("Unsupported fault status: %lx\n",
+			fault_status & 0x3c);
+		return -EFAULT;
+	}
+
+	fault_ipa = ((phys_addr_t)vcpu->arch.hpfar & HPFAR_MASK) << 8;
+
+	gfn = fault_ipa >> PAGE_SHIFT;
+	if (!kvm_is_visible_gfn(vcpu->kvm, gfn)) {
+		if (is_iabt) {
+			kvm_err("Inst. abort on I/O address %08lx\n",
+				(unsigned long)fault_ipa);
+			return -EFAULT;
+		}
+
+		kvm_pr_unimpl("I/O address abort...");
+		return 0;
+	}
+
+	memslot = gfn_to_memslot(vcpu->kvm, gfn);
+	if (!memslot->user_alloc) {
+		kvm_err("non user-alloc memslots not supported\n");
+		return -EINVAL;
+	}
+
+	return user_mem_abort(vcpu, fault_ipa, gfn, memslot);
 }
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index 94c5a0c..1a2df14 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -54,9 +54,11 @@ static unsigned int cachepolicy __initdata = CPOLICY_WRITEBACK;
 static unsigned int ecc_mask __initdata = 0;
 pgprot_t pgprot_user;
 pgprot_t pgprot_kernel;
+pgprot_t pgprot_guest;
 
 EXPORT_SYMBOL(pgprot_user);
 EXPORT_SYMBOL(pgprot_kernel);
+EXPORT_SYMBOL(pgprot_guest);
 
 struct cachepolicy {
 	const char	policy[16];
@@ -504,6 +506,7 @@ static void __init build_mem_type_table(void)
 	pgprot_user   = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | user_pgprot);
 	pgprot_kernel = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG |
 				 L_PTE_DIRTY | kern_pgprot);
+	pgprot_guest  = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG);
 
 	mem_types[MT_LOW_VECTORS].prot_l1 |= ecc_mask;
 	mem_types[MT_HIGH_VECTORS].prot_l1 |= ecc_mask;
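A note on the stage 2 attribute encoding used by PAGE_KVM_GUEST above:
HAP[1:0] (descriptor bits 7:6) grant read and write access, and
MemAttr[3:0] (bits 5:2) select normal, write-back cacheable memory. The
following standalone sketch (illustration only, not part of the patch;
the shift values simply mirror the L_PTE2_* definitions) computes the
attribute bits that end up set on every guest mapping:

#include <stdio.h>

#define L_PTE2_READ	(1ULL << 6)	/* HAP[0]: read access */
#define L_PTE2_WRITE	(1ULL << 7)	/* HAP[1]: write access */
#define L_PTE2_NORM_WB	(3ULL << 4)	/* MemAttr[3:2]: normal memory, write-back */
#define L_PTE2_INNER_WB	(3ULL << 2)	/* MemAttr[1:0]: inner write-back */

int main(void)
{
	unsigned long long attrs = L_PTE2_READ | L_PTE2_WRITE |
				   L_PTE2_NORM_WB | L_PTE2_INNER_WB;

	/* prints 0xfc: a readable, writable, write-back cacheable page */
	printf("stage 2 attribute bits: %#llx\n", attrs);
	return 0;
}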
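On the decoding in kvm_handle_guest_abort: HPFAR holds bits [39:12] of
the faulting IPA in its bits [31:4], so masking off the low nibble and
shifting left by 8 recovers the IPA, and fault status values 0x4-0x7
encode a translation fault (the low two bits give the level), which is
what the (fault_status & 0x3c) != 0x4 check tests for. A standalone
sketch with hypothetical register values (HSR_EC_SHIFT and the EC
encodings are the architectural ones and are defined elsewhere in this
series):

#include <stdio.h>

#define HSR_EC_SHIFT	26
#define HSR_EC_IABT	0x20		/* instruction abort from the guest */
#define HSR_EC_DABT	0x24		/* data abort from the guest */
#define HSR_ABT_FS	(0x3f)
#define HPFAR_MASK	(~0xf)

int main(void)
{
	/* hypothetical values: 3rd level translation fault on a data access */
	unsigned long hsr = ((unsigned long)HSR_EC_DABT << HSR_EC_SHIFT) | 0x7;
	unsigned long hpfar = 0x8040;	/* FIPA[39:12] held in bits [31:4] */

	unsigned long hsr_ec = hsr >> HSR_EC_SHIFT;
	unsigned long fault_status = hsr & HSR_ABT_FS;
	unsigned long long fault_ipa =
		(unsigned long long)(hpfar & HPFAR_MASK) << 8;

	/* prints: data abort, translation fault: yes, IPA: 0x804000 */
	printf("%s abort, translation fault: %s, IPA: %#llx\n",
	       hsr_ec == HSR_EC_IABT ? "instruction" : "data",
	       (fault_status & 0x3c) == 0x4 ? "yes" : "no",
	       fault_ipa);
	return 0;
}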
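Finally, on the RAM vs. I/O distinction: kvm_is_visible_gfn only
succeeds for IPAs that user space has registered as a memory slot, and
the handler further insists on user_alloc slots, i.e. slots backed by
user space memory through the generic KVM_SET_USER_MEMORY_REGION ioctl.
A minimal sketch of that user space side, assuming a vm_fd obtained
from KVM_CREATE_VM and a made-up layout of 64M of RAM at IPA 0:

#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

/* vm_fd is assumed to come from ioctl(kvm_fd, KVM_CREATE_VM, 0) */
static int add_guest_ram(int vm_fd)
{
	uint64_t size = 64 << 20;	/* 64M of guest RAM at IPA 0 */
	void *ram = mmap(NULL, size, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct kvm_userspace_memory_region region = {
		.slot = 0,
		.guest_phys_addr = 0,
		.memory_size = size,
		.userspace_addr = (uintptr_t)ram,
	};

	if (ram == MAP_FAILED)
		return -1;

	/* stage 2 faults inside [0, 64M) now reach user_mem_abort();
	 * faults outside it exit to user space for emulation */
	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}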