On 16.08.2012, at 17:29, Christoffer Dall wrote:

> This commit introduces the framework for guest memory management
> through the use of 2nd stage translation. Each VM has a pointer
> to a level-1 table (the pgd field in struct kvm_arch) which is
> used for the 2nd stage translations. Entries are added when handling
> guest faults (later patch) and the table itself can be allocated and
> freed through the following functions implemented in
> arch/arm/kvm/arm_mmu.c:
>  - kvm_alloc_stage2_pgd(struct kvm *kvm);
>  - kvm_free_stage2_pgd(struct kvm *kvm);
>
> This commit also introduces a new ARM-specific kernel memory type,
> PAGE_KVM_GUEST, and a pgprot_guest variable, both used to map 2nd
> stage memory for KVM guests.
>
> Each entry in the TLBs and caches is tagged with a VMID identifier in
> addition to ASIDs. The VMIDs are assigned consecutively to VMs in the
> order that VMs are executed, and caches and TLBs are invalidated when
> the VMID space has been exhausted, to allow for more than 255
> simultaneously running guests.
>
> The 2nd stage pgd is allocated in kvm_arch_init_vm(). The table is
> freed in kvm_arch_destroy_vm(). Both functions are called from the main
> KVM code.
>
> We pre-allocate page table memory to be able to synchronize using a
> spinlock and be called under rcu_read_lock from the MMU notifiers. We
> steal the mmu_memory_cache implementation from x86 and adapt it for our
> specific usage.
>
> We support MMU notifiers (thanks to Marc Zyngier) through
> kvm_unmap_hva and kvm_set_spte_hva.
>
> Finally, we define kvm_phys_addr_ioremap() to map a device at a guest
> IPA, which is used by the VGIC support to map the virtual CPU interface
> registers into the guest. This support was added by Marc Zyngier.
>
> Signed-off-by: Marc Zyngier <marc.zyngier@xxxxxxx>
> Signed-off-by: Christoffer Dall <c.dall@xxxxxxxxxxxxxxxxxxxxxx>
> ---
>  arch/arm/include/asm/kvm_asm.h        |    2
>  arch/arm/include/asm/kvm_host.h       |   18 ++
>  arch/arm/include/asm/kvm_mmu.h        |    9 +
>  arch/arm/include/asm/pgtable-3level.h |    9 +
>  arch/arm/include/asm/pgtable.h        |    4
>  arch/arm/kvm/Kconfig                  |    1
>  arch/arm/kvm/arm.c                    |   38 +++
>  arch/arm/kvm/exports.c                |    1
>  arch/arm/kvm/interrupts.S             |    8 +
>  arch/arm/kvm/mmu.c                    |  373 +++++++++++++++++++++++++++++++++
>  arch/arm/mm/mmu.c                     |    3
>  11 files changed, 465 insertions(+), 1 deletion(-)
>
> diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h
> index 58d51e3..55b6446 100644
> --- a/arch/arm/include/asm/kvm_asm.h
> +++ b/arch/arm/include/asm/kvm_asm.h
> @@ -34,6 +34,7 @@
>  #define SMCHYP_HVBAR_W 0xfffffff0
>
>  #ifndef __ASSEMBLY__
> +struct kvm;
>  struct kvm_vcpu;
>
>  extern char __kvm_hyp_init[];
> @@ -48,6 +49,7 @@ extern char __kvm_hyp_code_start[];
>  extern char __kvm_hyp_code_end[];
>
>  extern void __kvm_flush_vm_context(void);
> +extern void __kvm_tlb_flush_vmid(struct kvm *kvm);
>
>  extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
>  #endif
> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
> index d7e3398..d86ce39 100644
> --- a/arch/arm/include/asm/kvm_host.h
> +++ b/arch/arm/include/asm/kvm_host.h
> @@ -157,4 +157,22 @@ struct kvm_vcpu_stat {
>  struct kvm_vcpu_init;
>  int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
>  			const struct kvm_vcpu_init *init);
> +
> +#define KVM_ARCH_WANT_MMU_NOTIFIER
> +struct kvm;
> +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
> +int kvm_unmap_hva_range(struct kvm *kvm,
> +			unsigned long start, unsigned long end);
> +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
> +
> +/* We do not have shadow page tables, hence the empty hooks */
> +static inline int kvm_age_hva(struct kvm *kvm, unsigned long hva)
> +{
> +	return 0;
> +}
> +
> +static inline int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
> +{
> +	return 0;
> +}
>  #endif /* __ARM_KVM_HOST_H__ */
> diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
> index 8252921..11f4c3a 100644
> --- a/arch/arm/include/asm/kvm_mmu.h
> +++ b/arch/arm/include/asm/kvm_mmu.h
> @@ -33,4 +33,13 @@ int create_hyp_mappings(void *from, void *to);
>  int create_hyp_io_mappings(void *from, void *to, phys_addr_t);
>  void free_hyp_pmds(void);
>
> +int kvm_alloc_stage2_pgd(struct kvm *kvm);
> +void kvm_free_stage2_pgd(struct kvm *kvm);
> +int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
> +			  phys_addr_t pa, unsigned long size);
> +
> +int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run);
> +
> +void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
> +
>  #endif /* __ARM_KVM_MMU_H__ */
> diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
> index 1169a8a..7351eee 100644
> --- a/arch/arm/include/asm/pgtable-3level.h
> +++ b/arch/arm/include/asm/pgtable-3level.h
> @@ -102,6 +102,15 @@
>   */
>  #define L_PGD_SWAPPER	(_AT(pgdval_t, 1) << 55)	/* swapper_pg_dir entry */
>
> +/*
> + * 2nd stage PTE definitions for LPAE.
> + */
> +#define L_PTE2_SHARED		L_PTE_SHARED
> +#define L_PTE2_READ		(_AT(pteval_t, 1) << 6)	/* HAP[0] */
> +#define L_PTE2_WRITE		(_AT(pteval_t, 1) << 7)	/* HAP[1] */
> +#define L_PTE2_NORM_WB		(_AT(pteval_t, 3) << 4)	/* MemAttr[3:2] */
> +#define L_PTE2_INNER_WB		(_AT(pteval_t, 3) << 2)	/* MemAttr[1:0] */
> +
>  #ifndef __ASSEMBLY__
>
>  #define pud_none(pud)		(!pud_val(pud))
> diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h
> index bc83540..a31d0e9 100644
> --- a/arch/arm/include/asm/pgtable.h
> +++ b/arch/arm/include/asm/pgtable.h
> @@ -70,6 +70,7 @@ extern void __pgd_error(const char *file, int line, pgd_t);
>
>  extern pgprot_t		pgprot_user;
>  extern pgprot_t		pgprot_kernel;
> +extern pgprot_t		pgprot_guest;
>
>  #define _MOD_PROT(p, b)	__pgprot(pgprot_val(p) | (b))
>
> @@ -83,6 +84,9 @@ extern pgprot_t pgprot_kernel;
>  #define PAGE_KERNEL		_MOD_PROT(pgprot_kernel, L_PTE_XN)
>  #define PAGE_KERNEL_EXEC	pgprot_kernel
>  #define PAGE_HYP		_MOD_PROT(pgprot_kernel, L_PTE_USER)
> +#define PAGE_KVM_GUEST		_MOD_PROT(pgprot_guest, L_PTE2_READ | \
> +					  L_PTE2_NORM_WB | L_PTE2_INNER_WB | \
> +					  L_PTE2_SHARED)
>
>  #define __PAGE_NONE		__pgprot(_L_PTE_DEFAULT | L_PTE_RDONLY | L_PTE_XN)
>  #define __PAGE_SHARED		__pgprot(_L_PTE_DEFAULT | L_PTE_USER | L_PTE_XN)
> diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig
> index 83abbe0..7fa50d3 100644
> --- a/arch/arm/kvm/Kconfig
> +++ b/arch/arm/kvm/Kconfig
> @@ -36,6 +36,7 @@ config KVM_ARM_HOST
>  	depends on KVM
>  	depends on MMU
>  	depends on CPU_V7 && ARM_VIRT_EXT
> +	select MMU_NOTIFIER
>  	---help---
>  	  Provides host support for ARM processors.
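
The VMID recycling described in the commit message is easier to follow with a
concrete sketch. The hardware VMID field is 8 bits wide, hence the 255-guest
limit per generation; once the space is exhausted, a global generation counter
is bumped and all TLBs and caches are flushed. The code below is only a
standalone illustration of that generation-counter idea, not the series'
actual implementation (which lands in a later patch); every name and helper
in it is made up:

#include <stdint.h>

#define VMID_BITS	8
#define VMID_MASK	((1u << VMID_BITS) - 1)

struct vm {
	uint64_t vmid_gen;	/* generation this VM's VMID was assigned in */
	uint32_t vmid;		/* hardware VMID tagging this VM's TLB entries */
};

static uint64_t global_vmid_gen = 1;
static uint32_t next_vmid = 1;	/* VMID 0 is reserved */

static void flush_all_tlbs(void)
{
	/* stands in for __kvm_flush_vm_context() */
}

static void update_vmid(struct vm *vm)
{
	if (vm->vmid_gen == global_vmid_gen)
		return;		/* VMID still valid in this generation */

	if (next_vmid > VMID_MASK) {
		/* all 255 VMIDs handed out: flush everything, restart */
		global_vmid_gen++;
		next_vmid = 1;
		flush_all_tlbs();
	}

	vm->vmid = next_vmid++;
	vm->vmid_gen = global_vmid_gen;
}

Note that starting each VM with vmid_gen = 0, as kvm_arch_init_vm() does in
the hunk below, guarantees the first such update assigns a fresh VMID.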
>
> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
> index 0b1c466..3f97e7c 100644
> --- a/arch/arm/kvm/arm.c
> +++ b/arch/arm/kvm/arm.c
> @@ -82,12 +82,34 @@ void kvm_arch_sync_events(struct kvm *kvm)
>  {
>  }
>
> +/**
> + * kvm_arch_init_vm - initializes a VM data structure
> + * @kvm:	pointer to the KVM struct
> + */
>  int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
>  {
> +	int ret = 0;
> +
>  	if (type)
>  		return -EINVAL;
>
> -	return 0;
> +	ret = kvm_alloc_stage2_pgd(kvm);
> +	if (ret)
> +		goto out_fail_alloc;
> +	spin_lock_init(&kvm->arch.pgd_lock);
> +
> +	ret = create_hyp_mappings(kvm, kvm + 1);
> +	if (ret)
> +		goto out_free_stage2_pgd;
> +
> +	/* Mark the initial VMID generation invalid */
> +	kvm->arch.vmid_gen = 0;
> +
> +	return ret;
> +out_free_stage2_pgd:
> +	kvm_free_stage2_pgd(kvm);
> +out_fail_alloc:
> +	return ret;
>  }
>
>  int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
> @@ -105,10 +127,16 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
>  	return 0;
>  }
>
> +/**
> + * kvm_arch_destroy_vm - destroy the VM data structure
> + * @kvm:	pointer to the KVM struct
> + */
>  void kvm_arch_destroy_vm(struct kvm *kvm)
>  {
>  	int i;
>
> +	kvm_free_stage2_pgd(kvm);
> +
>  	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
>  		if (kvm->vcpus[i]) {
>  			kvm_arch_vcpu_free(kvm->vcpus[i]);
> @@ -184,7 +212,13 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
>  	if (err)
>  		goto free_vcpu;
>
> +	err = create_hyp_mappings(vcpu, vcpu + 1);
> +	if (err)
> +		goto vcpu_uninit;
> +
>  	return vcpu;
> +vcpu_uninit:
> +	kvm_vcpu_uninit(vcpu);
>  free_vcpu:
>  	kmem_cache_free(kvm_vcpu_cache, vcpu);
>  out:
> @@ -193,6 +227,8 @@ out:
>
>  void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
>  {
> +	kvm_mmu_free_memory_caches(vcpu);
> +	kmem_cache_free(kvm_vcpu_cache, vcpu);
>  }
>
>  void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
> diff --git a/arch/arm/kvm/exports.c b/arch/arm/kvm/exports.c
> index 8ebdf07..f39f823 100644
> --- a/arch/arm/kvm/exports.c
> +++ b/arch/arm/kvm/exports.c
> @@ -33,5 +33,6 @@ EXPORT_SYMBOL_GPL(__kvm_hyp_code_end);
>  EXPORT_SYMBOL_GPL(__kvm_vcpu_run);
>
>  EXPORT_SYMBOL_GPL(__kvm_flush_vm_context);
> +EXPORT_SYMBOL_GPL(__kvm_tlb_flush_vmid);
>
>  EXPORT_SYMBOL_GPL(smp_send_reschedule);
> diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S
> index bf09801..edf9ed5 100644
> --- a/arch/arm/kvm/interrupts.S
> +++ b/arch/arm/kvm/interrupts.S
> @@ -31,6 +31,14 @@ __kvm_hyp_code_start:
>  	.globl __kvm_hyp_code_start
>
>  @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
> +@  Flush per-VMID TLBs
> +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
> +
> +ENTRY(__kvm_tlb_flush_vmid)
> +	bx	lr
> +ENDPROC(__kvm_tlb_flush_vmid)
> +
> +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
>  @  Flush TLBs and instruction caches of current CPU for all VMIDs
>  @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
>
> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> index 6a7dfd4..6cb0e38 100644
> --- a/arch/arm/kvm/mmu.c
> +++ b/arch/arm/kvm/mmu.c
> @@ -23,10 +23,43 @@
>  #include <asm/pgalloc.h>
>  #include <asm/kvm_arm.h>
>  #include <asm/kvm_mmu.h>
> +#include <asm/kvm_asm.h>
>  #include <asm/mach/map.h>
>
>  static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
>
> +static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
> +				  int min, int max)
> +{
> +	void *page;
> +
> +	BUG_ON(max > KVM_NR_MEM_OBJS);
> +	if (cache->nobjs >= min)
> +		return 0;
> +	while (cache->nobjs < max) {
> +		page = (void *)__get_free_page(PGALLOC_GFP);
> +		if (!page)
> +			return -ENOMEM;
> +		cache->objects[cache->nobjs++] = page;
> +	}
> +	return 0;
> +}
> +
> +static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
> +{
> +	while (mc->nobjs)
> +		free_page((unsigned long)mc->objects[--mc->nobjs]);
> +}
> +
> +static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
> +{
> +	void *p;
> +
> +	BUG_ON(!mc || !mc->nobjs);
> +	p = mc->objects[--mc->nobjs];
> +	return p;
> +}
> +
>  static void free_ptes(pmd_t *pmd, unsigned long addr)
>  {
>  	pte_t *pte;
> @@ -200,7 +233,347 @@ int create_hyp_io_mappings(void *from, void *to, phys_addr_t addr)
>  	return __create_hyp_mappings(from, to, &pfn);
>  }
>
> +/**
> + * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
> + * @kvm:	The KVM struct pointer for the VM.
> + *
> + * Allocates only the 1st level table, whose size is defined by PGD2_ORDER
> + * (it can support either full 40-bit input addresses or be limited to
> + * 32-bit input addresses). Clears the allocated pages.
> + *
> + * Note we don't need locking here as this is only called when the VM is
> + * created, which can only be done once.
> + */
> +int kvm_alloc_stage2_pgd(struct kvm *kvm)
> +{
> +	pgd_t *pgd;
> +
> +	if (kvm->arch.pgd != NULL) {
> +		kvm_err("kvm_arch already initialized?\n");
> +		return -EINVAL;
> +	}
> +
> +	pgd = (pgd_t *)__get_free_pages(GFP_KERNEL, PGD2_ORDER);
> +	if (!pgd)
> +		return -ENOMEM;
> +
> +	memset(pgd, 0, PTRS_PER_PGD2 * sizeof(pgd_t));
> +	kvm->arch.pgd = pgd;
> +
> +	return 0;
> +}
> +
> +static void free_guest_pages(pte_t *pte, unsigned long addr)
> +{
> +	unsigned int i;
> +	struct page *page, *pte_page;
> +
> +	pte_page = virt_to_page(pte);
> +
> +	for (i = 0; i < PTRS_PER_PTE; i++) {
> +		if (pte_present(*pte)) {
> +			unsigned long pfn = pte_pfn(*pte);
> +
> +			if (pfn_valid(pfn)) { /* Skip over device memory */
> +				page = pfn_to_page(pfn);
> +				put_page(page);
> +			}
> +			put_page(pte_page);
> +		}
> +		pte++;
> +	}
> +}
> +
> +static void free_stage2_ptes(pmd_t *pmd, unsigned long addr)
> +{
> +	unsigned int i;
> +	pte_t *pte;
> +	struct page *page, *pmd_page;
> +
> +	pmd_page = virt_to_page(pmd);
> +
> +	for (i = 0; i < PTRS_PER_PMD; i++, addr += PMD_SIZE) {
> +		BUG_ON(pmd_sect(*pmd));
> +		if (!pmd_none(*pmd) && pmd_table(*pmd)) {
> +			pte = pte_offset_kernel(pmd, addr);
> +			free_guest_pages(pte, addr);
> +			page = virt_to_page((void *)pte);
> +			WARN_ON(page_count(page) != 1);
> +			pte_free_kernel(NULL, pte);
> +
> +			put_page(pmd_page);
> +		}
> +		pmd++;
> +	}
> +}
> +
> +/**
> + * kvm_free_stage2_pgd - free all stage-2 tables
> + * @kvm:	The KVM struct pointer for the VM.
> + *
> + * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all
> + * underlying level-2 and level-3 tables before freeing the actual level-1
> + * table and setting the struct pointer to NULL.
> + *
> + * Note we don't need locking here as this is only called when the VM is
> + * destroyed, which can only be done once.
> + */
> +void kvm_free_stage2_pgd(struct kvm *kvm)
> +{
> +	pgd_t *pgd;
> +	pud_t *pud;
> +	pmd_t *pmd;
> +	unsigned long long i, addr;
> +	struct page *page, *pud_page;
> +
> +	if (kvm->arch.pgd == NULL)
> +		return;
> +
> +	/*
> +	 * We do this slightly differently than other places, since we need
> +	 * more than 32 bits and, for instance, pgd_addr_end converts to
> +	 * unsigned long.
> +	 */
> +	addr = 0;
> +	for (i = 0; i < PTRS_PER_PGD2; i++) {
> +		addr = i * (unsigned long long)PGDIR_SIZE;
> +		pgd = kvm->arch.pgd + i;
> +		pud = pud_offset(pgd, addr);
> +		pud_page = virt_to_page(pud);
> +
> +		if (pud_none(*pud))
> +			continue;
> +
> +		BUG_ON(pud_bad(*pud));
> +
> +		pmd = pmd_offset(pud, addr);
> +		free_stage2_ptes(pmd, addr);
> +		page = virt_to_page((void *)pmd);
> +		WARN_ON(page_count(page) != 1);
> +		pmd_free(NULL, pmd);
> +		put_page(pud_page);
> +	}
> +
> +	WARN_ON(page_count(pud_page) != 1);
> +	free_pages((unsigned long)kvm->arch.pgd, PGD2_ORDER);
> +	kvm->arch.pgd = NULL;
> +}
> +
> +/*
> + * Clear a stage-2 PTE, lowering the various ref-counts. Also takes
> + * care of invalidating the TLBs. Must be called while holding
> + * pgd_lock, otherwise another faulting VCPU may come in and mess
> + * things up behind our back.
> + */
> +static void stage2_clear_pte(struct kvm *kvm, phys_addr_t addr)
> +{
> +	pgd_t *pgd;
> +	pud_t *pud;
> +	pmd_t *pmd;
> +	pte_t *pte;
> +	struct page *page;
> +
> +	kvm_debug("Clearing PTE @ %08llx\n", addr);
> +	pgd = kvm->arch.pgd + pgd_index(addr);
> +	pud = pud_offset(pgd, addr);
> +	BUG_ON(pud_none(*pud));
> +
> +	pmd = pmd_offset(pud, addr);
> +	BUG_ON(pmd_none(*pmd));
> +
> +	pte = pte_offset_kernel(pmd, addr);
> +	set_pte_ext(pte, __pte(0), 0);
> +
> +	page = virt_to_page(pte);
> +	put_page(page);
> +	if (page_count(page) != 1) {
> +		__kvm_tlb_flush_vmid(kvm);
> +		return;
> +	}
> +
> +	/* Need to remove pte page */
> +	pmd_clear(pmd);
> +	__kvm_tlb_flush_vmid(kvm);
> +	pte_free_kernel(NULL, (pte_t *)((unsigned long)pte & PAGE_MASK));
> +
> +	page = virt_to_page(pmd);
> +	put_page(page);
> +	if (page_count(page) != 1)
> +		return;
> +
> +	/*
> +	 * Need to remove pmd page. This is the worst case, and we end
> +	 * up invalidating the TLB twice. No big deal.
> +	 */
> +	pud_clear(pud);
> +	__kvm_tlb_flush_vmid(kvm);
> +	pmd_free(NULL, (pmd_t *)((unsigned long)pmd & PAGE_MASK));
> +
> +	page = virt_to_page(pud);
> +	put_page(page);
> +}
> +
> +static void stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
> +			   phys_addr_t addr, const pte_t *new_pte)
> +{
> +	pgd_t *pgd;
> +	pud_t *pud;
> +	pmd_t *pmd;
> +	pte_t *pte;
> +
> +	/* Create 2nd stage page table mapping - Level 1 */
> +	pgd = kvm->arch.pgd + pgd_index(addr);
> +	pud = pud_offset(pgd, addr);
> +	if (pud_none(*pud)) {
> +		if (!cache)
> +			return; /* ignore calls from kvm_set_spte_hva */
> +		pmd = mmu_memory_cache_alloc(cache);
> +		pud_populate(NULL, pud, pmd);
> +		pmd += pmd_index(addr);
> +		get_page(virt_to_page(pud));
> +	} else
> +		pmd = pmd_offset(pud, addr);
> +
> +	/* Create 2nd stage page table mapping - Level 2 */
> +	if (pmd_none(*pmd)) {
> +		if (!cache)
> +			return; /* ignore calls from kvm_set_spte_hva */
> +		pte = mmu_memory_cache_alloc(cache);
> +		clean_pte_table(pte);
> +		pmd_populate_kernel(NULL, pmd, pte);
> +		pte += pte_index(addr);
> +		get_page(virt_to_page(pmd));
> +	} else
> +		pte = pte_offset_kernel(pmd, addr);
> +
> +	/* Create 2nd stage page table mapping - Level 3 */
> +	BUG_ON(pte_none(pte));
> +	set_pte_ext(pte, *new_pte, 0);
> +	get_page(virt_to_page(pte));
> +}
> +
> +/**
> + * kvm_phys_addr_ioremap - map a device range to guest IPA
> + *
> + * @kvm:	The KVM pointer
> + * @guest_ipa:	The IPA at which to insert the mapping
> + * @pa:		The physical address of the device
> + * @size:	The size of the mapping
> + */
> +int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
> +			  phys_addr_t pa, unsigned long size)
> +{
> +	phys_addr_t addr, end;
> +	pgprot_t prot;
> +	int ret = 0;
> +	unsigned long pfn;
> +	struct kvm_mmu_memory_cache cache = { 0, };
> +
> +	end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
> +	prot = __pgprot(get_mem_type_prot_pte(MT_DEVICE) | L_PTE_USER |
> +			L_PTE2_READ | L_PTE2_WRITE);
> +	pfn = __phys_to_pfn(pa);
> +
> +	for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
> +		pte_t pte = pfn_pte(pfn, prot);
> +
> +		ret = mmu_topup_memory_cache(&cache, 2, 2);
> +		if (ret)
> +			goto out;
> +		spin_lock(&kvm->arch.pgd_lock);
> +		stage2_set_pte(kvm, &cache, addr, &pte);
> +		spin_unlock(&kvm->arch.pgd_lock);
> +
> +		pfn++;
> +	}
> +
> +out:
> +	mmu_free_memory_cache(&cache);
> +	return ret;
> +}
> +
>  int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
>  {
>  	return -EINVAL;
>  }
> +
> +static bool hva_to_gpa(struct kvm *kvm, unsigned long hva, gpa_t *gpa)

A single hva can have multiple gpas mapped, no? At least that's what I
gathered from the discussion about my attempt at a function similar to
this :).

I'm also having a hard time following your mmu code in general. When do
pages get mapped? Where do they get mapped from?


Alex
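
On the multiple-gpa point: userspace can register several memory slots backed
by overlapping userspace mappings, so a reverse hva lookup has to visit every
slot rather than stop at the first hit. A minimal standalone sketch of such a
walk follows; it uses simplified stand-in types, not the real memslot
structures from include/linux/kvm_host.h:

struct memslot {
	unsigned long userspace_addr;	/* hva where the slot starts */
	unsigned long base_gfn;		/* guest frame number of slot start */
	unsigned long npages;		/* slot length in pages */
};

#define PAGE_SHIFT 12	/* assuming 4K pages */

/* Invoke cb() once for every gpa that the given hva is mapped at. */
static void for_each_gpa_of_hva(const struct memslot *slots, int nslots,
				unsigned long hva,
				void (*cb)(unsigned long gpa))
{
	int i;

	for (i = 0; i < nslots; i++) {
		unsigned long start = slots[i].userspace_addr;
		unsigned long end = start + (slots[i].npages << PAGE_SHIFT);

		if (hva >= start && hva < end)
			cb((slots[i].base_gfn << PAGE_SHIFT) + (hva - start));
		/* deliberately no break: another slot may alias this hva */
	}
}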