On Thu, Jun 21, 2012 at 8:29 AM, Gleb Natapov <gleb at redhat.com> wrote: > On Fri, Jun 15, 2012 at 03:08:22PM -0400, Christoffer Dall wrote: >> From: Christoffer Dall <cdall at cs.columbia.edu> >> >> This commit introduces the framework for guest memory management >> through the use of 2nd stage translation. Each VM has a pointer >> to a level-1 table (the pgd field in struct kvm_arch) which is >> used for the 2nd stage translations. Entries are added when handling >> guest faults (later patch) and the table itself can be allocated and >> freed through the following functions implemented in >> arch/arm/kvm/arm_mmu.c: >> ?- kvm_alloc_stage2_pgd(struct kvm *kvm); >> ?- kvm_free_stage2_pgd(struct kvm *kvm); >> >> Further, each entry in TLBs and caches are tagged with a VMID >> identifier in addition to ASIDs. The VMIDs are assigned consecutively >> to VMs in the order that VMs are executed, and caches and tlbs are >> invalidated when the VMID space has been used to allow for more than >> 255 simultaenously running guests. >> >> The 2nd stage pgd is allocated in kvm_arch_init_vm(). The table is >> freed in kvm_arch_destroy_vm(). Both functions are called from the main >> KVM code. >> >> Signed-off-by: Christoffer Dall <c.dall at virtualopensystems.com> >> --- >> ?arch/arm/include/asm/kvm_arm.h | ? ?2 - >> ?arch/arm/include/asm/kvm_mmu.h | ? ?5 ++ >> ?arch/arm/kvm/arm.c ? ? ? ? ? ? | ? 65 ++++++++++++++++++++++--- >> ?arch/arm/kvm/mmu.c ? ? ? ? ? ? | ?103 ++++++++++++++++++++++++++++++++++++++++ >> ?4 files changed, 166 insertions(+), 9 deletions(-) >> >> diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h >> index 7f30cbd..257242f 100644 >> --- a/arch/arm/include/asm/kvm_arm.h >> +++ b/arch/arm/include/asm/kvm_arm.h >> @@ -62,7 +62,7 @@ >> ? * SWIO: ? ? Turn set/way invalidates into set/way clean+invalidate >> ? */ >> ?#define HCR_GUEST_MASK (HCR_TSC | HCR_TWI | HCR_VM | HCR_BSU_IS | HCR_FB | \ >> - ? ? ? ? ? ? ? ? ? ? 
HCR_AMO | HCR_IMO | HCR_FMO | HCR_FMO | HCR_SWIO) >> + ? ? ? ? ? ? ? ? ? ? HCR_TAC | HCR_AMO | HCR_IMO | HCR_FMO | HCR_SWIO) >> >> ?/* Hyp System Control Register (HSCTLR) bits */ >> ?#define HSCTLR_TE ? ?(1 << 30) >> diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h >> index 1aa1af4..d95662eb 100644 >> --- a/arch/arm/include/asm/kvm_mmu.h >> +++ b/arch/arm/include/asm/kvm_mmu.h >> @@ -34,4 +34,9 @@ int kvm_hyp_pgd_alloc(void); >> ?pgd_t *kvm_hyp_pgd_get(void); >> ?void kvm_hyp_pgd_free(void); >> >> +int kvm_alloc_stage2_pgd(struct kvm *kvm); >> +void kvm_free_stage2_pgd(struct kvm *kvm); >> + >> +int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run); >> + >> ?#endif /* __ARM_KVM_MMU_H__ */ >> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c >> index efe130c..81babe9 100644 >> --- a/arch/arm/kvm/arm.c >> +++ b/arch/arm/kvm/arm.c >> @@ -38,6 +38,13 @@ >> >> ?static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); >> >> +/* The VMID used in the VTTBR */ >> +#define VMID_BITS ? ? ? ? ? ? ? 8 >> +#define VMID_MASK ? ? ? ? ? ? ? ((1 << VMID_BITS) - 1) >> +#define VMID_FIRST_GENERATION ? ? ? ?(1 << VMID_BITS) >> +static u64 next_vmid; ? ? ? ? ? ? ? ?/* The next available VMID in the sequence */ >> +DEFINE_SPINLOCK(kvm_vmid_lock); >> + >> ?int kvm_arch_hardware_enable(void *garbage) >> ?{ >> ? ? ? return 0; >> @@ -70,14 +77,6 @@ void kvm_arch_sync_events(struct kvm *kvm) >> ?{ >> ?} >> >> -int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) >> -{ >> - ? ? if (type) >> - ? ? ? ? ? ? return -EINVAL; >> - >> - ? ? return 0; >> -} >> - >> ?int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) >> ?{ >> ? ? ? return VM_FAULT_SIGBUS; >> @@ -93,10 +92,46 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) >> ? ? ? return 0; >> ?} >> >> +/** >> + * kvm_arch_init_vm - initializes a VM data structure >> + * @kvm: ? ? 
pointer to the KVM struct >> + */ >> +int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) >> +{ >> + ? ? int ret = 0; >> + >> + ? ? if (type) >> + ? ? ? ? ? ? return -EINVAL; >> + >> + ? ? ret = kvm_alloc_stage2_pgd(kvm); >> + ? ? if (ret) >> + ? ? ? ? ? ? goto out_fail_alloc; >> + ? ? mutex_init(&kvm->arch.pgd_mutex); >> + >> + ? ? ret = create_hyp_mappings(kvm, kvm + 1); >> + ? ? if (ret) >> + ? ? ? ? ? ? goto out_free_stage2_pgd; >> + >> + ? ? /* Mark the initial VMID invalid */ >> + ? ? kvm->arch.vmid = 0; >> + >> + ? ? return ret; >> +out_free_stage2_pgd: >> + ? ? kvm_free_stage2_pgd(kvm); >> +out_fail_alloc: >> + ? ? return ret; >> +} >> + >> +/** >> + * kvm_arch_destroy_vm - destroy the VM data structure >> + * @kvm: ? ? pointer to the KVM struct >> + */ >> ?void kvm_arch_destroy_vm(struct kvm *kvm) >> ?{ >> ? ? ? int i; >> >> + ? ? kvm_free_stage2_pgd(kvm); >> + >> ? ? ? for (i = 0; i < KVM_MAX_VCPUS; ++i) { >> ? ? ? ? ? ? ? if (kvm->vcpus[i]) { >> ? ? ? ? ? ? ? ? ? ? ? kvm_arch_vcpu_free(kvm->vcpus[i]); >> @@ -172,6 +207,10 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) >> ? ? ? if (err) >> ? ? ? ? ? ? ? goto free_vcpu; >> >> + ? ? err = create_hyp_mappings(vcpu, vcpu + 1); >> + ? ? if (err) >> + ? ? ? ? ? ? goto free_vcpu; >> + >> ? ? ? return vcpu; >> ?free_vcpu: >> ? ? ? kmem_cache_free(kvm_vcpu_cache, vcpu); >> @@ -181,6 +220,7 @@ out: >> >> ?void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) >> ?{ >> + ? ? kmem_cache_free(kvm_vcpu_cache, vcpu); >> ?} >> >> ?void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) >> @@ -416,6 +456,15 @@ int kvm_arch_init(void *opaque) >> ? ? ? if (err) >> ? ? ? ? ? ? ? goto out_err; >> >> + ? ? /* >> + ? ? ?* The upper 56 bits of VMIDs are used to identify the generation >> + ? ? ?* counter, so VMIDs initialized to 0, having generation == 0, will >> + ? ? ?* never be considered valid and therefore a new VMID must always be >> + ? ? ?* assigned. 
When the VMID generation rolls over, we start from >> + ? ? ?* VMID_FIRST_GENERATION again. >> + ? ? ?*/ >> + ? ? next_vmid = VMID_FIRST_GENERATION; >> + >> ? ? ? return 0; >> ?out_err: >> ? ? ? return err; >> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c >> index a320b56a..b256540 100644 >> --- a/arch/arm/kvm/mmu.c >> +++ b/arch/arm/kvm/mmu.c >> @@ -159,6 +159,109 @@ out: >> ? ? ? return err; >> ?} >> >> +/** >> + * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation. >> + * @kvm: ? ? The KVM struct pointer for the VM. >> + * >> + * Allocates the 1st level table only of size defined by PGD2_ORDER (can >> + * support either full 40-bit input addresses or limited to 32-bit input >> + * addresses). Clears the allocated pages. >> + */ >> +int kvm_alloc_stage2_pgd(struct kvm *kvm) >> +{ >> + ? ? pgd_t *pgd; >> + >> + ? ? if (kvm->arch.pgd != NULL) { >> + ? ? ? ? ? ? kvm_err("kvm_arch already initialized?\n"); >> + ? ? ? ? ? ? return -EINVAL; >> + ? ? } >> + >> + ? ? pgd = (pgd_t *)__get_free_pages(GFP_KERNEL, PGD2_ORDER); >> + ? ? if (!pgd) >> + ? ? ? ? ? ? return -ENOMEM; >> + >> + ? ? memset(pgd, 0, PTRS_PER_PGD2 * sizeof(pgd_t)); >> + ? ? kvm->arch.pgd = pgd; >> + >> + ? ? return 0; >> +} >> + >> +static void free_guest_pages(pte_t *pte, unsigned long addr) >> +{ >> + ? ? unsigned int i; >> + ? ? struct page *page; >> + >> + ? ? for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { > Hmm, "addr" is not used. > indeed it's not >> + ? ? ? ? ? ? if (!pte_present(*pte)) >> + ? ? ? ? ? ? ? ? ? ? goto next_page; > Why goto instead of: > historic reasons, thanks. > ? ? ? ? ? ? ?if(pte_present(*pte)) { >> + ? ? ? ? ? ? page = pfn_to_page(pte_pfn(*pte)); >> + ? ? ? ? ? ? put_page(page); > ? ? ? ? ? ? ?} > >> +next_page: >> + ? ? ? ? ? ? pte++; >> + ? ? } >> +} > >> + >> +static void free_stage2_ptes(pmd_t *pmd, unsigned long addr) >> +{ >> + ? ? unsigned int i; >> + ? ? pte_t *pte; >> + ? ? struct page *page; >> + >> + ? ? 
for (i = 0; i < PTRS_PER_PMD; i++, addr += PMD_SIZE) { >> + ? ? ? ? ? ? BUG_ON(pmd_sect(*pmd)); >> + ? ? ? ? ? ? if (!pmd_none(*pmd) && pmd_table(*pmd)) { >> + ? ? ? ? ? ? ? ? ? ? pte = pte_offset_kernel(pmd, addr); >> + ? ? ? ? ? ? ? ? ? ? free_guest_pages(pte, addr); >> + ? ? ? ? ? ? ? ? ? ? page = virt_to_page((void *)pte); >> + ? ? ? ? ? ? ? ? ? ? WARN_ON(atomic_read(&page->_count) != 1); >> + ? ? ? ? ? ? ? ? ? ? pte_free_kernel(NULL, pte); >> + ? ? ? ? ? ? } >> + ? ? ? ? ? ? pmd++; >> + ? ? } >> +} >> + >> +/** >> + * kvm_free_stage2_pgd - free all stage-2 tables >> + * @kvm: ? ? The KVM struct pointer for the VM. >> + * >> + * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all >> + * underlying level-2 and level-3 tables before freeing the actual level-1 table >> + * and setting the struct pointer to NULL. >> + */ >> +void kvm_free_stage2_pgd(struct kvm *kvm) >> +{ >> + ? ? pgd_t *pgd; >> + ? ? pud_t *pud; >> + ? ? pmd_t *pmd; >> + ? ? unsigned long long i, addr; >> + >> + ? ? if (kvm->arch.pgd == NULL) >> + ? ? ? ? ? ? return; >> + >> + ? ? /* >> + ? ? ?* We do this slightly different than other places, since we need more >> + ? ? ?* than 32 bits and for instance pgd_addr_end converts to unsigned long. >> + ? ? ?*/ >> + ? ? addr = 0; >> + ? ? for (i = 0; i < PTRS_PER_PGD2; i++) { >> + ? ? ? ? ? ? addr = i * (unsigned long long)PGDIR_SIZE; >> + ? ? ? ? ? ? pgd = kvm->arch.pgd + i; >> + ? ? ? ? ? ? pud = pud_offset(pgd, addr); >> + >> + ? ? ? ? ? ? if (pud_none(*pud)) >> + ? ? ? ? ? ? ? ? ? ? continue; >> + >> + ? ? ? ? ? ? BUG_ON(pud_bad(*pud)); >> + >> + ? ? ? ? ? ? pmd = pmd_offset(pud, addr); >> + ? ? ? ? ? ? free_stage2_ptes(pmd, addr); >> + ? ? ? ? ? ? pmd_free(NULL, pmd); >> + ? ? } >> + >> + ? ? free_pages((unsigned long)kvm->arch.pgd, PGD2_ORDER); >> + ? ? kvm->arch.pgd = NULL; >> +} >> + >> ?int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) >> ?{ >> ? ? ? 
return -EINVAL; >> >> -- >> To unsubscribe from this list: send the line "unsubscribe kvm" in >> the body of a message to majordomo at vger.kernel.org >> More majordomo info at http://vger.kernel.org/majordomo-info.html > > -- > Gleb.