From: Lai Jiangshan <jiangshan.ljs@xxxxxxxxxxxx>

Currently, pae_root is a special root page table. This patch adds a
facility that allows kvm_mmu_get_page() to allocate a pae_root shadow
page.

When kvm_mmu_get_page() is called with role.level == PT32E_ROOT_LEVEL
and vcpu->arch.mmu->shadow_root_level == PT32E_ROOT_LEVEL, it gets a
PAE root pagetable and sets role.pae_root=1 so that the page can be
freed correctly later.

The role.pae_root bit is needed in the page role because:
  o PAE roots must be allocated below 4GB (for kvm_mmu_get_page())
  o PAE roots can not be encrypted (for kvm_mmu_get_page())
  o PAE roots must be re-encrypted when freed (for kvm_mmu_free_page())
  o A PAE root's PDPTEs have a special format (for link_shadow_page())
  o A decrypted low-address pagetable must not be shared with
    non-PAE-root pages, and vice versa (for kvm_mmu_get_page(); this
    is the crucial reason)

The role.pae_root tests in link_shadow_page() and kvm_mmu_get_page()
could possibly be replaced with checks on shadow_root_level and
role.level, but kvm_mmu_free_page() cannot use
vcpu->arch.mmu->shadow_root_level.

PAE roots must be allocated below 4GB because CR3 has only 32 bits, so
a cache (mmu_pae_root_cache) is introduced.

No functionality is changed: this code is not activated yet, because
when vcpu->arch.mmu->shadow_root_level == PT32E_ROOT_LEVEL,
kvm_mmu_get_page() is currently only called for level == 1 or 2.

Signed-off-by: Lai Jiangshan <jiangshan.ljs@xxxxxxxxxxxx>
---
 Documentation/virt/kvm/mmu.rst  |  2 +
 arch/x86/include/asm/kvm_host.h |  9 +++-
 arch/x86/kvm/mmu/mmu.c          | 78 +++++++++++++++++++++++++++++++--
 arch/x86/kvm/mmu/paging_tmpl.h  |  1 +
 4 files changed, 86 insertions(+), 4 deletions(-)

diff --git a/Documentation/virt/kvm/mmu.rst b/Documentation/virt/kvm/mmu.rst
index dee0e96d694a..800f1eba55b3 100644
--- a/Documentation/virt/kvm/mmu.rst
+++ b/Documentation/virt/kvm/mmu.rst
@@ -209,6 +209,8 @@ Shadow pages contain the following information:
     top with role.glevel = guest paging level and acts as passthrough sp
     and its contents are specially installed rather than the translations
     of the corresponding guest pagetable.
+  role.pae_root:
+    Is 1 if it is a PAE root.
   gfn:
     Either the guest page table containing the translations shadowed by this
     page, or the base page frame for linear translations.  See role.direct.
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 67e1bccaf472..658c493e7617 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -313,6 +313,11 @@ struct kvm_kernel_irq_routing_entry;
  *   - on top of this, smep_andnot_wp and smap_andnot_wp are only set if
  *     cr0_wp=0, therefore these three bits only give rise to 5 possibilities.
  *
+ *   - pae_root can only be set when level=3, so the combinations of level
+ *     and pae_root can be seen as 2/3/3-pae_root/4/5, i.e. 5 possibilities.
+ *     Combined with cr0_wp, smep_andnot_wp and smap_andnot_wp, that gives
+ *     5*5 = 25 < 2^5 combinations.
+ *
  * Therefore, the maximum number of possible upper-level shadow pages for a
  * single gfn is a bit less than 2^15.
  */
@@ -332,7 +337,8 @@ union kvm_mmu_page_role {
 		unsigned ad_disabled:1;
 		unsigned guest_mode:1;
 		unsigned glevel:4;
-		unsigned :2;
+		unsigned pae_root:1;
+		unsigned :1;
 
 		/*
 		 * This is left at the top of the word so that
@@ -699,6 +705,7 @@ struct kvm_vcpu_arch {
 	struct kvm_mmu_memory_cache mmu_shadow_page_cache;
 	struct kvm_mmu_memory_cache mmu_gfn_array_cache;
 	struct kvm_mmu_memory_cache mmu_page_header_cache;
+	void *mmu_pae_root_cache;
 
 	/*
 	 * QEMU userspace and the guest each have their own FPU state.
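Illustrative sketch, not part of the diff: the "crucial reason" bullet
in the changelog shows up concretely in kvm_mmu_get_page()'s hash
lookup, sketched below on the assumption that it still matches on the
full role word. Without the new bit, a decrypted below-4GB PAE root and
an ordinary level-3 shadow page would carry identical roles and could
be handed out interchangeably:

	for_each_valid_sp(vcpu->kvm, sp, sp_list) {
		if (sp->gfn != gfn)
			continue;
		/*
		 * role.pae_root participates in this comparison, so a
		 * pae_root page is never reused as a plain level-3
		 * page, nor the other way around.
		 */
		if (sp->role.word != role.word)
			continue;
		/* ... reuse sp ... */
	}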
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index d53037df8177..81ccaa7c1165 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -694,6 +694,35 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
 	}
 }
 
+static int mmu_topup_pae_root_cache(struct kvm_vcpu *vcpu)
+{
+	struct page *page;
+
+	if (vcpu->arch.mmu->shadow_root_level != PT32E_ROOT_LEVEL)
+		return 0;
+	if (vcpu->arch.mmu_pae_root_cache)
+		return 0;
+
+	page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_DMA32);
+	if (!page)
+		return -ENOMEM;
+	vcpu->arch.mmu_pae_root_cache = page_address(page);
+
+	/*
+	 * CR3 is only 32 bits when PAE paging is used, thus it's impossible to
+	 * get the CPU to treat the PDPTEs as encrypted.  Decrypt the page so
+	 * that KVM's writes and the CPU's reads get along.  Note, this is
+	 * only necessary when using shadow paging, as 64-bit NPT can get at
+	 * the C-bit even when shadowing 32-bit NPT, and SME isn't supported
+	 * by 32-bit kernels (when KVM itself uses 32-bit NPT).
+	 */
+	if (!tdp_enabled)
+		set_memory_decrypted((unsigned long)vcpu->arch.mmu_pae_root_cache, 1);
+	else
+		WARN_ON_ONCE(shadow_me_mask);
+	return 0;
+}
+
 static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
 {
 	int r;
@@ -705,6 +734,9 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
 		return r;
 	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
 				       PT64_ROOT_MAX_LEVEL);
+	if (r)
+		return r;
+	r = mmu_topup_pae_root_cache(vcpu);
 	if (r)
 		return r;
 	if (maybe_indirect) {
@@ -717,12 +749,23 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
 				       PT64_ROOT_MAX_LEVEL);
 }
 
+static void mmu_free_pae_root(void *root_pt)
+{
+	if (!tdp_enabled)
+		set_memory_encrypted((unsigned long)root_pt, 1);
+	free_page((unsigned long)root_pt);
+}
+
 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
 {
 	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
 	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
 	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache);
 	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
+	if (vcpu->arch.mmu_pae_root_cache) {
+		mmu_free_pae_root(vcpu->arch.mmu_pae_root_cache);
+		vcpu->arch.mmu_pae_root_cache = NULL;
+	}
 }
 
 static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
@@ -1682,7 +1725,10 @@ static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
 	MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
 	hlist_del(&sp->hash_link);
 	list_del(&sp->link);
-	free_page((unsigned long)sp->spt);
+	if (sp->role.pae_root)
+		mmu_free_pae_root(sp->spt);
+	else
+		free_page((unsigned long)sp->spt);
 	free_page((unsigned long)sp->gfns);
 	kmem_cache_free(mmu_page_header_cache, sp);
 }
@@ -1720,7 +1766,12 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, gfn_t gfn,
 	struct kvm_mmu_page *sp;
 
 	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
-	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
+	if (!role.pae_root) {
+		sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
+	} else {
+		sp->spt = vcpu->arch.mmu_pae_root_cache;
+		vcpu->arch.mmu_pae_root_cache = NULL;
+	}
 	if (role.glevel == role.level)
 		sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache);
 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
@@ -2064,6 +2115,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	}
 	if (level < role.glevel)
 		role.glevel = level;
+	if (level != PT32E_ROOT_LEVEL)
+		role.pae_root = 0;
 
 	sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
 	for_each_valid_sp(vcpu->kvm, sp, sp_list) {
@@ -2199,14 +2252,26 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
 	__shadow_walk_next(iterator, *iterator->sptep);
 }
 
+static u64 make_pae_pdpte(u64 *child_pt)
+{
+	/* The only ignored bits in a PDPTE are 11:9. */
+	BUILD_BUG_ON(!(GENMASK(11, 9) & SPTE_MMU_PRESENT_MASK));
+	return __pa(child_pt) | PT_PRESENT_MASK | SPTE_MMU_PRESENT_MASK |
+	       shadow_me_mask;
+}
+
 static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
 			     struct kvm_mmu_page *sp)
 {
+	struct kvm_mmu_page *parent_sp = sptep_to_sp(sptep);
 	u64 spte;
 
 	BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
 
-	spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp));
+	if (!parent_sp->role.pae_root)
+		spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp));
+	else
+		spte = make_pae_pdpte(sp->spt);
 
 	mmu_spte_set(sptep, spte);
 
@@ -4782,6 +4847,8 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
 	role.base.level = kvm_mmu_get_tdp_level(vcpu);
 	role.base.direct = true;
 	role.base.has_4_byte_gpte = false;
+	if (role.base.level == PT32E_ROOT_LEVEL)
+		role.base.pae_root = 1;
 
 	return role;
 }
@@ -4848,6 +4915,9 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu,
 	else
 		role.base.level = PT64_ROOT_4LEVEL;
 
+	if (role.base.level == PT32E_ROOT_LEVEL)
+		role.base.pae_root = 1;
+
 	return role;
 }
@@ -4893,6 +4963,8 @@ kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu,
 	role.base.direct = false;
 	role.base.level = kvm_mmu_get_tdp_level(vcpu);
+	if (role.base.level == PT32E_ROOT_LEVEL)
+		role.base.pae_root = 1;
 
 	return role;
 }
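Illustrative sketch, not part of the diff: once a later patch activates
this path, installing a PAE root's entries is ordinary shadow-page
linking, because link_shadow_page() above emits PDPTE-format entries
whenever the parent has role.pae_root=1. The helper below is
hypothetical. (SPTE_MMU_PRESENT_MASK is a KVM software bit that falls
in the ignored 11:9 range of the PDPTE format, which is exactly what
the BUILD_BUG_ON in make_pae_pdpte() verifies.)

	/* Hypothetical helper: link four level-2 children under a PAE root. */
	static void mmu_link_pae_children(struct kvm_vcpu *vcpu,
					  struct kvm_mmu_page *root,
					  struct kvm_mmu_page *children[4])
	{
		int i;

		/* A PAE root holds exactly four PDPTEs, at spt[0..3]. */
		for (i = 0; i < 4; i++)
			link_shadow_page(vcpu, &root->spt[i], children[i]);
	}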
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 67489a060eba..1015f33e0758 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -1043,6 +1043,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		.access = 0x7,
 		.quadrant = 0x3,
 		.glevel = 0xf,
+		.pae_root = 0x1,
 	};
 
 	/*
-- 
2.19.1.6.gb485710b
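Illustrative sketch, not part of the diff: the paging_tmpl.h hunk
extends a constant role mask whose set bits are ignored when an unsync
page's role is re-checked against the vCPU's current root role. A
masked comparison of that style would look like the following; the
names sync_role_ign and root_role are assumptions taken from context,
not the file's verbatim code:

	union kvm_mmu_page_role root_role = vcpu->arch.mmu->mmu_role.base;

	/*
	 * access, quadrant, glevel, and now pae_root may legitimately
	 * differ between the cached sp and the current root role, so
	 * those bits are masked out of the comparison.
	 */
	if ((sp->role.word ^ root_role.word) & ~sync_role_ign.word)
		return 0;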