The hardware uses the guest-physical address and bits 11:7 of the address accessed to look up the SPPT to fetch a write permission bit for the 128-byte-wide sub-page region being accessed within the 4K guest-physical page. If the sub-page region write permission bit is set, the write is allowed; otherwise the write is disallowed and results in an EPT violation. Guest-physical pages mapped via leaf EPT-paging-structures for which the accumulated write-access bit and the SPP bits are both clear (0) generate EPT violations on memory write accesses. Guest-physical pages mapped via EPT-paging-structures for which the accumulated write-access bit is set (1) allow writes, effectively ignoring the SPP bit on the leaf EPT-paging structure. Software sets up levels 4, 3 and 2 of the SPP page table, as well as the EPT paging structures, and fills in the level-1 entry from a 32-bit bitmap for each 4K page, so that each 4K page is divided into 32 sub-pages of 128 bytes each. Signed-off-by: Zhang Yi <yi.z.zhang@xxxxxxxxxxxxxxx> --- arch/x86/include/asm/kvm_host.h | 4 ++ arch/x86/kvm/mmu.c | 123 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 125 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3218d91..ce6d258 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1402,6 +1402,10 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code, void *insn, int insn_len); + +int kvm_mmu_setup_spp_structure(struct kvm_vcpu *vcpu, + u32 access_map, gfn_t gfn); + void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d512125..287ee62 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -206,6 +206,11 @@ static const union kvm_mmu_page_role 
mmu_base_role_mask = { ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; }); \ __shadow_walk_next(&(_walker), spte)) +#define for_each_shadow_spp_entry(_vcpu, _addr, _walker) \ + for (shadow_spp_walk_init(&(_walker), _vcpu, _addr); \ + shadow_walk_okay(&(_walker)); \ + shadow_walk_next(&(_walker))) + static struct kmem_cache *pte_list_desc_cache; static struct kmem_cache *mmu_page_header_cache; static struct percpu_counter kvm_total_used_mmu_pages; @@ -476,6 +481,11 @@ static int is_shadow_present_pte(u64 pte) return (pte != 0) && !is_mmio_spte(pte); } +static int is_spp_mide_page_present(u64 pte) +{ + return pte & PT_PRESENT_MASK; +} + static int is_large_pte(u64 pte) { return pte & PT_PAGE_SIZE_MASK; @@ -495,6 +505,11 @@ static bool is_executable_pte(u64 spte) return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask; } +static bool is_spp_spte(struct kvm_mmu_page *sp) +{ + return sp->role.spp; +} + static kvm_pfn_t spte_to_pfn(u64 pte) { return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; @@ -2606,6 +2621,16 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, addr); } +static void shadow_spp_walk_init(struct kvm_shadow_walk_iterator *iterator, + struct kvm_vcpu *vcpu, u64 addr) +{ + iterator->addr = addr; + iterator->shadow_addr = vcpu->arch.mmu->sppt_root; + + /* SPP Table is a 4-level paging structure */ + iterator->level = 4; +} + static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) { if (iterator->level < PT_PAGE_TABLE_LEVEL) @@ -2656,6 +2681,18 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, mark_unsync(sptep); } +static void link_spp_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, + struct kvm_mmu_page *sp) +{ + u64 spte; + + spte = __pa(sp->spt) | PT_PRESENT_MASK; + + mmu_spte_set(sptep, spte); + + mmu_page_add_parent_pte(vcpu, sp, sptep); +} + static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned direct_access) { @@ -2686,7 +2723,13 @@ static bool 
mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, pte = *spte; if (is_shadow_present_pte(pte)) { - if (is_last_spte(pte, sp->role.level)) { + if (is_spp_spte(sp)) { + if (sp->role.level == PT_PAGE_TABLE_LEVEL) + //spp page do not need to release rmap. + return true; + child = page_header(pte & PT64_BASE_ADDR_MASK); + drop_parent_pte(child, spte); + } else if (is_last_spte(pte, sp->role.level)) { drop_spte(kvm, spte); if (is_large_pte(pte)) --kvm->stat.lpages; @@ -4231,6 +4274,77 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, return RET_PF_RETRY; } +static u64 format_spp_spte(u32 spp_wp_bitmap) +{ + u64 new_spte = 0; + int i = 0; + + /* + * One 4K page contains 32 sub-pages, in SPP table L4E, old bits + * are reserved, so we need to transfer u32 subpage write + * protect bitmap to u64 SPP L4E format. + */ + while (i < 32) { + if (spp_wp_bitmap & (1ULL << i)) + new_spte |= 1ULL << (i * 2); + + i++; + } + + return new_spte; +} + +static void mmu_spp_spte_set(u64 *sptep, u64 new_spte) +{ + __set_spte(sptep, new_spte); +} + +int kvm_mmu_setup_spp_structure(struct kvm_vcpu *vcpu, + u32 access_map, gfn_t gfn) +{ + struct kvm_shadow_walk_iterator iter; + struct kvm_mmu_page *sp; + gfn_t pseudo_gfn; + u64 old_spte, spp_spte; + struct kvm *kvm = vcpu->kvm; + + spin_lock(&kvm->mmu_lock); + + /* direct_map spp start */ + + if (!VALID_PAGE(vcpu->arch.mmu->sppt_root)) + goto out_unlock; + + for_each_shadow_spp_entry(vcpu, (u64)gfn << PAGE_SHIFT, iter) { + if (iter.level == PT_PAGE_TABLE_LEVEL) { + spp_spte = format_spp_spte(access_map); + old_spte = mmu_spte_get_lockless(iter.sptep); + if (old_spte != spp_spte) { + mmu_spp_spte_set(iter.sptep, spp_spte); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } + break; + } + + if (!is_spp_mide_page_present(*iter.sptep)) { + u64 base_addr = iter.addr; + + base_addr &= PT64_LVL_ADDR_MASK(iter.level); + pseudo_gfn = base_addr >> PAGE_SHIFT; + sp = kvm_mmu_get_spp_page(vcpu, pseudo_gfn, + iter.level 
- 1); + link_spp_shadow_page(vcpu, iter.sptep, sp); + } + } + + spin_unlock(&kvm->mmu_lock); + return 0; + +out_unlock: + spin_unlock(&kvm->mmu_lock); + return -EFAULT; +} + int kvm_mmu_get_subpages(struct kvm *kvm, struct kvm_subpage *spp_info) { u32 *access = spp_info->access_map; @@ -4255,9 +4369,10 @@ int kvm_mmu_set_subpages(struct kvm *kvm, struct kvm_subpage *spp_info) gfn_t gfn = spp_info->base_gfn; int npages = spp_info->npages; struct kvm_memory_slot *slot; + struct kvm_vcpu *vcpu; u32 *wp_map; int ret; - int i; + int i, j; for (i = 0; i < npages; i++, gfn++) { slot = gfn_to_memslot(kvm, gfn); @@ -4281,6 +4396,10 @@ int kvm_mmu_set_subpages(struct kvm *kvm, struct kvm_subpage *spp_info) "Please try to disable the huge page\n", gfn); return -EFAULT; } + + kvm_for_each_vcpu(j, vcpu, kvm) + kvm_mmu_setup_spp_structure(vcpu, access, gfn); + wp_map = gfn_to_subpage_wp_info(slot, gfn); *wp_map = access; } -- 2.7.4