When the existing KVM MMU code creates a shadow page table, it assumes it
has the normal x86 page table format. This is obviously correct for normal
shadow page tables, and also correct for AMD's NPT.

Unfortunately, Intel's EPT page tables differ in subtle ways from ordinary
page tables, so when we create a shadow EPT table (i.e., in nested EPT), we
need to slightly modify the way in which this table is built.

In particular, when mmu.c's link_shadow_page() creates non-leaf page table
entries, it used to enable the "present", "accessed", "writable" and "user"
flags on these entries. While this is correct for ordinary page tables, it
is wrong in EPT tables - where these bits actually have completely different
meanings (compare PT_*_MASK from mmu.h to VMX_EPT_*_MASK from vmx.h). In
particular, leaving the code as-is causes bit 5 of the PTE to be turned on
(supposedly for PT_ACCESSED_MASK), which is a reserved bit in EPT and causes
an "EPT Misconfiguration" failure.

So we must move link_shadow_page()'s list of extra bits to a new mmu context
field, which is set differently for nested EPT.

Signed-off-by: Nadav Har'El <nyh@xxxxxxxxxx>
---
 arch/x86/include/asm/kvm_host.h |    1 +
 arch/x86/kvm/mmu.c              |   16 +++++++++-------
 arch/x86/kvm/paging_tmpl.h      |    6 ++++--
 arch/x86/kvm/vmx.c              |    3 +++
 4 files changed, 17 insertions(+), 9 deletions(-)

--- .before/arch/x86/include/asm/kvm_host.h	2011-11-10 11:33:59.000000000 +0200
+++ .after/arch/x86/include/asm/kvm_host.h	2011-11-10 11:33:59.000000000 +0200
@@ -287,6 +287,7 @@ struct kvm_mmu {
 	bool nx;
 
 	u64 pdptrs[4]; /* pae */
+	u64 link_shadow_page_set_bits;
 };
 
 struct kvm_vcpu_arch {
--- .before/arch/x86/kvm/vmx.c	2011-11-10 11:33:59.000000000 +0200
+++ .after/arch/x86/kvm/vmx.c	2011-11-10 11:33:59.000000000 +0200
@@ -6485,6 +6485,9 @@ static int nested_ept_init_mmu_context(s
 	vcpu->arch.mmu.get_pdptr         = nested_ept_get_pdptr;
 	vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
 	vcpu->arch.mmu.shadow_root_level = get_ept_level();
+	vcpu->arch.mmu.link_shadow_page_set_bits =
+		VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK |
+		VMX_EPT_EXECUTABLE_MASK;
 
 	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
 
--- .before/arch/x86/kvm/mmu.c	2011-11-10 11:33:59.000000000 +0200
+++ .after/arch/x86/kvm/mmu.c	2011-11-10 11:33:59.000000000 +0200
@@ -1782,14 +1782,9 @@ static void shadow_walk_next(struct kvm_
 	return __shadow_walk_next(iterator, *iterator->sptep);
 }
 
-static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
+static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp, u64 set_bits)
 {
-	u64 spte;
-
-	spte = __pa(sp->spt)
-		| PT_PRESENT_MASK | PT_ACCESSED_MASK
-		| PT_WRITABLE_MASK | PT_USER_MASK;
-	mmu_spte_set(sptep, spte);
+	mmu_spte_set(sptep, __pa(sp->spt) | set_bits);
 }
 
 static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
@@ -3366,6 +3361,13 @@ int kvm_init_shadow_mmu(struct kvm_vcpu
 	vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
 	vcpu->arch.mmu.base_role.smep_andnot_wp
 		= smep && !is_write_protection(vcpu);
+	/*
+	 * link_shadow_page() should apply these bits in shadow page tables,
+	 * and in shadow NPT tables (nested NPT). For nested EPT, different
+	 * bits apply.
+	 */
+	vcpu->arch.mmu.link_shadow_page_set_bits = PT_PRESENT_MASK |
+		PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
 
 	return r;
 }
--- .before/arch/x86/kvm/paging_tmpl.h	2011-11-10 11:33:59.000000000 +0200
+++ .after/arch/x86/kvm/paging_tmpl.h	2011-11-10 11:33:59.000000000 +0200
@@ -515,7 +515,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
 			goto out_gpte_changed;
 
 		if (sp)
-			link_shadow_page(it.sptep, sp);
+			link_shadow_page(it.sptep, sp,
+				vcpu->arch.mmu.link_shadow_page_set_bits);
 	}
 
 	for (;
@@ -535,7 +536,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
 
 		sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
 				      true, direct_access, it.sptep);
-		link_shadow_page(it.sptep, sp);
+		link_shadow_page(it.sptep, sp,
+			vcpu->arch.mmu.link_shadow_page_set_bits);
 	}
 
 	clear_sp_write_flooding_count(it.sptep);
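P.S. For anyone comparing the two entry formats, the collision is easy to
see in a tiny user-space sketch (illustrative only, not part of the patch).
The PT_* and VMX_EPT_* values below are copied from mmu.h and vmx.h; the
EPT_NONLEAF_RESERVED mask is my reading of the SDM's non-leaf EPT entry
format (bits 5:3 reserved), so treat that one as an assumption:

#include <stdint.h>
#include <stdio.h>

/* x86 page-table bits, as in arch/x86/kvm/mmu.h */
#define PT_PRESENT_MASK		(1ULL << 0)
#define PT_WRITABLE_MASK	(1ULL << 1)
#define PT_USER_MASK		(1ULL << 2)
#define PT_ACCESSED_MASK	(1ULL << 5)

/* EPT bits, as in arch/x86/include/asm/vmx.h */
#define VMX_EPT_READABLE_MASK	0x1ULL
#define VMX_EPT_WRITABLE_MASK	0x2ULL
#define VMX_EPT_EXECUTABLE_MASK	0x4ULL

/* Assumption: bits 5:3 of a non-leaf EPT entry are reserved. */
#define EPT_NONLEAF_RESERVED	(0x7ULL << 3)

int main(void)
{
	/* what link_shadow_page() used to hard-code for every table */
	uint64_t x86_bits = PT_PRESENT_MASK | PT_ACCESSED_MASK |
			    PT_WRITABLE_MASK | PT_USER_MASK;
	/* what a non-leaf shadow EPT entry actually needs */
	uint64_t ept_bits = VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK |
			    VMX_EPT_EXECUTABLE_MASK;

	printf("x86 bits: 0x%llx\n", (unsigned long long)x86_bits);
	printf("EPT bits: 0x%llx\n", (unsigned long long)ept_bits);

	/* bit 5 (PT_ACCESSED_MASK) falls inside the EPT reserved range */
	if (x86_bits & EPT_NONLEAF_RESERVED)
		printf("reserved EPT bit set -> EPT misconfiguration\n");
	return 0;
}

On the design choice: stashing the bits in a per-context mmu field keeps
link_shadow_page() format-agnostic, so nested_ept_init_mmu_context() can
override the value without touching the common shadow-walker code.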