On Thu, Apr 25, 2013 at 11:43:22PM -0700, Jun Nakajima wrote:
> This is the first patch in a series which adds nested EPT support to KVM's
> nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can use
> EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 guest
> to set its own cr3 and take its own page faults without either of L0 or L1
> getting involved. This often significantly improves L2's performance over the
> previous two alternatives (shadow page tables over EPT, and shadow page
> tables over shadow page tables).
>
> This patch adds EPT support to paging_tmpl.h.
>
> paging_tmpl.h contains the code for reading and writing page tables. The code
> for 32-bit and 64-bit tables is very similar, but not identical, so
> paging_tmpl.h is #include'd twice in mmu.c, once with PTTYPE=32 and once
> with PTTYPE=64, and this generates the two sets of similar functions.
>
> There are subtle but important differences between the format of EPT tables
> and that of ordinary x86 64-bit page tables, so for nested EPT we need a
> third set of functions to read the guest EPT table and to write the shadow
> EPT table.
>
> So this patch adds a third PTTYPE, PTTYPE_EPT, which creates functions (prefixed
> with "EPT") which correctly read and write EPT tables.
>
> Signed-off-by: Nadav Har'El <nyh@xxxxxxxxxx>
> Signed-off-by: Jun Nakajima <jun.nakajima@xxxxxxxxx>
> Signed-off-by: Xinhao Xu <xinhao.xu@xxxxxxxxx>
> ---
>  arch/x86/kvm/mmu.c         |  35 ++----------
>  arch/x86/kvm/paging_tmpl.h | 133 ++++++++++++++++++++++++++++++++++++++++++---
>  2 files changed, 130 insertions(+), 38 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 956ca35..cb9c6fd 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -2480,26 +2480,6 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
>  	return gfn_to_pfn_memslot_atomic(slot, gfn);
>  }
>
> -static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu,
> -				  struct kvm_mmu_page *sp, u64 *spte,
> -				  u64 gpte)
> -{
> -	if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
> -		goto no_present;
> -
> -	if (!is_present_gpte(gpte))
> -		goto no_present;
> -
> -	if (!(gpte & PT_ACCESSED_MASK))
> -		goto no_present;
> -
> -	return false;
> -
> -no_present:
> -	drop_spte(vcpu->kvm, spte);
> -	return true;
> -}
> -
>  static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
>  				    struct kvm_mmu_page *sp,
>  				    u64 *start, u64 *end)
> @@ -3399,16 +3379,6 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
>  	return false;
>  }
>
> -static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte)
> -{
> -	unsigned access;
> -
> -	access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
> -	access &= ~(gpte >> PT64_NX_SHIFT);
> -
> -	return access;
> -}
> -
>  static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte)
>  {
>  	unsigned index;
> @@ -3418,6 +3388,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp
>  	return mmu->last_pte_bitmap & (1 << index);
>  }
>
> +#define PTTYPE_EPT 18 /* arbitrary */
> +#define PTTYPE PTTYPE_EPT
> +#include "paging_tmpl.h"
> +#undef PTTYPE
> +

This breaks the following block at walk_addr_generic:

#if PTTYPE == 64
	if (walker->level == PT32E_ROOT_LEVEL) {
		pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
		trace_kvm_mmu_paging_element(pte, walker->level);
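(For context, the guarded block being referred to continues roughly as follows in paging_tmpl.h of this era; quoted from memory, so treat it as approximate:)

#if PTTYPE == 64
	if (walker->level == PT32E_ROOT_LEVEL) {
		pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
		trace_kvm_mmu_paging_element(pte, walker->level);
		if (!is_present_gpte(pte))
			goto error;
		--walker->level;
	}
#endif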
>  #define PTTYPE 64
>  #include "paging_tmpl.h"
>  #undef PTTYPE
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index 105dd5b..e13b6c5 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -50,6 +50,22 @@
>  	#define PT_LEVEL_BITS PT32_LEVEL_BITS
>  	#define PT_MAX_FULL_LEVELS 2
>  	#define CMPXCHG cmpxchg
> +#elif PTTYPE == PTTYPE_EPT
> +	#define pt_element_t u64
> +	#define guest_walker guest_walkerEPT
> +	#define FNAME(name) EPT_##name
> +	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
> +	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
> +	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
> +	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
> +	#define PT_LEVEL_BITS PT64_LEVEL_BITS
> +	#ifdef CONFIG_X86_64
> +	#define PT_MAX_FULL_LEVELS 4
> +	#define CMPXCHG cmpxchg
> +	#else
> +	#define CMPXCHG cmpxchg64
> +	#define PT_MAX_FULL_LEVELS 2
> +	#endif
>  #else
>  	#error Invalid PTTYPE value
>  #endif
> @@ -80,6 +96,7 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
>  	return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
>  }
>
> +#if PTTYPE != PTTYPE_EPT

What is the reasoning for this? Please restrict #ifdef changes to code where
there is a difference in pagetable format (that is, code that knows the
pagetable format).

>  static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
>  			       pt_element_t __user *ptep_user, unsigned index,
>  			       pt_element_t orig_pte, pt_element_t new_pte)
> @@ -102,7 +119,52 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
>
>  	return (ret != orig_pte);
>  }
> +#endif
> +
> +static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
> +{
> +	unsigned access;
> +
> +#if PTTYPE == PTTYPE_EPT
> +	/* We rely here that ACC_WRITE_MASK==VMX_EPT_WRITABLE_MASK */
> +	access = (gpte & VMX_EPT_WRITABLE_MASK) | ACC_USER_MASK |
> +		((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0);

The user bit is meaningless when emulating EPT. Why return it?

> +#else
> +	access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
> +	access &= ~(gpte >> PT64_NX_SHIFT);
> +#endif
> +
> +	return access;
> +}
> +
> +static inline int FNAME(is_present_gpte)(unsigned long pte)
> +{
> +#if PTTYPE == PTTYPE_EPT
> +	return pte & (VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK |
> +		      VMX_EPT_EXECUTABLE_MASK);

This should be VMX_EPT_READABLE_MASK (which is what maps to the normal
pagetable present bit).

> +#else
> +	return is_present_gpte(pte);
> +#endif
> +}
> +
> +static inline int FNAME(check_write_user_access)(struct kvm_vcpu *vcpu,
> +					bool write_fault, bool user_fault,
> +					unsigned long pte)
> +{

Apparently unused.

> +#if PTTYPE == PTTYPE_EPT
> +	if (unlikely(write_fault && !(pte & VMX_EPT_WRITABLE_MASK)
> +		     && (user_fault || is_write_protection(vcpu))))
> +		return false;
> +	return true;
> +#else
> +	u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
> +		| (write_fault ? PFERR_WRITE_MASK : 0);
>
> +	return !permission_fault(vcpu->arch.walk_mmu, vcpu->arch.access, access);
> +#endif
> +}
> +
> +#if PTTYPE != PTTYPE_EPT

Should this emulate EPT dirty bit support?
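If the answer is yes, a minimal sketch of the EPT flavor might look like the
following. This assumes the EPT A/D bit definitions (VMX_EPT_ACCESS_BIT,
VMX_EPT_DIRTY_BIT) from asm/vmx.h, an EPT-capable FNAME(cmpxchg_gpte), and
that L1 enabled A/D bits in its EPTP; it is untested and only meant to show
the shape, not a proposed implementation:

#if PTTYPE == PTTYPE_EPT
/*
 * Sketch only: propagate accessed/dirty into the guest's EPT entries,
 * mirroring the existing update_accessed_dirty_bits() but using the
 * EPT accessed (bit 8) and dirty (bit 9) bits.
 */
static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
					     struct kvm_mmu *mmu,
					     struct guest_walker *walker,
					     int write_fault)
{
	unsigned level, index;
	pt_element_t pte, orig_pte;
	pt_element_t __user *ptep_user;
	gfn_t table_gfn;
	int ret;

	for (level = walker->max_level; level >= walker->level; --level) {
		pte = orig_pte = walker->ptes[level - 1];
		table_gfn = walker->table_gfn[level - 1];
		ptep_user = walker->ptep_user[level - 1];
		index = offset_in_page(ptep_user) / sizeof(pt_element_t);

		/* Accessed on every level, dirty only on the leaf for writes. */
		pte |= VMX_EPT_ACCESS_BIT;
		if (level == walker->level && write_fault)
			pte |= VMX_EPT_DIRTY_BIT;
		if (pte == orig_pte)
			continue;

		ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
					  orig_pte, pte);
		if (ret)
			return ret;

		mark_page_dirty(vcpu->kvm, table_gfn);
		walker->ptes[level - 1] = pte;
	}
	return 0;
}
#endif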
>  static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
>  					     struct kvm_mmu *mmu,
>  					     struct guest_walker *walker,
> @@ -139,6 +201,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
>  	}
>  	return 0;
>  }
> +#endif
>
>  /*
>   * Fetch a guest pte for a guest virtual address
> @@ -147,7 +210,6 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
>  				    struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
>  				    gva_t addr, u32 access)
>  {
> -	int ret;
>  	pt_element_t pte;
>  	pt_element_t __user *uninitialized_var(ptep_user);
>  	gfn_t table_gfn;
> @@ -162,7 +224,9 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
>  	gfn_t gfn;
>
>  	trace_kvm_mmu_pagetable_walk(addr, access);
> +#if PTTYPE != PTTYPE_EPT
>  retry_walk:
> +#endif
>  	walker->level = mmu->root_level;
>  	pte = mmu->get_cr3(vcpu);
>
> @@ -215,17 +279,20 @@ retry_walk:
>
>  		trace_kvm_mmu_paging_element(pte, walker->level);
>
> -		if (unlikely(!is_present_gpte(pte)))
> +		if (unlikely(!FNAME(is_present_gpte)(pte)))
>  			goto error;
>
>  		if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
>  					      walker->level))) {
> -			errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
> +			errcode |= PFERR_PRESENT_MASK;
> +#if PTTYPE != PTTYPE_EPT
> +			errcode |= PFERR_RSVD_MASK;
> +#endif
>  			goto error;
>  		}
>
>  		accessed_dirty &= pte;
> -		pte_access = pt_access & gpte_access(vcpu, pte);
> +		pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
>
>  		walker->ptes[walker->level - 1] = pte;
>  	} while (!is_last_gpte(mmu, walker->level, pte));
> @@ -247,6 +314,7 @@ retry_walk:
>
>  	walker->gfn = real_gpa >> PAGE_SHIFT;
>
> +#if PTTYPE != PTTYPE_EPT
>  	if (!write_fault)
>  		protect_clean_gpte(&pte_access, pte);
>  	else
> @@ -257,12 +325,15 @@ retry_walk:
>  	accessed_dirty &= pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT);
>
>  	if (unlikely(!accessed_dirty)) {
> +		int ret;
> +
>  		ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
>  		if (unlikely(ret < 0))
>  			goto error;
>  		else if (ret)
>  			goto retry_walk;
>  	}
> +#endif
>
>  	walker->pt_access = pt_access;
>  	walker->pte_access = pte_access;
> @@ -293,6 +364,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
>  					access);
>  }
>
> +#if PTTYPE != PTTYPE_EPT
>  static int FNAME(walk_addr_nested)(struct guest_walker *walker,
>  				   struct kvm_vcpu *vcpu, gva_t addr,
>  				   u32 access)
> @@ -300,6 +372,29 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
>  	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
>  					addr, access);
>  }
> +#endif
> +
> +static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
> +					 struct kvm_mmu_page *sp, u64 *spte,
> +					 u64 gpte)
> +{
> +	if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
> +		goto no_present;
> +
> +	if (!is_present_gpte(gpte))
> +		goto no_present;
> +
> +#if PTTYPE != PTTYPE_EPT
> +	if (!(gpte & PT_ACCESSED_MASK))
> +		goto no_present;
> +#endif
> +
> +	return false;
> +
> +no_present:
> +	drop_spte(vcpu->kvm, spte);
> +	return true;
> +}

Please split the code move into a separate patch.
>  static bool
>  FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> @@ -309,13 +404,13 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
>  	gfn_t gfn;
>  	pfn_t pfn;
>
> -	if (prefetch_invalid_gpte(vcpu, sp, spte, gpte))
> +	if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
>  		return false;
>
>  	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
>
>  	gfn = gpte_to_gfn(gpte);
> -	pte_access = sp->role.access & gpte_access(vcpu, gpte);
> +	pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
>  	protect_clean_gpte(&pte_access, gpte);
>  	pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
>  			no_dirty_log && (pte_access & ACC_WRITE_MASK));
> @@ -394,6 +489,18 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
>  	}
>  }
>
> +#if PTTYPE == PTTYPE_EPT
> +static void FNAME(link_shadow_page)(u64 *sptep, struct kvm_mmu_page *sp)
> +{
> +	u64 spte;
> +
> +	spte = __pa(sp->spt) | VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK |
> +		VMX_EPT_EXECUTABLE_MASK;
> +
> +	mmu_spte_set(sptep, spte);
> +}
> +#endif

Should this be setting the permission bits to match the guest EPT pagetable
entry that maps L2, rather than hardcoding RWX?
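If so, one way to do it would be to thread the walker's accumulated access
bits into the link and derive the shadow permissions from them. A rough
sketch, using the masks already referenced by the patch; the extra pt_access
parameter and its plumbing are hypothetical, not part of the posted patch:

#if PTTYPE == PTTYPE_EPT
/*
 * Sketch only: derive the non-leaf shadow EPT permissions from the
 * access bits accumulated while walking the guest EPT table, instead
 * of unconditionally granting RWX.
 */
static void FNAME(link_shadow_page)(u64 *sptep, struct kvm_mmu_page *sp,
				    unsigned pt_access)
{
	u64 spte = __pa(sp->spt) | VMX_EPT_READABLE_MASK;

	if (pt_access & ACC_WRITE_MASK)
		spte |= VMX_EPT_WRITABLE_MASK;
	if (pt_access & ACC_EXEC_MASK)
		spte |= VMX_EPT_EXECUTABLE_MASK;

	mmu_spte_set(sptep, spte);
}
#endif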