On 08/01/2012 10:37 PM, Nadav Har'El wrote:
> This is the first patch in a series which adds nested EPT support to KVM's
> nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can use
> EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 guest
> to set its own cr3 and take its own page faults without either of L0 or L1
> getting involved. This often significantly improves L2's performance over the
> previous two alternatives (shadow page tables over EPT, and shadow page
> tables over shadow page tables).
>
> This patch adds EPT support to paging_tmpl.h.
>
> paging_tmpl.h contains the code for reading and writing page tables. The code
> for 32-bit and 64-bit tables is very similar, but not identical, so
> paging_tmpl.h is #include'd twice in mmu.c, once with PTTYPE=32 and once
> with PTTYPE=64, and this generates the two sets of similar functions.
>
> There are subtle but important differences between the format of EPT tables
> and that of ordinary x86 64-bit page tables, so for nested EPT we need a
> third set of functions to read the guest EPT table and to write the shadow
> EPT table.
>
> So this patch adds a third PTTYPE, PTTYPE_EPT, which creates functions
> (prefixed with "EPT") which correctly read and write EPT tables.
>

Now paging_tmpl.h becomes really untidy and hard to read; maybe we need to
abstract the operations that depend on PTTYPE (see the sketch further down).

> Signed-off-by: Nadav Har'El <nyh@xxxxxxxxxx>
> ---
>  arch/x86/kvm/mmu.c         |   14 +----
>  arch/x86/kvm/paging_tmpl.h |   98 ++++++++++++++++++++++++++++++++---
>  2 files changed, 96 insertions(+), 16 deletions(-)
>
> --- .before/arch/x86/kvm/mmu.c	2012-08-01 17:22:46.000000000 +0300
> +++ .after/arch/x86/kvm/mmu.c	2012-08-01 17:22:46.000000000 +0300
> @@ -1971,15 +1971,6 @@ static void shadow_walk_next(struct kvm_
>  	return __shadow_walk_next(iterator, *iterator->sptep);
>  }
>
> -static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
> -{
> -	u64 spte;
> -
> -	spte = __pa(sp->spt)
> -		| PT_PRESENT_MASK | PT_ACCESSED_MASK
> -		| PT_WRITABLE_MASK | PT_USER_MASK;
> -	mmu_spte_set(sptep, spte);
> -}
>
>  static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
>  				 unsigned direct_access)
> @@ -3427,6 +3418,11 @@ static bool sync_mmio_spte(u64 *sptep, g
>  	return false;
>  }
>
> +#define PTTYPE_EPT 18 /* arbitrary */
> +#define PTTYPE PTTYPE_EPT
> +#include "paging_tmpl.h"
> +#undef PTTYPE
> +
>  #define PTTYPE 64
>  #include "paging_tmpl.h"
>  #undef PTTYPE
> --- .before/arch/x86/kvm/paging_tmpl.h	2012-08-01 17:22:46.000000000 +0300
> +++ .after/arch/x86/kvm/paging_tmpl.h	2012-08-01 17:22:46.000000000 +0300
> @@ -50,6 +50,22 @@
>  	#define PT_LEVEL_BITS PT32_LEVEL_BITS
>  	#define PT_MAX_FULL_LEVELS 2
>  	#define CMPXCHG cmpxchg
> +#elif PTTYPE == PTTYPE_EPT
> +	#define pt_element_t u64
> +	#define guest_walker guest_walkerEPT
> +	#define FNAME(name) EPT_##name
> +	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
> +	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
> +	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
> +	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
> +	#define PT_LEVEL_BITS PT64_LEVEL_BITS
> +	#ifdef CONFIG_X86_64
> +	#define PT_MAX_FULL_LEVELS 4
> +	#define CMPXCHG cmpxchg
> +	#else
> +	#define CMPXCHG cmpxchg64
> +	#define PT_MAX_FULL_LEVELS 2
> +	#endif

Missing the PT_MAX_FULL_LEVELS == 3 case? Oh, you mentioned it as the PAE case
in PATCH 0.
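On the abstraction point above, one possible shape (only a rough sketch, and
the PT_GUEST_* names are made up here purely for illustration) would be to
give each PTTYPE its own guest-PTE masks in this per-type header block,
something like:

#if PTTYPE == PTTYPE_EPT
	/* EPT has no P bit; "present" means any of R/W/X is set. */
	#define PT_GUEST_PRESENT_MASK	(VMX_EPT_READABLE_MASK | \
					 VMX_EPT_WRITABLE_MASK | \
					 VMX_EPT_EXECUTABLE_MASK)
	#define PT_GUEST_WRITABLE_MASK	VMX_EPT_WRITABLE_MASK
#else
	/* Ordinary ia32/ia32e page-table format. */
	#define PT_GUEST_PRESENT_MASK	PT_PRESENT_MASK
	#define PT_GUEST_WRITABLE_MASK	PT_WRITABLE_MASK
#endif

Then the helpers below could simply test these masks instead of growing an
"#if PTTYPE == PTTYPE_EPT" branch in every function body.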
>  #else
>  	#error Invalid PTTYPE value
>  #endif
> @@ -78,6 +94,7 @@ static gfn_t gpte_to_gfn_lvl(pt_element_
>  	return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
>  }
>
> +#if PTTYPE != PTTYPE_EPT
>  static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
>  			       pt_element_t __user *ptep_user, unsigned index,
>  			       pt_element_t orig_pte, pt_element_t new_pte)
> @@ -100,15 +117,22 @@ static int FNAME(cmpxchg_gpte)(struct kv
>
>  	return (ret != orig_pte);
>  }
> +#endif
>

Note that A/D bits are supported on new Intel CPUs, so this function should be
reworked for nested EPT. I know you did not expose this feature to the guest,
but we can reduce the difference between nested EPT and the other MMU models
if A/D bits are supported.

>  static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte,
>  				   bool last)
>  {
>  	unsigned access;
>
> +#if PTTYPE == PTTYPE_EPT
> +	/* We rely here that ACC_WRITE_MASK==VMX_EPT_WRITABLE_MASK */
> +	access = (gpte & VMX_EPT_WRITABLE_MASK) | ACC_USER_MASK |
> +		((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0);
> +#else
>  	access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
>  	if (last && !is_dirty_gpte(gpte))
>  		access &= ~ACC_WRITE_MASK;
> +#endif
>

Maybe we can introduce PT_xxx_MASK macros to abstract the access bits.

>  #if PTTYPE == 64
>  	if (vcpu->arch.mmu.nx)
> @@ -135,6 +159,30 @@ static bool FNAME(is_last_gpte)(struct g
>  	return false;
>  }
>
> +static inline int FNAME(is_present_gpte)(unsigned long pte)
> +{
> +#if PTTYPE == PTTYPE_EPT
> +	return pte & (VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK |
> +			VMX_EPT_EXECUTABLE_MASK);
> +#else
> +	return is_present_gpte(pte);
> +#endif
> +}
> +

Introducing PT_PRESENT_BITS can eliminate the dependence, and we need to
rework is_present_gpte since it is also used outside of paging_tmpl.h.

> +static inline int FNAME(check_write_user_access)(struct kvm_vcpu *vcpu,
> +					bool write_fault, bool user_fault,
> +					unsigned long pte)
> +{
> +#if PTTYPE == PTTYPE_EPT
> +	if (unlikely(write_fault && !(pte & VMX_EPT_WRITABLE_MASK)
> +		&& (user_fault || is_write_protection(vcpu))))
> +		return false;
> +	return true;
> +#else
> +	return check_write_user_access(vcpu, write_fault, user_fault, pte);
> +#endif
> +}
> +

Ditto; check_write_user_access needs to be reworked as well.
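To make the PT_PRESENT_BITS idea concrete (again only a sketch, reusing the
hypothetical per-type PT_GUEST_PRESENT_MASK from the earlier sketch), the
present check would then need no #ifdef at all:

static inline int FNAME(is_present_gpte)(unsigned long pte)
{
	/* EPT: any of R/W/X set; ia32: the P bit. */
	return pte & PT_GUEST_PRESENT_MASK;
}

check_write_user_access() looks less mechanical, though, since the non-EPT
variant also involves the user-mode (PT_USER_MASK) check, so it probably still
needs a per-type branch somewhere.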
> /*
>  * Fetch a guest pte for a guest virtual address
>  */
> @@ -155,7 +203,9 @@ static int FNAME(walk_addr_generic)(stru
>  	u16 errcode = 0;
>
>  	trace_kvm_mmu_pagetable_walk(addr, access);
> +#if PTTYPE != PTTYPE_EPT
>  retry_walk:
> +#endif
>  	eperm = false;
>  	walker->level = mmu->root_level;
>  	pte = mmu->get_cr3(vcpu);
> @@ -202,7 +252,7 @@ retry_walk:
>
>  		trace_kvm_mmu_paging_element(pte, walker->level);
>
> -		if (unlikely(!is_present_gpte(pte)))
> +		if (unlikely(!FNAME(is_present_gpte)(pte)))
>  			goto error;
>
>  		if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
> @@ -211,13 +261,16 @@ retry_walk:
>  			goto error;
>  		}
>
> -		if (!check_write_user_access(vcpu, write_fault, user_fault,
> -					pte))
> +		if (!FNAME(check_write_user_access)(vcpu, write_fault,
> +					user_fault, pte))
>  			eperm = true;
>
>  #if PTTYPE == 64
>  		if (unlikely(fetch_fault && (pte & PT64_NX_MASK)))
>  			eperm = true;
> +#elif PTTYPE == PTTYPE_EPT
> +		if (unlikely(fetch_fault && !(pte & VMX_EPT_EXECUTABLE_MASK)))
> +			eperm = true;
>  #endif
>
>  		last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte);
> @@ -225,12 +278,15 @@ retry_walk:
>  			pte_access = pt_access &
>  				FNAME(gpte_access)(vcpu, pte, true);
>  			/* check if the kernel is fetching from user page */
> +#if PTTYPE != PTTYPE_EPT
>  			if (unlikely(pte_access & PT_USER_MASK) &&
>  			    kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
>  				if (fetch_fault && !user_fault)
>  					eperm = true;
> +#endif
>  		}
>
> +#if PTTYPE != PTTYPE_EPT
>  		if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) {
>  			int ret;
>  			trace_kvm_mmu_set_accessed_bit(table_gfn, index,
> @@ -245,6 +301,7 @@ retry_walk:
>  			mark_page_dirty(vcpu->kvm, table_gfn);
>  			pte |= PT_ACCESSED_MASK;
>  		}
> +#endif

If A/D bits are supported, can these differences be removed?

>
>  		walker->ptes[walker->level - 1] = pte;
>
> @@ -283,6 +340,7 @@ retry_walk:
>  		goto error;
>  	}
>
> +#if PTTYPE != PTTYPE_EPT
>  	if (write_fault && unlikely(!is_dirty_gpte(pte))) {
>  		int ret;
>
> @@ -298,6 +356,7 @@ retry_walk:
>  		pte |= PT_DIRTY_MASK;
>  		walker->ptes[walker->level - 1] = pte;
>  	}
> +#endif
>
>  	walker->pt_access = pt_access;
>  	walker->pte_access = pte_access;
> @@ -328,6 +387,7 @@ static int FNAME(walk_addr)(struct guest
>  					access);
>  }
>
> +#if PTTYPE != PTTYPE_EPT
>  static int FNAME(walk_addr_nested)(struct guest_walker *walker,
>  				   struct kvm_vcpu *vcpu, gva_t addr,
>  				   u32 access)
> @@ -335,6 +395,7 @@ static int FNAME(walk_addr_nested)(struc
>  	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
>  					addr, access);
>  }
> +#endif
>

Hmm, you do not need the special nested walking functions?

>  static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
>  					  struct kvm_mmu_page *sp, u64 *spte,
> @@ -343,11 +404,13 @@ static bool FNAME(prefetch_invalid_gpte)
>  	if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
>  		goto no_present;
>
> -	if (!is_present_gpte(gpte))
> +	if (!FNAME(is_present_gpte)(gpte))
>  		goto no_present;
>
> +#if PTTYPE != PTTYPE_EPT
>  	if (!(gpte & PT_ACCESSED_MASK))
>  		goto no_present;
> +#endif
>
>  	return false;
>
> @@ -458,6 +521,20 @@ static void FNAME(pte_prefetch)(struct k
>  					pfn, true, true);
>  		}
>  }
> +static void FNAME(link_shadow_page)(u64 *sptep, struct kvm_mmu_page *sp)
> +{
> +	u64 spte;
> +
> +	spte = __pa(sp->spt)
> +#if PTTYPE == PTTYPE_EPT
> +		| VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK
> +		| VMX_EPT_EXECUTABLE_MASK;
> +#else
> +		| PT_PRESENT_MASK | PT_ACCESSED_MASK
> +		| PT_WRITABLE_MASK | PT_USER_MASK;
> +#endif
> +	mmu_spte_set(sptep, spte);
> +}
>
>  /*
>   * Fetch a shadow pte for a specific level in the paging hierarchy.
> @@ -474,7 +551,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
>  	unsigned direct_access;
>  	struct kvm_shadow_walk_iterator it;
>
> -	if (!is_present_gpte(gw->ptes[gw->level - 1]))
> +	if (!FNAME(is_present_gpte)(gw->ptes[gw->level - 1]))
>  		return NULL;
>
>  	direct_access = gw->pte_access;
> @@ -514,7 +591,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
>  			goto out_gpte_changed;
>
>  		if (sp)
> -			link_shadow_page(it.sptep, sp);
> +			FNAME(link_shadow_page)(it.sptep, sp);
>  	}
>
>  	for (;
> @@ -534,10 +611,15 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
>
>  		sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
>  				      true, direct_access, it.sptep);
> -		link_shadow_page(it.sptep, sp);
> +		FNAME(link_shadow_page)(it.sptep, sp);
>  	}
>
>  	clear_sp_write_flooding_count(it.sptep);
> +	/* TODO: Consider if everything that set_spte() does is correct when
> +	   the shadow page table is actually EPT. Most is fine (for direct_map)
> +	   but it appears there may be a few wrong corner cases with
> +	   PT_USER_MASK, PT64_NX_MASK, etc., and I need to review everything
> +	 */

Maybe it is OK, but you need to take care of the A/D bits (currently you do
not expose the A/D bits to the guest; however, they may be supported on L0).

>  	mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
>  		     user_fault, write_fault, emulate, it.level,
>  		     gw->gfn, pfn, prefault, map_writable);
> @@ -733,6 +815,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kv
>  	return gpa;
>  }
>
> +#if PTTYPE != PTTYPE_EPT
>  static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
>  				      u32 access,
>  				      struct x86_exception *exception)
> @@ -751,6 +834,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(st
>
>  	return gpa;
>  }
> +#endif

Why is it not needed?

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html