> On Aug 17, 2020, at 5:25 PM, Peter Shier <pshier@xxxxxxxxxx> wrote:
>
> Verify that when L2 guest enables PAE paging and L0 intercept of L2
> MOV to CR0 reflects MTF exit to L1, subsequent resume to L2 correctly
> preserves PDPTE array specified by L2 CR3.
>
> Signed-off-by: Jim Mattson <jmattson@xxxxxxxxxx>
> Reviewed-by: Peter Shier <pshier@xxxxxxxxxx>
> Signed-off-by: Peter Shier <pshier@xxxxxxxxxx>
> ---
>  lib/x86/asm/page.h |   8 +++
>  x86/vmx_tests.c    | 171 +++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 179 insertions(+)
>
> diff --git a/lib/x86/asm/page.h b/lib/x86/asm/page.h
> index 7e2a3dd4b90a..1359eb74cde4 100644
> --- a/lib/x86/asm/page.h
> +++ b/lib/x86/asm/page.h
> @@ -36,10 +36,18 @@ typedef unsigned long pgd_t;
>  #define PT64_NX_MASK		(1ull << 63)
>  #define PT_ADDR_MASK		GENMASK_ULL(51, 12)
>
> +#define PDPTE64_PAGE_SIZE_MASK	(1ull << 7)
> +#define PDPTE64_RSVD_MASK	GENMASK_ULL(51, cpuid_maxphyaddr())
> +
>  #define PT_AD_MASK		(PT_ACCESSED_MASK | PT_DIRTY_MASK)
>
> +#define PAE_PDPTE_RSVD_MASK	(GENMASK_ULL(63, cpuid_maxphyaddr()) |	\
> +				 GENMASK_ULL(8, 5) | GENMASK_ULL(2, 1))
> +
> +
>  #ifdef __x86_64__
>  #define PAGE_LEVEL	4
> +#define PDPT_LEVEL	3
>  #define PGDIR_WIDTH	9
>  #define PGDIR_MASK	511
>  #else
> diff --git a/x86/vmx_tests.c b/x86/vmx_tests.c
> index 32e3d4f47b33..372e5efb6b5f 100644
> --- a/x86/vmx_tests.c
> +++ b/x86/vmx_tests.c
> @@ -5250,6 +5250,176 @@ static void vmx_mtf_test(void)
>  	enter_guest();
>  }
>
> +extern char vmx_mtf_pdpte_guest_begin;
> +extern char vmx_mtf_pdpte_guest_end;
> +
> +asm("vmx_mtf_pdpte_guest_begin:\n\t"
> +    "mov %cr0, %rax\n\t"    /* save CR0 with PG=1 */
> +    "vmcall\n\t"            /* on return from this CR0.PG=0 */
> +    "mov %rax, %cr0\n\t"    /* restore CR0.PG=1 to enter PAE mode */
> +    "vmcall\n\t"
> +    "retq\n\t"
> +    "vmx_mtf_pdpte_guest_end:");
> +
> +static void vmx_mtf_pdpte_test(void)
> +{
> +	void *test_mtf_pdpte_guest;
> +	pteval_t *pdpt;
> +	u32 guest_ar_cs;
> +	u64 guest_efer;
> +	pteval_t *pte;
> +	u64 guest_cr0;
> +	u64 guest_cr3;
> +	u64 guest_cr4;
> +	u64 ent_ctls;
> +	int i;
> +
> +	if (setup_ept(false))
> +		return;
> +
> +	if (!(ctrl_cpu_rev[0].clr & CPU_MTF)) {
> +		printf("CPU does not support 'monitor trap flag.'\n");
> +		return;
> +	}
> +
> +	if (!(ctrl_cpu_rev[1].clr & CPU_URG)) {
> +		printf("CPU does not support 'unrestricted guest.'\n");
> +		return;
> +	}
> +
> +	vmcs_write(EXC_BITMAP, ~0);
> +	vmcs_write(CPU_EXEC_CTRL1, vmcs_read(CPU_EXEC_CTRL1) | CPU_URG);
> +
> +	/*
> +	 * Copy the guest code to an identity-mapped page.
> +	 */
> +	test_mtf_pdpte_guest = alloc_page();
> +	memcpy(test_mtf_pdpte_guest, &vmx_mtf_pdpte_guest_begin,
> +	       &vmx_mtf_pdpte_guest_end - &vmx_mtf_pdpte_guest_begin);
> +
> +	test_set_guest(test_mtf_pdpte_guest);
> +
> +	enter_guest();
> +	skip_exit_vmcall();
> +
> +	/*
> +	 * Put the guest in non-paged 32-bit protected mode, ready to enter
> +	 * PAE mode when CR0.PG is set. CR4.PAE will already have been set
> +	 * when the guest started out in long mode.
> +	 */
> +	ent_ctls = vmcs_read(ENT_CONTROLS);
> +	vmcs_write(ENT_CONTROLS, ent_ctls & ~ENT_GUEST_64);
> +
> +	guest_efer = vmcs_read(GUEST_EFER);
> +	vmcs_write(GUEST_EFER, guest_efer & ~(EFER_LMA | EFER_LME));
> +
> +	/*
> +	 * Set CS access rights bits for 32-bit protected mode:
> +	 *   3:0    B execute/read/accessed
> +	 *   4      1 code or data
> +	 *   6:5    0 descriptor privilege level
> +	 *   7      1 present
> +	 *   11:8   0 reserved
> +	 *   12     0 available for use by system software
> +	 *   13     0 64 bit mode not active
> +	 *   14     1 default operation size 32-bit segment
> +	 *   15     1 page granularity: segment limit in 4K units
> +	 *   16     0 segment usable
> +	 *   31:17  0 reserved
> +	 */
> +	guest_ar_cs = vmcs_read(GUEST_AR_CS);
> +	vmcs_write(GUEST_AR_CS, 0xc09b);
> +
> +	guest_cr0 = vmcs_read(GUEST_CR0);
> +	vmcs_write(GUEST_CR0, guest_cr0 & ~X86_CR0_PG);
> +
> +	guest_cr4 = vmcs_read(GUEST_CR4);
> +	vmcs_write(GUEST_CR4, guest_cr4 & ~X86_CR4_PCIDE);
> +
> +	guest_cr3 = vmcs_read(GUEST_CR3);
> +
> +	/*
> +	 * Turn the 4-level page table into a PAE page table by following the 0th
> +	 * PML4 entry to a PDPT page, and grab the first four PDPTEs from that
> +	 * page.
> +	 *
> +	 * Why does this work?
> +	 *
> +	 * PAE uses 32-bit addressing which implies:
> +	 *   Bits 11:0   page offset
> +	 *   Bits 20:12  entry into 512-entry page table
> +	 *   Bits 29:21  entry into a 512-entry directory table
> +	 *   Bits 31:30  entry into the page directory pointer table.
> +	 *   Bits 63:32  zero
> +	 *
> +	 * As only 2 bits are needed to select the PDPTEs for the entire
> +	 * 32-bit address space, take the first 4 PDPTEs in the level 3 page
> +	 * directory pointer table. It doesn't matter which of these PDPTEs
> +	 * are present because they must cover the guest code given that it
> +	 * has already run successfully.
> +	 *
> +	 * Get a pointer to PTE for GVA=0 in the page directory pointer table
> +	 */
> +	pte = get_pte_level((pgd_t *)(guest_cr3 & ~X86_CR3_PCID_MASK), 0, PDPT_LEVEL);
> +
> +	/*
> +	 * Need some memory for the 4-entry PAE page directory pointer
> +	 * table. Use the end of the identity-mapped page where the guest code
> +	 * is stored. There is definitely space as the guest code is only a
> +	 * few bytes.
> +	 */
> +	pdpt = test_mtf_pdpte_guest + PAGE_SIZE - 4 * sizeof(pteval_t);
> +
> +	/*
> +	 * Copy the first four PDPTEs into the PAE page table with reserved
> +	 * bits cleared. Note that permission bits from the PML4E and PDPTE
> +	 * are not propagated.
> +	 */
> +	for (i = 0; i < 4; i++) {
> +		TEST_ASSERT_EQ_MSG(0, (pte[i] & PDPTE64_RSVD_MASK),
> +				   "PDPTE has invalid reserved bits");
> +		TEST_ASSERT_EQ_MSG(0, (pte[i] & PDPTE64_PAGE_SIZE_MASK),
> +				   "Cannot use 1GB super pages for PAE");
> +		pdpt[i] = pte[i] & ~(PAE_PDPTE_RSVD_MASK);
> +	}
> +	vmcs_write(GUEST_CR3, virt_to_phys(pdpt));
> +
> +	enable_mtf();
> +	enter_guest();

This entry failed on my bare-metal machine:

  Test suite: vmx_mtf_pdpte_test
  VM-Exit failure on vmresume (reason=0x80000021, qual=0): invalid guest state

Any idea why?
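P.S. For anyone skimming the quoted "Why does this work?" comment, here is a
tiny standalone C sketch (mine, not from the patch) of the 32-bit PAE
linear-address split it relies on: only two address bits ever index the PDPT,
which is why copying the first four PDPTEs is sufficient. The example address
is arbitrary.

	/*
	 * Illustration only: decompose a 32-bit linear address the way
	 * PAE paging does (2 / 9 / 9 / 12 bit split).
	 */
	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint32_t la = 0x12345678;                   /* arbitrary linear address */
		unsigned int pdpt_idx = (la >> 30) & 0x3;   /* bits 31:30 -> one of 4 PDPTEs */
		unsigned int pd_idx   = (la >> 21) & 0x1ff; /* bits 29:21 -> one of 512 PDEs */
		unsigned int pt_idx   = (la >> 12) & 0x1ff; /* bits 20:12 -> one of 512 PTEs */
		unsigned int offset   = la & 0xfff;         /* bits 11:0  -> page offset */

		printf("PDPT index %u, PD index %u, PT index %u, offset 0x%x\n",
		       pdpt_idx, pd_idx, pt_idx, offset);
		return 0;
	}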