> On Oct 10, 2020, at 1:17 AM, Nadav Amit <nadav.amit@xxxxxxxxx> wrote:
>
>> On Aug 17, 2020, at 5:25 PM, Peter Shier <pshier@xxxxxxxxxx> wrote:
>>
>> Verify that when L2 guest enables PAE paging and L0 intercept of L2
>> MOV to CR0 reflects MTF exit to L1, subsequent resume to L2 correctly
>> preserves PDPTE array specified by L2 CR3.
>>
>> Signed-off-by: Jim Mattson <jmattson@xxxxxxxxxx>
>> Reviewed-by: Peter Shier <pshier@xxxxxxxxxx>
>> Signed-off-by: Peter Shier <pshier@xxxxxxxxxx>
>> ---
>>  lib/x86/asm/page.h |   8 +++
>>  x86/vmx_tests.c    | 171 +++++++++++++++++++++++++++++++++++++++++++++
>>  2 files changed, 179 insertions(+)
>>
>> diff --git a/lib/x86/asm/page.h b/lib/x86/asm/page.h
>> index 7e2a3dd4b90a..1359eb74cde4 100644
>> --- a/lib/x86/asm/page.h
>> +++ b/lib/x86/asm/page.h
>> @@ -36,10 +36,18 @@ typedef unsigned long pgd_t;
>>  #define PT64_NX_MASK		(1ull << 63)
>>  #define PT_ADDR_MASK		GENMASK_ULL(51, 12)
>>
>> +#define PDPTE64_PAGE_SIZE_MASK	(1ull << 7)
>> +#define PDPTE64_RSVD_MASK	GENMASK_ULL(51, cpuid_maxphyaddr())
>> +
>>  #define PT_AD_MASK		(PT_ACCESSED_MASK | PT_DIRTY_MASK)
>>
>> +#define PAE_PDPTE_RSVD_MASK	(GENMASK_ULL(63, cpuid_maxphyaddr()) | \
>> +				 GENMASK_ULL(8, 5) | GENMASK_ULL(2, 1))
>> +
>> +
>>  #ifdef __x86_64__
>>  #define PAGE_LEVEL	4
>> +#define PDPT_LEVEL	3
>>  #define PGDIR_WIDTH	9
>>  #define PGDIR_MASK	511
>>  #else
>> diff --git a/x86/vmx_tests.c b/x86/vmx_tests.c
>> index 32e3d4f47b33..372e5efb6b5f 100644
>> --- a/x86/vmx_tests.c
>> +++ b/x86/vmx_tests.c
>> @@ -5250,6 +5250,176 @@ static void vmx_mtf_test(void)
>>  	enter_guest();
>>  }
>>
>> +extern char vmx_mtf_pdpte_guest_begin;
>> +extern char vmx_mtf_pdpte_guest_end;
>> +
>> +asm("vmx_mtf_pdpte_guest_begin:\n\t"
>> +    "mov %cr0, %rax\n\t"    /* save CR0 with PG=1 */
>> +    "vmcall\n\t"            /* on return from this CR0.PG=0 */
>> +    "mov %rax, %cr0\n\t"    /* restore CR0.PG=1 to enter PAE mode */
>> +    "vmcall\n\t"
>> +    "retq\n\t"
>> +    "vmx_mtf_pdpte_guest_end:");
>> +
>> +static void vmx_mtf_pdpte_test(void)
>> +{
>> +	void *test_mtf_pdpte_guest;
>> +	pteval_t *pdpt;
>> +	u32 guest_ar_cs;
>> +	u64 guest_efer;
>> +	pteval_t *pte;
>> +	u64 guest_cr0;
>> +	u64 guest_cr3;
>> +	u64 guest_cr4;
>> +	u64 ent_ctls;
>> +	int i;
>> +
>> +	if (setup_ept(false))
>> +		return;
>> +
>> +	if (!(ctrl_cpu_rev[0].clr & CPU_MTF)) {
>> +		printf("CPU does not support 'monitor trap flag.'\n");
>> +		return;
>> +	}
>> +
>> +	if (!(ctrl_cpu_rev[1].clr & CPU_URG)) {
>> +		printf("CPU does not support 'unrestricted guest.'\n");
>> +		return;
>> +	}
>> +
>> +	vmcs_write(EXC_BITMAP, ~0);
>> +	vmcs_write(CPU_EXEC_CTRL1, vmcs_read(CPU_EXEC_CTRL1) | CPU_URG);
>> +
>> +	/*
>> +	 * Copy the guest code to an identity-mapped page.
>> +	 */
>> +	test_mtf_pdpte_guest = alloc_page();
>> +	memcpy(test_mtf_pdpte_guest, &vmx_mtf_pdpte_guest_begin,
>> +	       &vmx_mtf_pdpte_guest_end - &vmx_mtf_pdpte_guest_begin);
>> +
>> +	test_set_guest(test_mtf_pdpte_guest);
>> +
>> +	enter_guest();
>> +	skip_exit_vmcall();
>> +
>> +	/*
>> +	 * Put the guest in non-paged 32-bit protected mode, ready to enter
>> +	 * PAE mode when CR0.PG is set. CR4.PAE will already have been set
>> +	 * when the guest started out in long mode.
>> +	 */
>> +	ent_ctls = vmcs_read(ENT_CONTROLS);
>> +	vmcs_write(ENT_CONTROLS, ent_ctls & ~ENT_GUEST_64);
>> +
>> +	guest_efer = vmcs_read(GUEST_EFER);
>> +	vmcs_write(GUEST_EFER, guest_efer & ~(EFER_LMA | EFER_LME));
>> +
>> +	/*
>> +	 * Set CS access rights bits for 32-bit protected mode:
>> +	 *   3:0    B	execute/read/accessed
>> +	 *   4      1	code or data
>> +	 *   6:5    0	descriptor privilege level
>> +	 *   7      1	present
>> +	 *   11:8   0	reserved
>> +	 *   12     0	available for use by system software
>> +	 *   13     0	64 bit mode not active
>> +	 *   14     1	default operation size 32-bit segment
>> +	 *   15     1	page granularity: segment limit in 4K units
>> +	 *   16     0	segment usable
>> +	 *   31:17  0	reserved
>> +	 */
>> +	guest_ar_cs = vmcs_read(GUEST_AR_CS);
>> +	vmcs_write(GUEST_AR_CS, 0xc09b);
>> +
>> +	guest_cr0 = vmcs_read(GUEST_CR0);
>> +	vmcs_write(GUEST_CR0, guest_cr0 & ~X86_CR0_PG);
>> +
>> +	guest_cr4 = vmcs_read(GUEST_CR4);
>> +	vmcs_write(GUEST_CR4, guest_cr4 & ~X86_CR4_PCIDE);
>> +
>> +	guest_cr3 = vmcs_read(GUEST_CR3);
>> +
>> +	/*
>> +	 * Turn the 4-level page table into a PAE page table by following the 0th
>> +	 * PML4 entry to a PDPT page, and grab the first four PDPTEs from that
>> +	 * page.
>> +	 *
>> +	 * Why does this work?
>> +	 *
>> +	 * PAE uses 32-bit addressing which implies:
>> +	 *   Bits 11:0   page offset
>> +	 *   Bits 20:12  entry into 512-entry page table
>> +	 *   Bits 29:21  entry into a 512-entry directory table
>> +	 *   Bits 31:30  entry into the page directory pointer table
>> +	 *   Bits 63:32  zero
>> +	 *
>> +	 * As only 2 bits are needed to select the PDPTEs for the entire
>> +	 * 32-bit address space, take the first 4 PDPTEs in the level 3 page
>> +	 * directory pointer table. It doesn't matter which of these PDPTEs
>> +	 * are present because they must cover the guest code given that it
>> +	 * has already run successfully.
>> +	 *
>> +	 * Get a pointer to PTE for GVA=0 in the page directory pointer table.
>> +	 */
>> +	pte = get_pte_level((pgd_t *)(guest_cr3 & ~X86_CR3_PCID_MASK), 0, PDPT_LEVEL);
>> +
>> +	/*
>> +	 * Need some memory for the 4-entry PAE page directory pointer
>> +	 * table. Use the end of the identity-mapped page where the guest code
>> +	 * is stored. There is definitely space as the guest code is only a
>> +	 * few bytes.
>> +	 */
>> +	pdpt = test_mtf_pdpte_guest + PAGE_SIZE - 4 * sizeof(pteval_t);
>> +
>> +	/*
>> +	 * Copy the first four PDPTEs into the PAE page table with reserved
>> +	 * bits cleared. Note that permission bits from the PML4E and PDPTE
>> +	 * are not propagated.
>> +	 */
>> +	for (i = 0; i < 4; i++) {
>> +		TEST_ASSERT_EQ_MSG(0, (pte[i] & PDPTE64_RSVD_MASK),
>> +				   "PDPTE has invalid reserved bits");
>> +		TEST_ASSERT_EQ_MSG(0, (pte[i] & PDPTE64_PAGE_SIZE_MASK),
>> +				   "Cannot use 1GB super pages for PAE");
>> +		pdpt[i] = pte[i] & ~(PAE_PDPTE_RSVD_MASK);
>> +	}
>> +	vmcs_write(GUEST_CR3, virt_to_phys(pdpt));
>> +
>> +	enable_mtf();
>> +	enter_guest();
>
> This entry failed on my bare-metal machine:
>
> Test suite: vmx_mtf_pdpte_test
> VM-Exit failure on vmresume (reason=0x80000021, qual=0): invalid guest state
>
> Any idea why?

I guess that the test makes an assumption that there are no addresses
greater than 4GB. When I reduce the size of the memory, the test passes.
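If that guess is right, the problem would be alloc_page() handing back a page
above 4GB on a large-memory machine: the test reuses that one page both for
the 32-bit guest code and for the PAE PDPT that GUEST_CR3 points at, and
neither is reachable once the physical address stops fitting in 32 bits. A
minimal, untested sketch of a guard (reusing virt_to_phys(), which the test
already calls, plus report_skip() from the lib) that would skip rather than
fail in that configuration:

	/*
	 * Untested sketch: the guest will run in 32-bit PAE mode, so the
	 * identity-mapped code page and the PDPT carved out of its tail
	 * must both lie below 4GB. Skip the test if the allocator hands
	 * back memory above that boundary.
	 */
	test_mtf_pdpte_guest = alloc_page();
	if (virt_to_phys(test_mtf_pdpte_guest) >= (1ull << 32)) {
		report_skip("Guest page is above 4GB; PAE cannot address it");
		return;
	}

Whether that is the exact consistency check being tripped on VM-entry would
still need confirming, but it would at least turn the hard failure into a
skip on hosts where the allocation lands high.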