On Mon, Oct 14, 2019 at 10:59 AM Sean Christopherson
<sean.j.christopherson@xxxxxxxxx> wrote:
>
> On Thu, Oct 10, 2019 at 04:28:19PM -0700, Jim Mattson wrote:
> > If the "virtualize APIC accesses" VM-execution control is set in the
> > VMCS, the APIC virtualization hardware is triggered when a page walk
> > in VMX non-root mode terminates at a PTE wherein the address of the 4k
> > page frame matches the APIC-access address specified in the VMCS. On
> > hardware, the APIC-access address may be any valid 4k-aligned physical
> > address.
> >
> > KVM's nVMX implementation enforces the additional constraint that the
> > APIC-access address specified in the vmcs12 must be backed by
> > cacheable memory in L1. If not, L0 will simply clear the "virtualize
> > APIC accesses" VM-execution control in the vmcs02.
> >
> > The problem with this approach is that the L1 guest has arranged the
> > vmcs12 EPT tables--or shadow page tables, if the "enable EPT"
> > VM-execution control is clear in the vmcs12--so that the L2 guest
> > physical address(es)--or L2 guest linear address(es)--that reference
> > the L2 APIC map to the APIC-access address specified in the
> > vmcs12. Without the "virtualize APIC accesses" VM-execution control in
> > the vmcs02, the APIC accesses in the L2 guest will directly access the
> > APIC-access page in L1.
> >
> > When there is no mapping whatsoever for the APIC-access address in L1,
> > the L2 VM just loses the intended APIC virtualization. However, when
> > the APIC-access address is mapped to an MMIO region in L1, the L2
> > guest gets direct access to the L1 MMIO device. For example, if the
> > APIC-access address specified in the vmcs12 is 0xfee00000, then L2
> > gets direct access to L1's APIC.
> >
> > Since this vmcs12 configuration is something that KVM cannot
> > faithfully emulate, the appropriate response is to exit to userspace
> > with KVM_INTERNAL_ERROR_EMULATION.
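[ For reference: from userspace's point of view, the new behavior surfaces
  as a KVM_EXIT_INTERNAL_ERROR exit from KVM_RUN. A minimal sketch of the
  relevant check in a VMM's run loop, assuming "vcpu_fd" and the mmap'ed
  "run" structure have already been set up; the constants and kvm_run
  fields are from <linux/kvm.h>: ]

        #include <linux/kvm.h>
        #include <stdio.h>
        #include <sys/ioctl.h>

        /* Returns 0 to keep running the vCPU, -1 on an unrecoverable exit. */
        static int run_vcpu_once(int vcpu_fd, struct kvm_run *run)
        {
                if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
                        return -1;

                if (run->exit_reason == KVM_EXIT_INTERNAL_ERROR &&
                    run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
                        /* A guest configuration KVM cannot faithfully emulate. */
                        fprintf(stderr, "KVM: internal emulation error\n");
                        return -1;
                }
                return 0;
        }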
> >
> > Fixes: fe3ef05c7572 ("KVM: nVMX: Prepare vmcs02 from vmcs01 and vmcs12")
> > Reported-by: Dan Cross <dcross@xxxxxxxxxx>
> > Signed-off-by: Jim Mattson <jmattson@xxxxxxxxxx>
> > Reviewed-by: Peter Shier <pshier@xxxxxxxxxx>
> > Change-Id: Ib501fe63266c1d831ce4d1d55e8688bc34a6844a
> > ---
> > v2 -> v3: Added default case to new switch in nested_vmx_run
> > v1 -> v2: Added enum enter_vmx_status
> >
> >  arch/x86/include/asm/kvm_host.h |  2 +-
> >  arch/x86/kvm/vmx/nested.c       | 68 +++++++++++++++++++--------------
> >  arch/x86/kvm/vmx/nested.h       | 13 ++++++-
> >  arch/x86/kvm/x86.c              |  8 +++-
> >  4 files changed, 59 insertions(+), 32 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 5d8056ff7390..0dee68560437 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -1186,7 +1186,7 @@ struct kvm_x86_ops {
> >         int (*set_nested_state)(struct kvm_vcpu *vcpu,
> >                                 struct kvm_nested_state __user *user_kvm_nested_state,
> >                                 struct kvm_nested_state *kvm_state);
> > -       void (*get_vmcs12_pages)(struct kvm_vcpu *vcpu);
> > +       bool (*get_vmcs12_pages)(struct kvm_vcpu *vcpu);
> >
> >         int (*smi_allowed)(struct kvm_vcpu *vcpu);
> >         int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
> > diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> > index 5e231da00310..88b2f08aaaae 100644
> > --- a/arch/x86/kvm/vmx/nested.c
> > +++ b/arch/x86/kvm/vmx/nested.c
> > @@ -2927,7 +2927,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
> >  static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
> >                                                  struct vmcs12 *vmcs12);
> >
> > -static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
> > +static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
> >  {
> >         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> >         struct vcpu_vmx *vmx = to_vmx(vcpu);
> > @@ -2947,19 +2947,18 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
> >                         vmx->nested.apic_access_page = NULL;
> >                 }
> >                 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
> > -               /*
> > -                * If translation failed, no matter: This feature asks
> > -                * to exit when accessing the given address, and if it
> > -                * can never be accessed, this feature won't do
> > -                * anything anyway.
> > -                */
> >                 if (!is_error_page(page)) {
> >                         vmx->nested.apic_access_page = page;
> >                         hpa = page_to_phys(vmx->nested.apic_access_page);
> >                         vmcs_write64(APIC_ACCESS_ADDR, hpa);
> >                 } else {
> > -                       secondary_exec_controls_clearbit(vmx,
> > -                               SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
> > +                       pr_debug_ratelimited("%s: non-cacheable APIC-access address in vmcs12\n",
> > +                                            __func__);
>
> Hmm, "non-cacheable" is confusing, especially in the context of the APIC,
> which needs to be mapped "uncacheable".  Maybe just "invalid"?

"Invalid" is not correct. L1 MMIO addresses are valid; they're just not
cacheable. Perhaps: "vmcs12 APIC-access address references a page not
backed by a memslot in L1"?
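[ To illustrate "backed by a memslot": kvm_vcpu_gpa_to_page() only succeeds
  for an L1 GPA that falls inside a memory region userspace has registered
  with KVM_SET_USER_MEMORY_REGION. A sketch, where "vm_fd", slot 0, and the
  GPA argument are hypothetical placeholders: ]

        #include <linux/kvm.h>
        #include <sys/ioctl.h>
        #include <sys/mman.h>

        /*
         * Back 4k of guest-physical address space with anonymous memory.
         * A GPA covered by such a region is "backed by a memslot"; an L1
         * MMIO GPA, which has no memslot, is not, so the gpa-to-page
         * translation above fails for it.
         */
        static int back_gpa_with_memslot(int vm_fd, __u64 gpa)
        {
                void *backing = mmap(NULL, 0x1000, PROT_READ | PROT_WRITE,
                                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
                struct kvm_userspace_memory_region region = {
                        .slot = 0,
                        .guest_phys_addr = gpa,
                        .memory_size = 0x1000,
                        .userspace_addr = (__u64)backing,
                };

                if (backing == MAP_FAILED)
                        return -1;
                return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
        }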
> > +                       vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
> > +                       vcpu->run->internal.suberror =
> > +                               KVM_INTERNAL_ERROR_EMULATION;
> > +                       vcpu->run->internal.ndata = 0;
> > +                       return false;
> >                 }
> >         }
> >
> > @@ -3004,6 +3003,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
> >                 exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
> >         else
> >                 exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
> > +       return true;
> >  }
> >
> >  /*
> > @@ -3042,13 +3042,15 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
> >  /*
> >   * If from_vmentry is false, this is being called from state restore (either RSM
> >   * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume.
> > -+ *
> > -+ * Returns:
> > -+ *   0 - success, i.e. proceed with actual VMEnter
> > -+ *   1 - consistency check VMExit
> > -+ *  -1 - consistency check VMFail
> > + *
> > + * Returns:
> > + *   ENTER_VMX_SUCCESS: Successfully entered VMX non-root mode
>
> "Enter VMX" usually refers to VMXON, e.g. the title of VMXON in the SDM is
> "Enter VMX Operation".
>
> Maybe NVMX_ENTER_NON_ROOT_?

How about NESTED_VMX_ENTER_NON_ROOT_MODE_STATUS_?

> > + *   ENTER_VMX_VMFAIL: Consistency check VMFail
> > + *   ENTER_VMX_VMEXIT: Consistency check VMExit
> > + *   ENTER_VMX_ERROR: KVM internal error
>
> Probably need to be more explicit than VMX_ERROR, e.g. all of the VM-Fail
> defines are prefixed with VMXERR_##.
>
> Maybe ENTER_VMX_KVM_ERROR?  (Or NVMX_ENTER_NON_ROOT_KVM_ERROR).

NESTED_VMX_ENTER_NON_ROOT_MODE_STATUS_KVM_INTERNAL_ERROR? (There's a
sketch of the full enum with your prefix at the end of this mail.)

> >   */
> > -int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
> > +enum enter_vmx_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
> > +                                                    bool from_vmentry)
> >  {
> >         struct vcpu_vmx *vmx = to_vmx(vcpu);
> >         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> > @@ -3091,11 +3093,12 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
> >         prepare_vmcs02_early(vmx, vmcs12);
> >
> >         if (from_vmentry) {
> > -               nested_get_vmcs12_pages(vcpu);
> > +               if (unlikely(!nested_get_vmcs12_pages(vcpu)))
> > +                       return ENTER_VMX_ERROR;
> >
> >                 if (nested_vmx_check_vmentry_hw(vcpu)) {
> >                         vmx_switch_vmcs(vcpu, &vmx->vmcs01);
> > -                       return -1;
> > +                       return ENTER_VMX_VMFAIL;
> >                 }
> >
> >                 if (nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
> > @@ -3159,7 +3162,7 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
> >          * returned as far as L1 is concerned. It will only return (and set
> >          * the success flag) when L2 exits (see nested_vmx_vmexit()).
> >          */
> > -       return 0;
> > +       return ENTER_VMX_SUCCESS;
> >
> >         /*
> >          * A failed consistency check that leads to a VMExit during L1's
> > @@ -3175,14 +3178,14 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
> >         vmx_switch_vmcs(vcpu, &vmx->vmcs01);
> >
> >         if (!from_vmentry)
> > -               return 1;
> > +               return ENTER_VMX_VMEXIT;
> >
> >         load_vmcs12_host_state(vcpu, vmcs12);
> >         vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
> >         vmcs12->exit_qualification = exit_qual;
> >         if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
> >                 vmx->nested.need_vmcs12_to_shadow_sync = true;
> > -       return 1;
> > +       return ENTER_VMX_VMEXIT;
> >  }
> >
> >  /*
> > @@ -3192,9 +3195,9 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
> >  static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
> >  {
> >         struct vmcs12 *vmcs12;
> > +       enum enter_vmx_status status;
> >         struct vcpu_vmx *vmx = to_vmx(vcpu);
> >         u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
> > -       int ret;
> >
> >         if (!nested_vmx_check_permission(vcpu))
> >                 return 1;
> > @@ -3254,13 +3257,22 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
> >          * the nested entry.
> >          */
> >         vmx->nested.nested_run_pending = 1;
> > -       ret = nested_vmx_enter_non_root_mode(vcpu, true);
> > -       vmx->nested.nested_run_pending = !ret;
> > -       if (ret > 0)
> > -               return 1;
> > -       else if (ret)
> > -               return nested_vmx_failValid(vcpu,
> > -                       VMXERR_ENTRY_INVALID_CONTROL_FIELD);
> > +       status = nested_vmx_enter_non_root_mode(vcpu, true);
>
> What if we use a goto to bury the error handling at the end?  That'd also
> provide some flexibility with respect to handling each failure, e.g.:
>
>         vmx->nested.nested_run_pending = 1;
>         status = nested_vmx_enter_non_root_mode(vcpu, true);
>         if (status != ENTER_VMX_SUCCESS)
>                 goto vmenter_failed;
>
>         ...
>
>         return 1;
>
> vmenter_failed:
>         vmx->nested.nested_run_pending = 0;
>         if (status == ENTER_VMX_VMFAIL)
>                 return nested_vmx_failValid(vcpu,
>                         VMXERR_ENTRY_INVALID_CONTROL_FIELD);
>
>         return status == ENTER_VMX_ERROR ? 0 : 1;
>
> or
>
>         vmx->nested.nested_run_pending = 1;
>         status = nested_vmx_enter_non_root_mode(vcpu, true);
>         if (status != ENTER_VMX_SUCCESS)
>                 goto vmenter_failed;
>
>         ...
>
>         return 1;
>
> vmenter_failed:
>         vmx->nested.nested_run_pending = 0;
>         if (status == ENTER_VMX_ERROR)
>                 return 0;
>         if (status == ENTER_VMX_VMEXIT)
>                 return 1;
>
>         WARN_ON_ONCE(status != ENTER_VMX_VMFAIL);
>         return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);

Sounds good. Thanks!

> > +       if (status != ENTER_VMX_SUCCESS) {
> > +               vmx->nested.nested_run_pending = 0;
> > +               switch (status) {
> > +               case ENTER_VMX_VMFAIL:
> > +                       return nested_vmx_failValid(vcpu,
> > +                               VMXERR_ENTRY_INVALID_CONTROL_FIELD);
> > +               case ENTER_VMX_VMEXIT:
> > +                       return 1;
> > +               case ENTER_VMX_ERROR:
> > +                       return 0;
> > +               default:
> > +                       WARN_ON_ONCE(1);
> > +                       break;
> > +               }
> > +       }
> >
> >         /* Hide L1D cache contents from the nested guest. */
> >         vmx->vcpu.arch.l1tf_flush_l1d = true;
> >
> > diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
> > index 187d39bf0bf1..07cf5cef86f6 100644
> > --- a/arch/x86/kvm/vmx/nested.h
> > +++ b/arch/x86/kvm/vmx/nested.h
> > @@ -6,6 +6,16 @@
> >  #include "vmcs12.h"
> >  #include "vmx.h"
> >
> > +/*
> > + * Status returned by nested_vmx_enter_non_root_mode():
> > + */
> > +enum enter_vmx_status {
> > +       ENTER_VMX_SUCCESS,      /* Successfully entered VMX non-root mode */
> > +       ENTER_VMX_VMFAIL,       /* Consistency check VMFail */
> > +       ENTER_VMX_VMEXIT,       /* Consistency check VMExit */
> > +       ENTER_VMX_ERROR,        /* KVM internal error */
> > +};
> > +
> >  void vmx_leave_nested(struct kvm_vcpu *vcpu);
> >  void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps,
> >                                 bool apicv);
> > @@ -13,7 +23,8 @@ void nested_vmx_hardware_unsetup(void);
> >  __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *));
> >  void nested_vmx_vcpu_setup(void);
> >  void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu);
> > -int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry);
> > +enum enter_vmx_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
> > +                                                    bool from_vmentry);
> >  bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason);
> >  void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
> >                        u32 exit_intr_info, unsigned long exit_qualification);
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index f26f8be4e621..627fd7ff3a28 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -7937,8 +7937,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> >         bool req_immediate_exit = false;
> >
> >         if (kvm_request_pending(vcpu)) {
> > -               if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu))
> > -                       kvm_x86_ops->get_vmcs12_pages(vcpu);
> > +               if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) {
> > +                       if (unlikely(!kvm_x86_ops->get_vmcs12_pages(vcpu))) {
> > +                               r = 0;
> > +                               goto out;
> > +                       }
> > +               }
> >                 if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
> >                         kvm_mmu_unload(vcpu);
> >                 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
> > --
> > 2.23.0.581.g78d2f28ef7-goog
> >
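[ For concreteness, applying the NVMX_ENTER_NON_ROOT_ prefix suggested
  above would give something like the following; a sketch only, since the
  final names are still under discussion: ]

        enum nvmx_enter_non_root_status {
                NVMX_ENTER_NON_ROOT_SUCCESS,    /* Entered VMX non-root mode */
                NVMX_ENTER_NON_ROOT_VMFAIL,     /* Consistency check VMFail */
                NVMX_ENTER_NON_ROOT_VMEXIT,     /* Consistency check VMExit */
                NVMX_ENTER_NON_ROOT_KVM_ERROR,  /* KVM internal error */
        };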