On Tue, Oct 16, 2018 at 9:50 AM Vitaly Kuznetsov <vkuznets@xxxxxxxxxx> wrote: > > Per Hyper-V TLFS 5.0b: > > "The L1 hypervisor may choose to use enlightened VMCSs by writing 1 to > the corresponding field in the VP assist page (see section 7.8.7). > Another field in the VP assist page controls the currently active > enlightened VMCS. Each enlightened VMCS is exactly one page (4 KB) in > size and must be initially zeroed. No VMPTRLD instruction must be > executed to make an enlightened VMCS active or current. > > After the L1 hypervisor performs a VM entry with an enlightened VMCS, > the VMCS is considered active on the processor. An enlightened VMCS > can only be active on a single processor at the same time. The L1 > hypervisor can execute a VMCLEAR instruction to transition an > enlightened VMCS from the active to the non-active state. Any VMREAD > or VMWRITE instructions while an enlightened VMCS is active is > unsupported and can result in unexpected behavior." > > Keep Enlightened VMCS structure for the current L2 guest permanently mapped > from struct nested_vmx instead of mapping it every time. > > Suggested-by: Ladi Prosek <lprosek@xxxxxxxxxx> > Signed-off-by: Vitaly Kuznetsov <vkuznets@xxxxxxxxxx> > --- > arch/x86/kvm/vmx.c | 115 ++++++++++++++++++++++++++++++++++++++++++--- > 1 file changed, 108 insertions(+), 7 deletions(-) > > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > index aebd008ccccb..cfb44acd4291 100644 > --- a/arch/x86/kvm/vmx.c > +++ b/arch/x86/kvm/vmx.c > @@ -20,6 +20,7 @@ > #include "mmu.h" > #include "cpuid.h" > #include "lapic.h" > +#include "hyperv.h" > > #include <linux/kvm_host.h> > #include <linux/module.h> > @@ -889,6 +890,8 @@ struct nested_vmx { > bool guest_mode; > } smm; > > + gpa_t hv_evmcs_vmptr; > + struct page *hv_evmcs_page; > struct hv_enlightened_vmcs *hv_evmcs; > }; > > @@ -8111,11 +8114,13 @@ static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu) > static int nested_vmx_failValid(struct kvm_vcpu *vcpu, > u32 vm_instruction_error) > { > + struct vcpu_vmx *vmx = to_vmx(vcpu); > + > /* > * failValid writes the error number to the current VMCS, which > * can't be done if there isn't a current VMCS. > */ > - if (to_vmx(vcpu)->nested.current_vmptr == -1ull) > + if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs) > return nested_vmx_failInvalid(vcpu); > > vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) > @@ -8441,6 +8446,20 @@ static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) > vmcs_write64(VMCS_LINK_POINTER, -1ull); > } > > +static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) > +{ > + struct vcpu_vmx *vmx = to_vmx(vcpu); > + > + if (!vmx->nested.hv_evmcs) > + return; > + > + kunmap(vmx->nested.hv_evmcs_page); > + kvm_release_page_dirty(vmx->nested.hv_evmcs_page); > + vmx->nested.hv_evmcs_vmptr = -1ull; > + vmx->nested.hv_evmcs_page = NULL; > + vmx->nested.hv_evmcs = NULL; > +} > + > static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) > { > struct vcpu_vmx *vmx = to_vmx(vcpu); > @@ -8509,6 +8528,8 @@ static void free_nested(struct kvm_vcpu *vcpu) > > kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); > > + nested_release_evmcs(vcpu); > + > free_loaded_vmcs(&vmx->nested.vmcs02); > } > > @@ -8542,12 +8563,18 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) > return nested_vmx_failValid(vcpu, > VMXERR_VMCLEAR_VMXON_POINTER); > > - if (vmptr == vmx->nested.current_vmptr) > - nested_release_vmcs12(vcpu); > + if (vmx->nested.hv_evmcs_page) { > + if (vmptr == vmx->nested.hv_evmcs_vmptr) > + nested_release_evmcs(vcpu); > + } else { > + if (vmptr == vmx->nested.current_vmptr) > + nested_release_vmcs12(vcpu); > > - kvm_vcpu_write_guest(vcpu, > - vmptr + offsetof(struct vmcs12, launch_state), > - &zero, sizeof(zero)); > + kvm_vcpu_write_guest(vcpu, > + vmptr + offsetof(struct vmcs12, > + launch_state), > + &zero, sizeof(zero)); > + } > > return nested_vmx_succeed(vcpu); > } > @@ -8637,6 +8664,8 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) > struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; > struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; > > + vmcs12->hdr.revision_id = evmcs->revision_id; > + > /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ > vmcs12->tpr_threshold = evmcs->tpr_threshold; > vmcs12->guest_rip = evmcs->guest_rip; > @@ -9268,6 +9297,10 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) > return nested_vmx_failValid(vcpu, > VMXERR_VMPTRLD_VMXON_POINTER); > > + /* Forbid normal VMPTRLD if Enlightened version was used */ > + if (vmx->nested.hv_evmcs) > + return 1; > + > if (vmx->nested.current_vmptr != vmptr) { > struct vmcs12 *new_vmcs12; > struct page *page; > @@ -9301,6 +9334,68 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) > return nested_vmx_succeed(vcpu); > } > > +/* > + * This is an equivalent of the nested hypervisor executing the vmptrld > + * instruction. > + */ > +static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu) > +{ > + struct vcpu_vmx *vmx = to_vmx(vcpu); > + struct hv_vp_assist_page assist_page; > + > + if (likely(!vmx->nested.enlightened_vmcs_enabled)) > + return 1; > + > + if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page))) > + return 1; > + > + if (unlikely(!assist_page.enlighten_vmentry)) > + return 1; > + > + if (unlikely(assist_page.current_nested_vmcs != > + vmx->nested.hv_evmcs_vmptr)) { > + > + if (!vmx->nested.hv_evmcs) > + vmx->nested.current_vmptr = -1ull; > + > + nested_release_evmcs(vcpu); > + > + vmx->nested.hv_evmcs_page = kvm_vcpu_gpa_to_page( > + vcpu, assist_page.current_nested_vmcs); > + > + if (unlikely(is_error_page(vmx->nested.hv_evmcs_page))) > + return 0; > + > + vmx->nested.hv_evmcs = kmap(vmx->nested.hv_evmcs_page); Are you sure that directly mapping guest memory isn't going to lead to time-of-check vs. time-of-use bugs? This is a very hard programming model to get right.