On exit to L0 user-space, always exit from L2 to L1 and synchronize the
state properly for L1. This ensures that user-space only ever sees L1
state, and it also allows L1 to be saved and resumed properly. Obviously
horrible things will still happen to the L2 guest; that will be handled
in a separate patch.

There is only a single case which requires a bit of extra care: when the
decision to switch to user space happens while handling an L1
VMRESUME/VMLAUNCH (i.e. nested_run_pending). In order to handle this as
cleanly as possible without major restructuring, we simply do not exit
to user-space in this case and give L2 another chance to actually run.
We also request an immediate exit to ensure that the exit to user space
still happens shortly after L2 gets to run.

The only reason I can see for an exit to user space while L2 is running
is a pending signal; this is how user space preempts KVM_RUN in order to
save the state. L2 exits are either handled in the L0 kernel or
reflected to L1, and are never handled in L0 user-space.

Signed-off-by: KarimAllah Ahmed <karahmed@xxxxxxxxx>
---
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/vmx.c              | 39 +++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c              | 33 ++++++++++++++++++++++++++++-----
 3 files changed, 69 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 318a414..2c8be56 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -961,6 +961,8 @@ struct kvm_x86_ops {
 			       struct msr_bitmap_range *whitelist);
 
 	void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
+	void (*prepare_exit_user)(struct kvm_vcpu *vcpu);
+	bool (*allow_exit_user)(struct kvm_vcpu *vcpu);
 	void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
 	void (*vcpu_put)(struct kvm_vcpu *vcpu);
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 52539be..22eb0dc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2130,6 +2130,42 @@ static unsigned long segment_base(u16 selector)
 }
 #endif
 
+static bool vmx_allow_exit_user(struct kvm_vcpu *vcpu)
+{
+	return !to_vmx(vcpu)->nested.nested_run_pending;
+}
+
+static void vmx_prepare_exit_user(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (vmx->nested.current_vmptr == -1ull)
+		return;
+
+	/*
+	 * If L2 is running there is no need to update vmcs12 from the
+	 * shadow VMCS; just force an exit from L2 to L1.
+	 */
+	if (is_guest_mode(vcpu)) {
+		/*
+		 * Pretend that an external interrupt occurred while L2 is
+		 * running to cleanly exit into L1.
+		 */
+		nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
+
+		/* Switch from the L2 MMU to the L1 MMU */
+		kvm_mmu_reset_context(vcpu);
+	} else if (enable_shadow_vmcs) {
+		copy_shadow_to_vmcs12(vmx);
+	}
+
+	/* Flush vmcs12 to guest memory */
+	kvm_write_guest(vcpu->kvm, vmx->nested.current_vmptr,
+			get_vmcs12(vcpu), sizeof(*vmx->nested.cached_vmcs12));
+
+	return;
+}
+
 static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -12440,6 +12476,9 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 
 	.whitelist_msrs = vmx_whitelist_msrs,
 
+	.prepare_exit_user = vmx_prepare_exit_user,
+	.allow_exit_user = vmx_allow_exit_user,
+
 	.prepare_guest_switch = vmx_save_host_state,
 	.vcpu_load = vmx_vcpu_load,
 	.vcpu_put = vmx_vcpu_put,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2cfbf39..8256a2d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -996,6 +996,12 @@ bool kvm_rdpmc(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_rdpmc);
 
+static __always_inline bool should_exit_user(struct kvm_vcpu *vcpu)
+{
+	return signal_pending(current) && (kvm_x86_ops->allow_exit_user ?
+					   kvm_x86_ops->allow_exit_user(vcpu) : true);
+}
+
 /*
  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
@@ -7187,8 +7193,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
 		kvm_x86_ops->sync_pir_to_irr(vcpu);
 
-	if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu)
-	    || need_resched() || signal_pending(current)) {
+	if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) || need_resched()) {
 		vcpu->mode = OUTSIDE_GUEST_MODE;
 		smp_wmb();
 		local_irq_enable();
@@ -7198,6 +7203,20 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		goto cancel_injection;
 	}
 
+	if (signal_pending(current)) {
+		if (kvm_x86_ops->allow_exit_user &&
+		    kvm_x86_ops->allow_exit_user(vcpu)) {
+			vcpu->mode = OUTSIDE_GUEST_MODE;
+			smp_wmb();
+			local_irq_enable();
+			preempt_enable();
+			vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+			r = 1;
+			goto cancel_injection;
+		} else
+			req_immediate_exit = true;
+	}
+
 	kvm_load_guest_xcr0(vcpu);
 
 	if (req_immediate_exit) {
@@ -7364,7 +7383,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
 
 		kvm_check_async_pf_completion(vcpu);
 
-		if (signal_pending(current)) {
+		if (should_exit_user(vcpu)) {
 			r = -EINTR;
 			vcpu->run->exit_reason = KVM_EXIT_INTR;
 			++vcpu->stat.signal_exits;
@@ -7506,11 +7525,15 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	} else
 		WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
 
-	if (kvm_run->immediate_exit)
+	if (kvm_run->immediate_exit) {
 		r = -EINTR;
-	else
+	} else {
 		r = vcpu_run(vcpu);
 
+		if (kvm_x86_ops->prepare_exit_user)
+			kvm_x86_ops->prepare_exit_user(vcpu);
+	}
+
 out:
 	kvm_put_guest_fpu(vcpu);
 	post_kvm_run_save(vcpu);
-- 
2.7.4
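
For reference, the userspace interaction this patch assumes looks roughly
like the sketch below: another VMM thread kicks the vCPU thread with a
signal, KVM_RUN returns -EINTR (KVM_EXIT_INTR), and whatever state
userspace then reads back is plain L1 state. This is only an illustration,
not part of the patch; SIG_VCPU_KICK and the function names are made up.

#include <errno.h>
#include <pthread.h>
#include <signal.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

#define SIG_VCPU_KICK SIGUSR1	/* made-up choice of kick signal */

static void kick_handler(int sig)
{
	/* Empty on purpose: the signal only has to interrupt KVM_RUN. */
	(void)sig;
}

/* Run once in the vCPU thread; no SA_RESTART so KVM_RUN returns -EINTR. */
void setup_vcpu_kick(void)
{
	struct sigaction sa = { .sa_handler = kick_handler };

	sigaction(SIG_VCPU_KICK, &sa, NULL);
}

/* Called from another thread to preempt KVM_RUN and get the vCPU back. */
void kick_vcpu(pthread_t vcpu_thread)
{
	pthread_kill(vcpu_thread, SIG_VCPU_KICK);
}

void vcpu_run_loop(int vcpu_fd)
{
	for (;;) {
		if (ioctl(vcpu_fd, KVM_RUN, 0) < 0 && errno == EINTR) {
			struct kvm_sregs sregs;

			/*
			 * With this patch the state visible here is always
			 * L1 state, even if L2 happened to be running.
			 */
			ioctl(vcpu_fd, KVM_GET_SREGS, &sregs);
			/* ... KVM_GET_REGS, KVM_GET_MSRS, save/migrate ... */
			continue;
		}
		/* Otherwise handle the exit_reason from the mmap'ed kvm_run. */
	}
}

The important bit is that the kick signal is installed without SA_RESTART,
so a pending signal actually forces KVM_RUN back out to userspace.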