On 22/09/20 07:23, yadong.qi@xxxxxxxxx wrote: > From: Yadong Qi <yadong.qi@xxxxxxxxx> > > Background: We have a lightweight HV, it needs INIT-VMExit and > SIPI-VMExit to wake-up APs for guests since it does not monitor > the Local APIC. But currently virtual wait-for-SIPI(WFS) state > is not supported in nVMX, so when running on top of KVM, the L1 > HV cannot receive the INIT-VMExit and SIPI-VMExit, which means > the L2 guest cannot wake up the APs. > > According to Intel SDM Chapter 25.2 Other Causes of VM Exits, > SIPIs cause VM exits when a logical processor is in > wait-for-SIPI state. > > In this patch: > 1. introduce SIPI exit reason, > 2. introduce wait-for-SIPI state for nVMX, > 3. advertise wait-for-SIPI support to guest. > > When the L1 hypervisor is not monitoring the Local APIC, L0 needs to emulate > INIT-VMExit and SIPI-VMExit to L1 to emulate INIT-SIPI-SIPI for > L2. An L2 LAPIC write would be trapped by the L0 Hypervisor (KVM); L0 should > emulate the INIT/SIPI vmexit to the L1 hypervisor to set the proper state > for L2's vcpu state. > > Handling procedure: > Source vCPU: > L2 write LAPIC.ICR(INIT). > L0 trap LAPIC.ICR write(INIT): inject a latched INIT event to target > vCPU. > Target vCPU: > L0 emulate an INIT VMExit to L1 if it is in guest mode. > L1 set guest VMCS, guest_activity_state=WAIT_SIPI, vmresume. > L0 set vcpu.mp_state to INIT_RECEIVED if (vmcs12.guest_activity_state > == WAIT_SIPI). > > Source vCPU: > L2 write LAPIC.ICR(SIPI). > L0 trap LAPIC.ICR write(SIPI): inject a latched SIPI event to target > vCPU. > Target vCPU: > L0 emulate an SIPI VMExit to L1 if (vcpu.mp_state == INIT_RECEIVED). > L1 set CS:IP, guest_activity_state=ACTIVE, vmresume. > L0 resume to L2. > L2 start-up. Again, this looks good but it needs testcases. 
Thanks, Paolo > Signed-off-by: Yadong Qi <yadong.qi@xxxxxxxxx> > --- > arch/x86/include/asm/vmx.h | 1 + > arch/x86/include/uapi/asm/vmx.h | 2 ++ > arch/x86/kvm/lapic.c | 5 ++-- > arch/x86/kvm/vmx/nested.c | 53 ++++++++++++++++++++++++--------- > 4 files changed, 45 insertions(+), 16 deletions(-) > > diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h > index cd7de4b401fe..bff06dc64c52 100644 > --- a/arch/x86/include/asm/vmx.h > +++ b/arch/x86/include/asm/vmx.h > @@ -113,6 +113,7 @@ > #define VMX_MISC_PREEMPTION_TIMER_RATE_MASK 0x0000001f > #define VMX_MISC_SAVE_EFER_LMA 0x00000020 > #define VMX_MISC_ACTIVITY_HLT 0x00000040 > +#define VMX_MISC_ACTIVITY_WAIT_SIPI 0x00000100 > #define VMX_MISC_ZERO_LEN_INS 0x40000000 > #define VMX_MISC_MSR_LIST_MULTIPLIER 512 > > diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h > index b8ff9e8ac0d5..ada955c5ebb6 100644 > --- a/arch/x86/include/uapi/asm/vmx.h > +++ b/arch/x86/include/uapi/asm/vmx.h > @@ -32,6 +32,7 @@ > #define EXIT_REASON_EXTERNAL_INTERRUPT 1 > #define EXIT_REASON_TRIPLE_FAULT 2 > #define EXIT_REASON_INIT_SIGNAL 3 > +#define EXIT_REASON_SIPI_SIGNAL 4 > > #define EXIT_REASON_INTERRUPT_WINDOW 7 > #define EXIT_REASON_NMI_WINDOW 8 > @@ -94,6 +95,7 @@ > { EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \ > { EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \ > { EXIT_REASON_INIT_SIGNAL, "INIT_SIGNAL" }, \ > + { EXIT_REASON_SIPI_SIGNAL, "SIPI_SIGNAL" }, \ > { EXIT_REASON_INTERRUPT_WINDOW, "INTERRUPT_WINDOW" }, \ > { EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \ > { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \ > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c > index 5ccbee7165a2..d04ac7dc6adf 100644 > --- a/arch/x86/kvm/lapic.c > +++ b/arch/x86/kvm/lapic.c > @@ -2839,7 +2839,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) > > /* > * INITs are latched while CPU is in specific states > - * (SMM, VMX non-root mode, SVM with GIF=0). > + * (SMM, SVM with GIF=0). 
> * Because a CPU cannot be in these states immediately > * after it has processed an INIT signal (and thus in > * KVM_MP_STATE_INIT_RECEIVED state), just eat SIPIs > @@ -2847,7 +2847,8 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) > */ > if (kvm_vcpu_latch_init(vcpu)) { > WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED); > - if (test_bit(KVM_APIC_SIPI, &apic->pending_events)) > + if (test_bit(KVM_APIC_SIPI, &apic->pending_events) && > + !is_guest_mode(vcpu)) > clear_bit(KVM_APIC_SIPI, &apic->pending_events); > return; > } > diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c > index 1bb6b31eb646..fe3bb68df987 100644 > --- a/arch/x86/kvm/vmx/nested.c > +++ b/arch/x86/kvm/vmx/nested.c > @@ -2946,7 +2946,8 @@ static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, > static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) > { > if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && > - vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)) > + vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT && > + vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI)) > return -EINVAL; > > return 0; > @@ -3543,19 +3544,29 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) > */ > nested_cache_shadow_vmcs12(vcpu, vmcs12); > > - /* > - * If we're entering a halted L2 vcpu and the L2 vcpu won't be > - * awakened by event injection or by an NMI-window VM-exit or > - * by an interrupt-window VM-exit, halt the vcpu. 
> - */ > - if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) && > - !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && > - !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_NMI_WINDOW_EXITING) && > - !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_INTR_WINDOW_EXITING) && > - (vmcs12->guest_rflags & X86_EFLAGS_IF))) { > + switch (vmcs12->guest_activity_state) { > + case GUEST_ACTIVITY_HLT: > + /* > + * If we're entering a halted L2 vcpu and the L2 vcpu won't be > + * awakened by event injection or by an NMI-window VM-exit or > + * by an interrupt-window VM-exit, halt the vcpu. > + */ > + if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && > + !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) && > + !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) && > + (vmcs12->guest_rflags & X86_EFLAGS_IF))) { > + vmx->nested.nested_run_pending = 0; > + return kvm_vcpu_halt(vcpu); > + } > + break; > + case GUEST_ACTIVITY_WAIT_SIPI: > vmx->nested.nested_run_pending = 0; > - return kvm_vcpu_halt(vcpu); > + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; > + break; > + default: > + break; > } > + > return 1; > > vmentry_failed: > @@ -3781,7 +3792,20 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) > return -EBUSY; > nested_vmx_update_pending_dbg(vcpu); > clear_bit(KVM_APIC_INIT, &apic->pending_events); > - nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); > + if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED) > + nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); > + return 0; > + } > + > + if (lapic_in_kernel(vcpu) && > + test_bit(KVM_APIC_SIPI, &apic->pending_events)) { > + if (block_nested_events) > + return -EBUSY; > + > + clear_bit(KVM_APIC_SIPI, &apic->pending_events); > + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) > + nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0, > + apic->sipi_vector & 0xFFUL); > return 0; > } > > @@ -6471,7 +6495,8 @@ void nested_vmx_setup_ctls_msrs(struct 
nested_vmx_msrs *msrs, u32 ept_caps) > msrs->misc_low |= > MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | > VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | > - VMX_MISC_ACTIVITY_HLT; > + VMX_MISC_ACTIVITY_HLT | > + VMX_MISC_ACTIVITY_WAIT_SIPI; > msrs->misc_high = 0; > > /* >