On 2013-03-12 14:12, Gleb Natapov wrote:
> On Tue, Mar 12, 2013 at 12:44:41PM +0100, Jan Kiszka wrote:
>> A VCPU sending INIT or SIPI to some other VCPU races for setting the
>> remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED
>> was overwritten by kvm_emulate_halt and, thus, got lost.
>>
>> This introduces APIC events for those two signals, keeping them in
>> kvm_apic until kvm_apic_accept_events is run over the target vcpu
>> context. kvm_apic_has_events reports to kvm_arch_vcpu_runnable if there
>> are pending events, and thus whether vcpu blocking should end.
>>
>> The patch comes with the side effect of effectively obsoleting
>> KVM_MP_STATE_SIPI_RECEIVED. We still accept it from user space, but
>> immediately translate it to KVM_MP_STATE_INIT_RECEIVED + KVM_APIC_SIPI.
>> The vcpu itself will no longer enter the KVM_MP_STATE_SIPI_RECEIVED
>> state. That also means we no longer exit to user space after receiving a
>> SIPI event.
>>
>> Signed-off-by: Jan Kiszka <jan.kiszka@xxxxxxxxxxx>
>> ---
>>
>> This doesn't fix the wrong behaviour on INIT for the BSP yet. Leaving
>> this honor to Paolo.
>>
>> I didn't try porting any INIT handling for nested on top yet, but I
>> think it should be feasible - once we know the semantics for sure, at
>> least on Intel...
>>
>>  arch/x86/include/asm/kvm_host.h |    1 +
>>  arch/x86/kvm/lapic.c            |   41 +++++++++++++++++++++++++++++++-------
>>  arch/x86/kvm/lapic.h            |    7 ++++++
>>  arch/x86/kvm/x86.c              |   39 ++++++++++++++++++++----------------
>>  4 files changed, 63 insertions(+), 25 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index 348d859..2d28e76 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -1002,6 +1002,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
>>  int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
>>  int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
>>  int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
>> +void kvm_vcpu_reset(struct kvm_vcpu *vcpu);
>>
>>  void kvm_define_shared_msr(unsigned index, u32 msr);
>>  void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
>> index 02b51dd..4a21a6b 100644
>> --- a/arch/x86/kvm/lapic.c
>> +++ b/arch/x86/kvm/lapic.c
>> @@ -731,7 +731,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
>>  	case APIC_DM_INIT:
>>  		if (!trig_mode || level) {
>>  			result = 1;
>> -			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
>> +			set_bit(KVM_APIC_INIT, &apic->pending_events);
>>  			kvm_make_request(KVM_REQ_EVENT, vcpu);
>>  			kvm_vcpu_kick(vcpu);
>>  		} else {
>> @@ -743,13 +743,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
>>  	case APIC_DM_STARTUP:
>>  		apic_debug("SIPI to vcpu %d vector 0x%02x\n",
>>  			   vcpu->vcpu_id, vector);
>> -		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
>> -			result = 1;
>> -			vcpu->arch.sipi_vector = vector;
>> -			vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
>> -			kvm_make_request(KVM_REQ_EVENT, vcpu);
>> -			kvm_vcpu_kick(vcpu);
>> -		}
>> +		result = 1;
>> +		apic->sipi_vector = vector;
>> +		/* make sure sipi_vector is visible for the receiver */
>> +		smp_wmb();
>> +		set_bit(KVM_APIC_SIPI, &apic->pending_events);
>> +		kvm_make_request(KVM_REQ_EVENT, vcpu);
>> +		kvm_vcpu_kick(vcpu);
>>  		break;
>>
>>  	case APIC_DM_EXTINT:
>> @@ -1860,6 +1860,31 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data)
>>  					 addr);
>>  }
>>
>> +bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
>> +{
>> +	return kvm_vcpu_has_lapic(vcpu) && vcpu->arch.apic->pending_events;
> The function is called only from kvm_arch_vcpu_runnable() and
> kvm_arch_vcpu_runnable() is called only if irqchip is in kernel, so I
> think kvm_vcpu_has_lapic() can be dropped.

Likely, will check again.
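If it can indeed be dropped, the function would reduce to something like
this minimal sketch (untested, and assuming kvm_arch_vcpu_runnable()
really stays the only caller and is only reached with an in-kernel
irqchip):

bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
{
	/*
	 * No kvm_vcpu_has_lapic() check - only valid as long as the
	 * in-kernel irqchip, and thus the lapic, is guaranteed here.
	 */
	return vcpu->arch.apic->pending_events;
}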
>
>> +}
>> +
>> +void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
>> +{
>> +	struct kvm_lapic *apic = vcpu->arch.apic;
>> +
>> +	if (!kvm_vcpu_has_lapic(vcpu))
>> +		return;
>> +
>> +	if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events))
>> +		vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
>> +	if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events) &&
>> +	    vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
>> +		vcpu->arch.sipi_vector = apic->sipi_vector;
>> +		pr_debug("vcpu %d received sipi with vector # %x\n",
>> +			 vcpu->vcpu_id, vcpu->arch.sipi_vector);
>> +		kvm_lapic_reset(vcpu);
>> +		kvm_vcpu_reset(vcpu);
> Shouldn't we reset on KVM_APIC_INIT?

It is reset a few lines above. Yes, there could be another INIT pending
by now, but that is racing with SIPI anyway and could also arrive a few
instructions later. So I don't think we need to worry.

>
>> +		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
>> +	}
>> +}
>> +
>>  void kvm_lapic_init(void)
>>  {
>>  	/* do not patch jump label more than once per second */
>> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
>> index 1676d34..ef3f4ef 100644
>> --- a/arch/x86/kvm/lapic.h
>> +++ b/arch/x86/kvm/lapic.h
>> @@ -5,6 +5,9 @@
>>
>>  #include <linux/kvm_host.h>
>>
>> +#define KVM_APIC_INIT		0
>> +#define KVM_APIC_SIPI		1
>> +
>>  struct kvm_timer {
>>  	struct hrtimer timer;
>>  	s64 period;		/* unit: ns */
>> @@ -32,13 +35,17 @@ struct kvm_lapic {
>>  	void *regs;
>>  	gpa_t vapic_addr;
>>  	struct page *vapic_page;
>> +	unsigned long pending_events;
>> +	unsigned int sipi_vector;
>>  };
>>  int kvm_create_lapic(struct kvm_vcpu *vcpu);
>>  void kvm_free_lapic(struct kvm_vcpu *vcpu);
>>
>>  int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
>> +bool kvm_apic_has_events(struct kvm_vcpu *vcpu);
>>  int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
>>  int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
>> +void kvm_apic_accept_events(struct kvm_vcpu *vcpu);
>>  void kvm_lapic_reset(struct kvm_vcpu *vcpu);
>>  u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
>>  void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index b891ac3..a0b8041 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -162,8 +162,6 @@ u64 __read_mostly host_xcr0;
>>
>>  static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
>>
>> -static void kvm_vcpu_reset(struct kvm_vcpu *vcpu);
>> -
>>  static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
>>  {
>>  	int i;
>> @@ -5663,6 +5661,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>>  	bool req_immediate_exit = 0;
>>
>>  	if (vcpu->requests) {
>> +		kvm_apic_accept_events(vcpu);
>> +		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
>> +			r = 1;
>> +			goto out;
>> +		}
> Why not call it under kvm_check_request(KVM_REQ_EVENT, vcpu) below?

I probably wanted to handle this event and the potential exit reason
early. Not sure if it makes any difference.
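Just for illustration, the variant you suggest would look roughly like
this untested sketch, with the call moved into the existing event
handling branch of vcpu_enter_guest() (assuming that branch still begins
with inject_pending_event() as in the current code):

	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
		kvm_apic_accept_events(vcpu);
		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
			r = 1;
			goto out;
		}
		inject_pending_event(vcpu);
		/* remaining event injection continues unchanged */
	}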
>
>>  		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
>>  			kvm_mmu_unload(vcpu);
>>  		if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
>> @@ -5847,14 +5850,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
>>  	int r;
>>  	struct kvm *kvm = vcpu->kvm;
>>
>> -	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
>> -		pr_debug("vcpu %d received sipi with vector # %x\n",
>> -			 vcpu->vcpu_id, vcpu->arch.sipi_vector);
>> -		kvm_lapic_reset(vcpu);
>> -		kvm_vcpu_reset(vcpu);
>> -		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
>> -	}
>> -
>>  	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
>>  	r = vapic_enter(vcpu);
>>  	if (r) {
>> @@ -5871,8 +5866,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
>>  			srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
>>  			kvm_vcpu_block(vcpu);
>>  			vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
>> -			if (kvm_check_request(KVM_REQ_UNHALT, vcpu))
>> -			{
>> +			if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) {
>> +				kvm_apic_accept_events(vcpu);
>>  				switch(vcpu->arch.mp_state) {
>>  				case KVM_MP_STATE_HALTED:
>>  					vcpu->arch.mp_state =
>> @@ -5880,7 +5875,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
>>  						KVM_MP_STATE_RUNNABLE;
>>  				case KVM_MP_STATE_RUNNABLE:
>>  					vcpu->arch.apf.halted = false;
>>  					break;
>> -				case KVM_MP_STATE_SIPI_RECEIVED:
>> +				case KVM_MP_STATE_INIT_RECEIVED:
>> +					break;
>>  				default:
>>  					r = -EINTR;
>>  					break;
>> @@ -5901,7 +5897,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
>>  			++vcpu->stat.request_irq_exits;
>>  		}
>>
>> -		kvm_check_async_pf_completion(vcpu);
>> +		if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED)
>> +			kvm_check_async_pf_completion(vcpu);
> I do not understand why this 'if' is needed.

To prevent spurious async-pf event injection and mp_state corruption.
See the other mail thread with Paolo on how this could potentially be
avoided.

Jan

--
Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
Corporate Competence Center Embedded Linux