On 2013-03-12 13:58, Jan Kiszka wrote:
> On 2013-03-12 13:06, Paolo Bonzini wrote:
>> On 12/03/2013 12:44, Jan Kiszka wrote:
>>> A VCPU sending INIT or SIPI to some other VCPU races for setting the
>>> remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED
>>> was overwritten by kvm_emulate_halt and, thus, got lost.
>>>
>>> This introduces APIC events for those two signals, keeping them in
>>> kvm_apic until kvm_apic_accept_events is run over the target vcpu
>>> context. kvm_apic_has_events reports to kvm_arch_vcpu_runnable if there
>>> are pending events, thus if vcpu blocking should end.
>>>
>>> The patch comes with the side effect of effectively obsoleting
>>> KVM_MP_STATE_SIPI_RECEIVED. We still accept it from user space, but
>>> immediately translate it to KVM_MP_STATE_INIT_RECEIVED + KVM_APIC_SIPI.
>>> The vcpu itself will no longer enter the KVM_MP_STATE_SIPI_RECEIVED
>>> state. That also means we no longer exit to user space after receiving a
>>> SIPI event.
>>>
>>> Signed-off-by: Jan Kiszka <jan.kiszka@xxxxxxxxxxx>
>>> ---
>>>
>>> This doesn't fix the wrong behaviour on INIT for the BSP yet. Leaving
>>> this honor to Paolo.
>>>
>>> I didn't try porting any INIT handling for nested on top yet but I
>>> think it should be feasible - once we know their semantics for sure, at
>>> least on Intel...
>>>
>>>  arch/x86/include/asm/kvm_host.h |  1 +
>>>  arch/x86/kvm/lapic.c            | 41 +++++++++++++++++++++++++++++++-------
>>>  arch/x86/kvm/lapic.h            |  7 ++++++
>>>  arch/x86/kvm/x86.c              | 39 ++++++++++++++++++++----------------
>>>  4 files changed, 63 insertions(+), 25 deletions(-)
>>>
>>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>>> index 348d859..2d28e76 100644
>>> --- a/arch/x86/include/asm/kvm_host.h
>>> +++ b/arch/x86/include/asm/kvm_host.h
>>> @@ -1002,6 +1002,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
>>>  int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
>>>  int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
>>>  int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
>>> +void kvm_vcpu_reset(struct kvm_vcpu *vcpu);
>>>
>>>  void kvm_define_shared_msr(unsigned index, u32 msr);
>>>  void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
>>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
>>> index 02b51dd..4a21a6b 100644
>>> --- a/arch/x86/kvm/lapic.c
>>> +++ b/arch/x86/kvm/lapic.c
>>> @@ -731,7 +731,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
>>>  	case APIC_DM_INIT:
>>>  		if (!trig_mode || level) {
>>>  			result = 1;
>>> -			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
>>> +			set_bit(KVM_APIC_INIT, &apic->pending_events);
>>
>> I think this should clear pending SIPIs, unless KVM_APIC_INIT was
>> already set in which case it should be a no-op. Something like:
>>
>>     e = apic->pending_events;
>>     while (!(e & KVM_APIC_INIT))
>>         e = cmpxchg(&apic->pending_events, e,
>>                     (e | KVM_APIC_INIT) & ~KVM_APIC_SIPI);
>>
>> If you do this, better make pending_events an atomic_t.
> OK (looks ugly but necessary).
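
For the record, an untested sketch of that loop as a helper - the
function name here is made up, and since KVM_APIC_INIT/KVM_APIC_SIPI
are bit numbers in this patch, they need to become masks first:

	/*
	 * Atomically set INIT and clear any stale SIPI; no-op when INIT
	 * is already pending, so a SIPI sent after that INIT survives.
	 */
	static void kvm_apic_set_pending_init(struct kvm_lapic *apic)
	{
		unsigned long e = apic->pending_events;

		while (!(e & (1UL << KVM_APIC_INIT)))
			e = cmpxchg(&apic->pending_events, e,
				    (e | (1UL << KVM_APIC_INIT)) &
				    ~(1UL << KVM_APIC_SIPI));
	}
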
>
>>
>>>  			kvm_make_request(KVM_REQ_EVENT, vcpu);
>>>  			kvm_vcpu_kick(vcpu);
>>>  		} else {
>>> @@ -743,13 +743,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
>>>  	case APIC_DM_STARTUP:
>>>  		apic_debug("SIPI to vcpu %d vector 0x%02x\n",
>>>  			   vcpu->vcpu_id, vector);
>>> -		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
>>> -			result = 1;
>>> -			vcpu->arch.sipi_vector = vector;
>>> -			vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
>>> -			kvm_make_request(KVM_REQ_EVENT, vcpu);
>>> -			kvm_vcpu_kick(vcpu);
>>> -		}
>>> +		result = 1;
>>> +		apic->sipi_vector = vector;
>>> +		/* make sure sipi_vector is visible for the receiver */
>>> +		smp_wmb();
>>> +		set_bit(KVM_APIC_SIPI, &apic->pending_events);
>>> +		kvm_make_request(KVM_REQ_EVENT, vcpu);
>>> +		kvm_vcpu_kick(vcpu);
>>>  		break;
>>>
>>>  	case APIC_DM_EXTINT:
>>> @@ -1860,6 +1860,31 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data)
>>>  					 addr);
>>>  }
>>>
>>> +bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
>>> +{
>>> +	return kvm_vcpu_has_lapic(vcpu) && vcpu->arch.apic->pending_events;
>>> +}
>>> +
>>> +void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
>>> +{
>>> +	struct kvm_lapic *apic = vcpu->arch.apic;
>>> +
>>> +	if (!kvm_vcpu_has_lapic(vcpu))
>>> +		return;
>>> +
>>> +	if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events))
>>> +		vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
>>> +	if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events) &&
>>> +	    vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
>>> +		vcpu->arch.sipi_vector = apic->sipi_vector;
>>> +		pr_debug("vcpu %d received sipi with vector # %x\n",
>>> +			 vcpu->vcpu_id, vcpu->arch.sipi_vector);
>>> +		kvm_lapic_reset(vcpu);
>>> +		kvm_vcpu_reset(vcpu);
>>> +		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
>>> +	}
>>> +}
>>> +
>>>  void kvm_lapic_init(void)
>>>  {
>>>  	/* do not patch jump label more than once per second */
>>> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
>>> index 1676d34..ef3f4ef 100644
>>> --- a/arch/x86/kvm/lapic.h
>>> +++ b/arch/x86/kvm/lapic.h
>>> @@ -5,6 +5,9 @@
>>>
>>>  #include <linux/kvm_host.h>
>>>
>>> +#define KVM_APIC_INIT		0
>>> +#define KVM_APIC_SIPI		1
>>> +
>>>  struct kvm_timer {
>>>  	struct hrtimer timer;
>>>  	s64 period;		/* unit: ns */
>>> @@ -32,13 +35,17 @@ struct kvm_lapic {
>>>  	void *regs;
>>>  	gpa_t vapic_addr;
>>>  	struct page *vapic_page;
>>> +	unsigned long pending_events;
>>> +	unsigned int sipi_vector;
>>>  };
>>>  int kvm_create_lapic(struct kvm_vcpu *vcpu);
>>>  void kvm_free_lapic(struct kvm_vcpu *vcpu);
>>>
>>>  int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
>>> +bool kvm_apic_has_events(struct kvm_vcpu *vcpu);
>>>  int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
>>>  int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
>>> +void kvm_apic_accept_events(struct kvm_vcpu *vcpu);
>>>  void kvm_lapic_reset(struct kvm_vcpu *vcpu);
>>>  u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
>>>  void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>>> index b891ac3..a0b8041 100644
>>> --- a/arch/x86/kvm/x86.c
>>> +++ b/arch/x86/kvm/x86.c
>>> @@ -162,8 +162,6 @@ u64 __read_mostly host_xcr0;
>>>
>>>  static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
>>>
>>> -static void kvm_vcpu_reset(struct kvm_vcpu *vcpu);
>>> -
>>>  static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
>>>  {
>>>  	int i;
>>> @@ -5663,6 +5661,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>>>  	bool req_immediate_exit = 0;
>>>
>>>  	if (vcpu->requests) {
>>> +		kvm_apic_accept_events(vcpu);
>>> +		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
>>> +			r = 1;
>>> +			goto out;
>>> +		}
>>>  		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
>>>  			kvm_mmu_unload(vcpu);
>>>  		if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
>>> @@ -5847,14 +5850,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
>>>  	int r;
>>>  	struct kvm *kvm = vcpu->kvm;
>>>
>>> -	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
>>> -		pr_debug("vcpu %d received sipi with vector # %x\n",
>>> -			 vcpu->vcpu_id, vcpu->arch.sipi_vector);
>>> -		kvm_lapic_reset(vcpu);
>>> -		kvm_vcpu_reset(vcpu);
>>> -		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
>>> -	}
>>> -
>>>  	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
>>>  	r = vapic_enter(vcpu);
>>>  	if (r) {
>>> @@ -5871,8 +5866,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
>>>  			srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
>>>  			kvm_vcpu_block(vcpu);
>>>  			vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
>>> -			if (kvm_check_request(KVM_REQ_UNHALT, vcpu))
>>> -			{
>>> +			if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) {
>>> +				kvm_apic_accept_events(vcpu);
>>>  				switch(vcpu->arch.mp_state) {
>>>  				case KVM_MP_STATE_HALTED:
>>>  					vcpu->arch.mp_state =
>>> @@ -5880,7 +5875,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
>>>  				case KVM_MP_STATE_RUNNABLE:
>>>  					vcpu->arch.apf.halted = false;
>>>  					break;
>>> -				case KVM_MP_STATE_SIPI_RECEIVED:
>>> +				case KVM_MP_STATE_INIT_RECEIVED:
>>> +					break;
>>>  				default:
>>>  					r = -EINTR;
>>>  					break;
>>> @@ -5901,7 +5897,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
>>>  			++vcpu->stat.request_irq_exits;
>>>  		}
>>>
>>> -		kvm_check_async_pf_completion(vcpu);
>>> +		if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED)
>>> +			kvm_check_async_pf_completion(vcpu);
>>
>> Can you instead do another change that affects the conditions of
>> kvm_check_async_pf_completion?
>>
>> For example, should kvm_arch_interrupt_allowed return zero if the VCPU
>> is in the INIT_RECEIVED state?
> Yeah, that probably makes sense beyond async_pf.

Wait: If you perform a proper reset on INIT already, we would clear IF
and thus also prevent async_pf injections. On the other hand,
kvm_arch_can_inject_async_page_present returns true if apf.msr_val &
KVM_ASYNC_PF_ENABLED is not set - shouldn't that be cleared on reset as
well? Hmm...

Jan

-- 
Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
Corporate Competence Center Embedded Linux
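
P.S.: To make the direction Paolo suggests concrete, an untested sketch
against the current kvm_arch_interrupt_allowed (whether mp_state can be
read safely here from all calling contexts is left open):

	int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
	{
		/*
		 * A VCPU in wait-for-SIPI state must not take interrupts;
		 * this would also gate async_pf completion injection.
		 */
		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
			return 0;

		return kvm_x86_ops->interrupt_allowed(vcpu);
	}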