Il 13/03/2013 12:42, Jan Kiszka ha scritto: > A VCPU sending INIT or SIPI to some other VCPU races for setting the > remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED > was overwritten by kvm_emulate_halt and, thus, got lost. > > This introduces APIC events for those two signals, keeping them in > kvm_apic until kvm_apic_accept_events is run over the target vcpu > context. kvm_apic_has_events reports to kvm_arch_vcpu_runnable if there > are pending events, thus if vcpu blocking should end. > > The patch comes with the side effect of effectively obsoleting > KVM_MP_STATE_SIPI_RECEIVED. We still accept it from user space, but > immediately translate it to KVM_MP_STATE_INIT_RECEIVED + KVM_APIC_SIPI. > The vcpu itself will no longer enter the KVM_MP_STATE_SIPI_RECEIVED > state. That also means we no longer exit to user space after receiving a > SIPI event. > > Furthermore, we already reset the VCPU on INIT, only fixing up the code > segment later on when SIPI arrives. Moreover, we fix INIT handling for > the BSP: it never enter wait-for-SIPI but directly starts over on INIT. > > Signed-off-by: Jan Kiszka <jan.kiszka@xxxxxxxxxxx> > --- > > Changes in v2: > - drop cmpxchg from INIT signaling > - rmb on SIPI processing > - reset sets all CPUs to 0xf000:0xfff0, RIP is fixed up on SIPI reception > > arch/x86/include/asm/kvm_host.h | 3 +- > arch/x86/kvm/lapic.c | 48 +++++++++++++++++++++++++++----- > arch/x86/kvm/lapic.h | 11 +++++++ > arch/x86/kvm/svm.c | 6 ---- > arch/x86/kvm/vmx.c | 12 +------- > arch/x86/kvm/x86.c | 58 +++++++++++++++++++++++++------------- > 6 files changed, 93 insertions(+), 45 deletions(-) > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > index 348d859..ef7f4a5 100644 > --- a/arch/x86/include/asm/kvm_host.h > +++ b/arch/x86/include/asm/kvm_host.h > @@ -345,7 +345,6 @@ struct kvm_vcpu_arch { > unsigned long apic_attention; > int32_t apic_arb_prio; > int mp_state; > - int sipi_vector; > u64 ia32_misc_enable_msr; > bool tpr_access_reporting; > > @@ -819,6 +818,7 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); > > void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); > int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); > +void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector); > > int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, > int reason, bool has_error_code, u32 error_code); > @@ -1002,6 +1002,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); > int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); > int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); > int kvm_cpu_get_interrupt(struct kvm_vcpu *v); > +void kvm_vcpu_reset(struct kvm_vcpu *vcpu); > > void kvm_define_shared_msr(unsigned index, u32 msr); > void kvm_set_shared_msr(unsigned index, u64 val, u64 mask); > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c > index 02b51dd..a8e9369 100644 > --- a/arch/x86/kvm/lapic.c > +++ b/arch/x86/kvm/lapic.c > @@ -731,7 +731,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, > case APIC_DM_INIT: > if (!trig_mode || level) { > result = 1; > - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; > + /* assumes that there are only KVM_APIC_INIT/SIPI */ > + apic->pending_events = (1UL << KVM_APIC_INIT); > + /* make sure pending_events is visible before sending > + * the request */ > + smp_wmb(); > kvm_make_request(KVM_REQ_EVENT, vcpu); > kvm_vcpu_kick(vcpu); > } else { > @@ -743,13 +747,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, > case APIC_DM_STARTUP: > apic_debug("SIPI to vcpu %d vector 0x%02x\n", > vcpu->vcpu_id, vector); > - if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { > - result = 1; > - vcpu->arch.sipi_vector = vector; > - vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; > - kvm_make_request(KVM_REQ_EVENT, vcpu); > - kvm_vcpu_kick(vcpu); > - } > + result = 1; > + apic->sipi_vector = vector; > + /* make sure sipi_vector is visible for the receiver */ > + smp_wmb(); > + set_bit(KVM_APIC_SIPI, &apic->pending_events); > + kvm_make_request(KVM_REQ_EVENT, vcpu); > + kvm_vcpu_kick(vcpu); > break; > > case APIC_DM_EXTINT: > @@ -1860,6 +1864,34 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) > addr); > } > > +void kvm_apic_accept_events(struct kvm_vcpu *vcpu) > +{ > + struct kvm_lapic *apic = vcpu->arch.apic; > + unsigned int sipi_vector; > + > + if (!kvm_vcpu_has_lapic(vcpu)) > + return; > + > + if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events)) { > + kvm_lapic_reset(vcpu); > + kvm_vcpu_reset(vcpu); > + if (kvm_vcpu_is_bsp(apic->vcpu)) > + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; > + else > + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; > + } > + if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events) && > + vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { > + /* evaluate pending_events before reading the vector */ > + smp_rmb(); > + sipi_vector = apic->sipi_vector; > + pr_debug("vcpu %d received sipi with vector # %x\n", > + vcpu->vcpu_id, sipi_vector); > + kvm_vcpu_deliver_sipi_vector(vcpu, sipi_vector); > + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; > + } > +} > + > void kvm_lapic_init(void) > { > /* do not patch jump label more than once per second */ > diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h > index 1676d34..2c721b9 100644 > --- a/arch/x86/kvm/lapic.h > +++ b/arch/x86/kvm/lapic.h > @@ -5,6 +5,9 @@ > > #include <linux/kvm_host.h> > > +#define KVM_APIC_INIT 0 > +#define KVM_APIC_SIPI 1 > + > struct kvm_timer { > struct hrtimer timer; > s64 period; /* unit: ns */ > @@ -32,6 +35,8 @@ struct kvm_lapic { > void *regs; > gpa_t vapic_addr; > struct page *vapic_page; > + unsigned long pending_events; > + unsigned int sipi_vector; > }; > int kvm_create_lapic(struct kvm_vcpu *vcpu); > void kvm_free_lapic(struct kvm_vcpu *vcpu); > @@ -39,6 +44,7 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu); > int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); > int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); > int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); > +void kvm_apic_accept_events(struct kvm_vcpu *vcpu); > void kvm_lapic_reset(struct kvm_vcpu *vcpu); > u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); > void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); > @@ -158,4 +164,9 @@ void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu, > struct kvm_lapic_irq *irq, > u64 *eoi_bitmap); > > +static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) > +{ > + return vcpu->arch.apic->pending_events; > +} > + > #endif > diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c > index 907e428..7219a40 100644 > --- a/arch/x86/kvm/svm.c > +++ b/arch/x86/kvm/svm.c > @@ -1199,12 +1199,6 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu) > > init_vmcb(svm); > > - if (!kvm_vcpu_is_bsp(vcpu)) { > - kvm_rip_write(vcpu, 0); > - svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; > - svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; > - } > - > kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); > kvm_register_write(vcpu, VCPU_REGS_RDX, eax); > } > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > index f17cd2a..b73989d 100644 > --- a/arch/x86/kvm/vmx.c > +++ b/arch/x86/kvm/vmx.c > @@ -4119,12 +4119,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) > vmx_segment_cache_clear(vmx); > > seg_setup(VCPU_SREG_CS); > - if (kvm_vcpu_is_bsp(&vmx->vcpu)) > - vmcs_write16(GUEST_CS_SELECTOR, 0xf000); > - else { > - vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); > - vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); > - } > + vmcs_write16(GUEST_CS_SELECTOR, 0xf000); > > seg_setup(VCPU_SREG_DS); > seg_setup(VCPU_SREG_ES); > @@ -4147,10 +4142,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) > vmcs_writel(GUEST_SYSENTER_EIP, 0); > > vmcs_writel(GUEST_RFLAGS, 0x02); > - if (kvm_vcpu_is_bsp(&vmx->vcpu)) > - kvm_rip_write(vcpu, 0xfff0); > - else > - kvm_rip_write(vcpu, 0); > + kvm_rip_write(vcpu, 0xfff0); > > vmcs_writel(GUEST_GDTR_BASE, 0); > vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index b891ac3..a7e51ae 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -162,8 +162,6 @@ u64 __read_mostly host_xcr0; > > static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); > > -static void kvm_vcpu_reset(struct kvm_vcpu *vcpu); > - > static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) > { > int i; > @@ -2823,10 +2821,9 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, > events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); > events->nmi.pad = 0; > > - events->sipi_vector = vcpu->arch.sipi_vector; > + events->sipi_vector = 0; /* never valid when reporting to user space */ > > events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING > - | KVM_VCPUEVENT_VALID_SIPI_VECTOR > | KVM_VCPUEVENT_VALID_SHADOW); > memset(&events->reserved, 0, sizeof(events->reserved)); > } > @@ -2857,8 +2854,9 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, > vcpu->arch.nmi_pending = events->nmi.pending; > kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked); > > - if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) > - vcpu->arch.sipi_vector = events->sipi_vector; > + if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR && > + kvm_vcpu_has_lapic(vcpu)) > + vcpu->arch.apic->sipi_vector = events->sipi_vector; > > kvm_make_request(KVM_REQ_EVENT, vcpu); > > @@ -5713,6 +5711,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) > } > > if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { > + kvm_apic_accept_events(vcpu); > + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { > + r = 1; > + goto out; > + } > + > inject_pending_event(vcpu); > > /* enable NMI/IRQ window open exits if needed */ > @@ -5847,14 +5851,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) > int r; > struct kvm *kvm = vcpu->kvm; > > - if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { > - pr_debug("vcpu %d received sipi with vector # %x\n", > - vcpu->vcpu_id, vcpu->arch.sipi_vector); > - kvm_lapic_reset(vcpu); > - kvm_vcpu_reset(vcpu); > - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; > - } > - > vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); > r = vapic_enter(vcpu); > if (r) { > @@ -5871,8 +5867,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) > srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); > kvm_vcpu_block(vcpu); > vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); > - if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) > - { > + if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) { > + kvm_apic_accept_events(vcpu); > switch(vcpu->arch.mp_state) { > case KVM_MP_STATE_HALTED: > vcpu->arch.mp_state = > @@ -5880,7 +5876,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) > case KVM_MP_STATE_RUNNABLE: > vcpu->arch.apf.halted = false; > break; > - case KVM_MP_STATE_SIPI_RECEIVED: > + case KVM_MP_STATE_INIT_RECEIVED: > + break; > default: > r = -EINTR; > break; > @@ -6015,6 +6012,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) > > if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { > kvm_vcpu_block(vcpu); > + kvm_apic_accept_events(vcpu); > clear_bit(KVM_REQ_UNHALT, &vcpu->requests); > r = -EAGAIN; > goto out; > @@ -6171,6 +6169,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, > int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, > struct kvm_mp_state *mp_state) > { > + kvm_apic_accept_events(vcpu); > mp_state->mp_state = vcpu->arch.mp_state; > return 0; > } > @@ -6178,7 +6177,15 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, > int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, > struct kvm_mp_state *mp_state) > { > - vcpu->arch.mp_state = mp_state->mp_state; > + if (!kvm_vcpu_has_lapic(vcpu) && > + mp_state->mp_state != KVM_MP_STATE_RUNNABLE) > + return -EINVAL; > + > + if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { > + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; > + set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events); > + } else > + vcpu->arch.mp_state = mp_state->mp_state; > kvm_make_request(KVM_REQ_EVENT, vcpu); > return 0; > } > @@ -6515,7 +6522,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) > kvm_x86_ops->vcpu_free(vcpu); > } > > -static void kvm_vcpu_reset(struct kvm_vcpu *vcpu) > +void kvm_vcpu_reset(struct kvm_vcpu *vcpu) > { > atomic_set(&vcpu->arch.nmi_queued, 0); > vcpu->arch.nmi_pending = 0; > @@ -6545,6 +6552,17 @@ static void kvm_vcpu_reset(struct kvm_vcpu *vcpu) > kvm_x86_ops->vcpu_reset(vcpu); > } > > +void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector) > +{ > + struct kvm_segment cs; > + > + kvm_get_segment(vcpu, &cs, VCPU_SREG_CS); > + cs.selector = vector << 8; > + cs.base = vector << 12; > + kvm_set_segment(vcpu, &cs, VCPU_SREG_CS); > + kvm_rip_write(vcpu, 0); > +} > + > int kvm_arch_hardware_enable(void *garbage) > { > struct kvm *kvm; > @@ -6988,7 +7006,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) > return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && > !vcpu->arch.apf.halted) > || !list_empty_careful(&vcpu->async_pf.done) > - || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED > + || kvm_apic_has_events(vcpu) > || atomic_read(&vcpu->arch.nmi_queued) || > (kvm_arch_interrupt_allowed(vcpu) && > kvm_cpu_has_interrupt(vcpu)); > With the following hack to inject an INIT and corresponding QEMU changes: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2609e7b99..efdab35 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6182,8 +6182,16 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events); - } else + } else { + if (kvm_vcpu_has_lapic(vcpu)) { + if (mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED && + kvm_vcpu_is_bsp(vcpu)) + vcpu->arch.apic->pending_events = 1UL << KVM_APIC_INIT; + else + vcpu->arch.apic->pending_events = 0; + } vcpu->arch.mp_state = mp_state->mp_state; + } kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; } ... this patch passes the init.flat kvm-unit-test. Tested-by: Paolo Bonzini <pbonzini@xxxxxxxxxx> Paolo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html