Implementation of PV EOI using shared memory. This reduces the number of exits an interrupt causes as much as by half. The idea is simple: there's a bit, per APIC, in guest memory, that tells the guest that it does not need EOI. We set it before injecting an interrupt and clear before injecting a nested one. Guest tests it using a test and clear operation - this is necessary so that host can detect interrupt nesting - and if set, it can skip the EOI MSR. There's a new MSR to set the address of said register in guest memory. Otherwise not much changed: - Guest EOI is not required - Register is tested & ISR is automatically cleared on exit For testing results see description of previous patch 'kvm_para: guest side for eoi avoidance'. Signed-off-by: Michael S. Tsirkin <mst@xxxxxxxxxx> --- arch/x86/include/asm/kvm_host.h | 12 ++++ arch/x86/kvm/cpuid.c | 1 + arch/x86/kvm/irq.c | 2 +- arch/x86/kvm/lapic.c | 137 +++++++++++++++++++++++++++++++++++++-- arch/x86/kvm/lapic.h | 2 + arch/x86/kvm/trace.h | 34 ++++++++++ arch/x86/kvm/x86.c | 5 ++ 7 files changed, 187 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 32297f2..3ded418 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -174,6 +174,13 @@ enum { /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 +/* + * The following bit is set with PV-EOI, unset on EOI. + * We detect PV-EOI changes by guest by comparing + * this bit with PV-EOI in guest memory. + * See the implementation in apic_update_pv_eoi. + */ +#define KVM_APIC_PV_EOI_PENDING 1 /* * We don't want allocation failures within the mmu code, so we preallocate @@ -485,6 +492,11 @@ struct kvm_vcpu_arch { u64 length; u64 status; } osvw; + + struct { + u64 msr_val; + struct gfn_to_hva_cache data; + } pv_eoi; }; struct kvm_lpage_info { diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index eba8345..a9f155a 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -408,6 +408,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, (1 << KVM_FEATURE_NOP_IO_DELAY) | (1 << KVM_FEATURE_CLOCKSOURCE2) | (1 << KVM_FEATURE_ASYNC_PF) | + (1 << KVM_FEATURE_PV_EOI) | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); if (sched_info_on()) diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 7e06ba1..07bdfab 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -48,7 +48,7 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v) if (!irqchip_in_kernel(v->kvm)) return v->arch.interrupt.pending; - if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */ + if (kvm_apic_has_interrupt(v) < 0) { /* LAPIC */ if (kvm_apic_accept_pic_intr(v)) { s = pic_irqchip(v->kvm); /* PIC */ return s->output; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 93c1574..b4f7013 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -270,6 +270,54 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq) irq->level, irq->trig_mode); } +static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) +{ + + return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val, + sizeof(val)); +} + +static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val) +{ + + return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val, + sizeof(*val)); +} + +static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED; +} + +static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu) +{ + u8 val; + if (pv_eoi_get_user(vcpu, &val) < 0) + apic_debug("Can't read EOI MSR value: 0x%llx\n", + (unsigned long long)vcpi->arch.pv_eoi.msr_val); + return val & 0x1; +} + +static void pv_eoi_set_pending(struct kvm_vcpu *vcpu) +{ + if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) { + apic_debug("Can't set EOI MSR value: 0x%llx\n", + (unsigned long long)vcpi->arch.pv_eoi.msr_val); + return; + } + __set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); +} + +static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) +{ + if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) { + apic_debug("Can't clear EOI MSR value: 0x%llx\n", + (unsigned long long)vcpi->arch.pv_eoi.msr_val); + return; + } + __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); +} + static inline int apic_find_highest_isr(struct kvm_lapic *apic) { int result; @@ -482,16 +530,26 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; } -static void apic_set_eoi(struct kvm_lapic *apic) +static int apic_set_eoi(struct kvm_lapic *apic) { int vector = apic_find_highest_isr(apic); + + trace_kvm_eoi(apic, vector); + /* * Not every write EOI will has corresponding ISR, * one example is when Kernel check timer on setup_IO_APIC */ if (vector == -1) - return; + return vector; + /* + * It's legal for guest to ignore the PV EOI optimization + * and signal EOI by APIC write. If this happens, clear + * PV EOI on guest's behalf. + */ + if (pv_eoi_enabled(apic->vcpu)) + pv_eoi_clr_pending(apic->vcpu); apic_clear_vector(vector, apic->regs + APIC_ISR); apic_update_ppr(apic); @@ -505,6 +563,7 @@ static void apic_set_eoi(struct kvm_lapic *apic) kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); } kvm_make_request(KVM_REQ_EVENT, apic->vcpu); + return vector; } static void apic_send_ipi(struct kvm_lapic *apic) @@ -1085,6 +1144,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) atomic_set(&apic->lapic_timer.pending, 0); if (kvm_vcpu_is_bsp(vcpu)) vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; + vcpu->arch.pv_eoi.msr_val = 0; apic_update_ppr(apic); vcpu->arch.apic_arb_prio = 0; @@ -1211,9 +1271,10 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) apic_update_ppr(apic); highest_irr = apic_find_highest_irr(apic); - if ((highest_irr == -1) || - ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI))) + if (highest_irr == -1) return -1; + if (((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI))) + return -2; return highest_irr; } @@ -1245,9 +1306,20 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) int vector = kvm_apic_has_interrupt(vcpu); struct kvm_lapic *apic = vcpu->arch.apic; - if (vector == -1) + /* Detect interrupt nesting and disable EOI optimization */ + if (pv_eoi_enabled(vcpu) && vector == -2) + pv_eoi_clr_pending(vcpu); + + if (vector < 0) return -1; + if (pv_eoi_enabled(vcpu)) { + if (kvm_ioapic_handles_vector(vcpu->kvm, vector)) + pv_eoi_clr_pending(vcpu); + else + pv_eoi_set_pending(vcpu); + } + apic_set_vector(vector, apic->regs + APIC_ISR); apic_update_ppr(apic); apic_clear_irr(vector, apic); @@ -1262,6 +1334,18 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) MSR_IA32_APICBASE_BASE; kvm_apic_set_version(vcpu); + if (pv_eoi_enabled(apic->vcpu)) { + /* + * Update shadow bit in apic_attention bitmask + * from guest memory. + */ + if (pv_eoi_get_pending(apic->vcpu)) + __set_bit(KVM_APIC_PV_EOI_PENDING, + &vcpu->arch.apic_attention); + else + __clear_bit(KVM_APIC_PV_EOI_PENDING, + &vcpu->arch.apic_attention); + } apic_update_ppr(apic); hrtimer_cancel(&apic->lapic_timer.timer); update_divide_count(apic); @@ -1283,11 +1367,42 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } +/* + * apic_update_pv_eoi - called on vmexit + * + * Detect whether guest triggered PV EOI since the + * last entry. If yes, set EOI on guests's behalf. + */ +static void apic_update_pv_eoi(struct kvm_lapic *apic) +{ + /* + * PV EOI state is derived from KVM_APIC_PV_EOI_PENDING in host + * and KVM_PV_EOI_ENABLED in guest memory as follows: + * + * KVM_APIC_PV_EOI_PENDING is unset: + * -> host disabled PV EOI. + * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is set: + * -> host enabled PV EOI, guest did not execute EOI yet. + * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is unset: + * -> host enabled PV EOI, guest executed EOI. + */ + int vector; + BUG_ON(!pv_eoi_enabled(apic->vcpu)); + if (pv_eoi_get_pending(apic->vcpu)) + return; + __clear_bit(KVM_APIC_PV_EOI_PENDING, &apic->vcpu->arch.apic_attention); + vector = apic_set_eoi(apic); + trace_kvm_pv_eoi(apic, vector); +} + void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) { u32 data; void *vapic; + if (test_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention)) + apic_update_pv_eoi(vcpu->arch.apic); + if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; @@ -1394,3 +1509,15 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) return 0; } + +int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) +{ + u64 addr = data & ~KVM_MSR_ENABLED; + if (pv_eoi_enabled(vcpu)) + pv_eoi_clr_pending(vcpu); + vcpu->arch.pv_eoi.msr_val = data; + if (!pv_eoi_enabled(vcpu)) + return 0; + return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, + addr); +} diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 6f4ce25..149ef01 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -60,4 +60,6 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) { return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; } + +int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); #endif diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 911d264..851914e 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -517,6 +517,40 @@ TRACE_EVENT(kvm_apic_accept_irq, __entry->coalesced ? " (coalesced)" : "") ); +TRACE_EVENT(kvm_eoi, + TP_PROTO(struct kvm_lapic *apic, int vector), + TP_ARGS(apic, vector), + + TP_STRUCT__entry( + __field( __u32, apicid ) + __field( int, vector ) + ), + + TP_fast_assign( + __entry->apicid = apic->vcpu->vcpu_id; + __entry->vector = vector; + ), + + TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector) +); + +TRACE_EVENT(kvm_pv_eoi, + TP_PROTO(struct kvm_lapic *apic, int vector), + TP_ARGS(apic, vector), + + TP_STRUCT__entry( + __field( __u32, apicid ) + __field( int, vector ) + ), + + TP_fast_assign( + __entry->apicid = apic->vcpu->vcpu_id; + __entry->vector = vector; + ), + + TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector) +); + /* * Tracepoint for nested VMRUN */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 185a2b8..4d77af0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -795,6 +795,7 @@ static u32 msrs_to_save[] = { MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, + MSR_KVM_PV_EOI_EN, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_STAR, #ifdef CONFIG_X86_64 @@ -1653,6 +1654,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); break; + case MSR_KVM_PV_EOI_EN: + if (kvm_lapic_enable_pv_eoi(vcpu, data)) + return 1; + break; case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: -- MST -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html