On Sun, Apr 15, 2012 at 07:18:58PM +0300, Michael S. Tsirkin wrote: > I got lots of useful feedback from v0 so I thought > sending out a brain dump again would be a good idea. > This is mainly to show how I'm trying to address the > comments I got from the previous round. Flames/feedback > are welcome! > > Changes from v0: > - Tweaked setup MSRs a bit > - Keep ISR bit set. Before reading ISR, test EOI in guest memory and clear > - Check priority for nested interrupts, we can > enable optimization if new one is high priority > - Disable optimization for any interrupt handled by ioapic > (This is because ioapic handles notifiers and pic and it generally gets messy. > It's possible that we can optimize some ioapic-handled > edge interrupts - but is it worth it?) > - A less intrusive change for guest apic (0 overhead without kvm) > > --- > > I took a stab at implementing PV EOI using shared memory. > This should reduce the number of exits an interrupt > causes as much as by half. > > A partially complete draft for both host and guest parts > is below. > > The idea is simple: there's a bit, per APIC, in guest memory, > that tells the guest that it does not need EOI. > We set it before injecting an interrupt and clear > before injecting a nested one. Guest tests it using > a test and clear operation - this is necessary > so that host can detect interrupt nesting - > and if set, it can skip the EOI MSR. > > There's a new MSR to set the address of said register > in guest memory. Otherwise not much changed: > - Guest EOI is not required > - Register is tested, ISR is automatically cleared before injection > > qemu support is incomplete - mostly for feature negotiation. > need to add some trace points to enable profiling. > No testing was done beyond compiling the kernel. > > Signed-off-by: Michael S. 
Tsirkin <mst@xxxxxxxxxx> > > diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h > index b97596e..c9c70ea 100644 > --- a/arch/x86/include/asm/bitops.h > +++ b/arch/x86/include/asm/bitops.h > @@ -26,11 +26,13 @@ > #if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1) > /* Technically wrong, but this avoids compilation errors on some gcc > versions. */ > -#define BITOP_ADDR(x) "=m" (*(volatile long *) (x)) > +#define BITOP_ADDR_CONSTRAINT "=m" > #else > -#define BITOP_ADDR(x) "+m" (*(volatile long *) (x)) > +#define BITOP_ADDR_CONSTRAINT "+m" > #endif > > +#define BITOP_ADDR(x) BITOP_ADDR_CONSTRAINT (*(volatile long *) (x)) > + > #define ADDR BITOP_ADDR(addr) > > /* > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > index e216ba0..3d09ef1 100644 > --- a/arch/x86/include/asm/kvm_host.h > +++ b/arch/x86/include/asm/kvm_host.h > @@ -481,6 +481,12 @@ struct kvm_vcpu_arch { > u64 length; > u64 status; > } osvw; > + > + struct { > + u64 msr_val; > + struct gfn_to_hva_cache data; > + bool pending; > + } eoi; > }; > > struct kvm_lpage_info { > diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h > index 734c376..164376a 100644 > --- a/arch/x86/include/asm/kvm_para.h > +++ b/arch/x86/include/asm/kvm_para.h > @@ -22,6 +22,7 @@ > #define KVM_FEATURE_CLOCKSOURCE2 3 > #define KVM_FEATURE_ASYNC_PF 4 > #define KVM_FEATURE_STEAL_TIME 5 > +#define KVM_FEATURE_EOI 6 > > /* The last 8 bits are used to indicate how to interpret the flags field > * in pvclock structure. If no bits are set, all flags are ignored. > @@ -37,6 +38,8 @@ > #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 > #define MSR_KVM_ASYNC_PF_EN 0x4b564d02 > #define MSR_KVM_STEAL_TIME 0x4b564d03 > +#define MSR_KVM_EOI_EN 0x4b564d04 > +#define MSR_KVM_EOI_DISABLED 0x0L This is a valid gpa. Follow other MSRs' example, i.e. align the address to, let's say, a dword, and use the lsb as an enable bit. Other low bits are reserved (#GP if set). 
And please move defines that do not define new MSRs to some other place. > > struct kvm_steal_time { > __u64 steal; > diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c > index b8ba6e4..450aae4 100644 > --- a/arch/x86/kernel/kvm.c > +++ b/arch/x86/kernel/kvm.c > @@ -39,6 +39,7 @@ > #include <asm/desc.h> > #include <asm/tlbflush.h> > #include <asm/idle.h> > +#include <asm/apic.h> > > static int kvmapf = 1; > > @@ -290,6 +291,33 @@ static void kvm_register_steal_time(void) > cpu, __pa(st)); > } > > +/* TODO: needs to be early? aligned? */ > +static DEFINE_EARLY_PER_CPU(u8, apic_eoi, 0); > + > +/* Our own copy of __test_and_clear_bit to make sure > + * it is done with a single instruction */ > +static inline int kvm_test_and_clear_bit(int nr, volatile u8* addr) > +{ > + int oldbit; > + > + asm volatile("btr %2,%1\n\t" > + "sbb %0,%0" > + : "=r" (oldbit), > + BITOP_ADDR_CONSTRAINT (*(volatile u8 *) (addr)) > + : "Ir" (nr)); > + return oldbit; > +} > + > +static void (*kvm_guest_native_apic_write)(u32 reg, u32 val); > +static void kvm_guest_apic_write(u32 reg, u32 val) > +{ > + if (reg == APIC_EOI && > + kvm_test_and_clear_bit(0, &__get_cpu_var(apic_eoi))) > + return; > + > + kvm_guest_native_apic_write(reg, val); > +} > + > void __cpuinit kvm_guest_cpu_init(void) > { > if (!kvm_para_available()) > @@ -307,11 +335,18 @@ void __cpuinit kvm_guest_cpu_init(void) > smp_processor_id()); > } > > + > + if (kvm_para_has_feature(KVM_FEATURE_EOI)) { > + kvm_guest_native_apic_write = apic->write; > + apic->write = kvm_guest_apic_write; > + wrmsrl(MSR_KVM_EOI_EN, __pa(&__get_cpu_var(apic_eoi))); > + } This can happen on more than one cpu. After it happens on two, kvm_guest_apic_write() will call itself. 
> + > if (has_steal_clock) > kvm_register_steal_time(); > } > > -static void kvm_pv_disable_apf(void *unused) > +static void kvm_pv_disable_apf(void) > { > if (!__get_cpu_var(apf_reason).enabled) > return; > @@ -323,11 +358,18 @@ static void kvm_pv_disable_apf(void *unused) > smp_processor_id()); > } > > +static void kvm_pv_guest_cpu_reboot(void *unused) > +{ > + if (kvm_para_has_feature(KVM_FEATURE_EOI)) > + wrmsrl(MSR_KVM_EOI_EN, MSR_KVM_EOI_DISABLED); Shouldn't it zero apic_eoi? > + kvm_pv_disable_apf(); > +} > + > static int kvm_pv_reboot_notify(struct notifier_block *nb, > unsigned long code, void *unused) > { > if (code == SYS_RESTART) > - on_each_cpu(kvm_pv_disable_apf, NULL, 1); > + on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1); > return NOTIFY_DONE; > } > > @@ -378,7 +420,9 @@ static void __cpuinit kvm_guest_cpu_online(void *dummy) > static void kvm_guest_cpu_offline(void *dummy) > { > kvm_disable_steal_time(); > - kvm_pv_disable_apf(NULL); > + if (kvm_para_has_feature(KVM_FEATURE_EOI)) > + wrmsrl(MSR_KVM_EOI_EN, MSR_KVM_EOI_DISABLED); Same. 
> + kvm_pv_disable_apf(); > apf_task_wake_all(); > } > > diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c > index 9fed5be..7d00d2d 100644 > --- a/arch/x86/kvm/cpuid.c > +++ b/arch/x86/kvm/cpuid.c > @@ -408,6 +408,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, > (1 << KVM_FEATURE_NOP_IO_DELAY) | > (1 << KVM_FEATURE_CLOCKSOURCE2) | > (1 << KVM_FEATURE_ASYNC_PF) | > + (1 << KVM_FEATURE_EOI) | > (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); > > if (sched_info_on()) > diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c > index 7e06ba1..07bdfab 100644 > --- a/arch/x86/kvm/irq.c > +++ b/arch/x86/kvm/irq.c > @@ -48,7 +48,7 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v) > if (!irqchip_in_kernel(v->kvm)) > return v->arch.interrupt.pending; > > - if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */ > + if (kvm_apic_has_interrupt(v) < 0) { /* LAPIC */ > if (kvm_apic_accept_pic_intr(v)) { > s = pic_irqchip(v->kvm); /* PIC */ > return s->output; > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c > index 992b4ea..c2118ab 100644 > --- a/arch/x86/kvm/lapic.c > +++ b/arch/x86/kvm/lapic.c > @@ -270,6 +270,54 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq) > irq->level, irq->trig_mode); > } > > +static int eoi_put_user(struct kvm_vcpu *vcpu, u8 val) > +{ > + > + return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.eoi.data, &val, > + sizeof(val)); > +} > + > +static int eoi_get_user(struct kvm_vcpu *vcpu, u8 *val) > +{ > + > + return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.eoi.data, val, > + sizeof(*val)); > +} > + > +static inline bool eoi_enabled(struct kvm_vcpu *vcpu) > +{ > + return vcpu->arch.eoi.msr_val != MSR_KVM_EOI_DISABLED; > +} > + > +static bool eoi_get_pending(struct kvm_vcpu *vcpu) > +{ > + u8 val; > + if (eoi_get_user(vcpu, &val) < 0) > + apic_debug("Can't read EOI MSR value: 0x%llx\n", > + (unsigned long long)vcpi->arch.eoi.msr_val); > + return val & 0x1; > +} > + > +static void 
eoi_set_pending(struct kvm_vcpu *vcpu) > +{ > + if (eoi_put_user(vcpu, 0x1) < 0) { > + apic_debug("Can't set EOI MSR value: 0x%llx\n", > + (unsigned long long)vcpi->arch.eoi.msr_val); > + return; > + } > + vcpu->arch.eoi.pending = true; > +} > + > +static void eoi_clr_pending(struct kvm_vcpu *vcpu) > +{ > + if (eoi_put_user(vcpu, 0x0) < 0) { > + apic_debug("Can't clear EOI MSR value: 0x%llx\n", > + (unsigned long long)vcpi->arch.eoi.msr_val); > + return; > + } > + vcpu->arch.eoi.pending = false; > +} > + > static inline int apic_find_highest_isr(struct kvm_lapic *apic) > { > int result; > @@ -280,6 +328,20 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic) > return result; > } > > +static void apic_update_isr(struct kvm_lapic *apic) > +{ > + int vector; > + if (!eoi_enabled(apic->vcpu) || > + !apic->vcpu->arch.eoi.pending || > + eoi_get_pending(apic->vcpu)) > + return; > + apic->vcpu->arch.eoi.pending = false; > + vector = apic_find_highest_isr(apic); > + if (vector == -1) > + return; > + apic_clear_vector(vector, apic->regs + APIC_ISR); > +} > + We should just call apic_set_eoi() on exit if eoi.pending && !eoi_get_pending(). This removes the need for the function and its calls. We already have call to kvm_lapic_sync_from_vapic() on exit path which should be extended to do the above. > static void apic_update_ppr(struct kvm_lapic *apic) > { > u32 tpr, isrv, ppr, old_ppr; > @@ -287,6 +349,7 @@ static void apic_update_ppr(struct kvm_lapic *apic) > > old_ppr = apic_get_reg(apic, APIC_PROCPRI); > tpr = apic_get_reg(apic, APIC_TASKPRI); > + apic_update_isr(apic); > isr = apic_find_highest_isr(apic); > isrv = (isr != -1) ? isr : 0; > > @@ -492,6 +555,8 @@ static void apic_set_eoi(struct kvm_lapic *apic) > if (vector == -1) > return; > > + if (eoi_enabled(apic->vcpu)) > + eoi_clr_pending(apic->vcpu); You have many of those. Just fold eoi_enabled() check into eoi_clr_pending(). 
> apic_clear_vector(vector, apic->regs + APIC_ISR); > apic_update_ppr(apic); > > @@ -1085,6 +1150,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) > atomic_set(&apic->lapic_timer.pending, 0); > if (kvm_vcpu_is_bsp(vcpu)) > vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; > + vcpu->arch.eoi.msr_val = MSR_KVM_EOI_DISABLED; > + vcpu->arch.eoi.pending = false; > apic_update_ppr(apic); > > vcpu->arch.apic_arb_prio = 0; > @@ -1210,9 +1277,10 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) > > apic_update_ppr(apic); > highest_irr = apic_find_highest_irr(apic); > - if ((highest_irr == -1) || > - ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI))) > + if (highest_irr == -1) > return -1; > + if (((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI))) > + return -2; > return highest_irr; > } > > @@ -1244,9 +1312,20 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) > int vector = kvm_apic_has_interrupt(vcpu); > struct kvm_lapic *apic = vcpu->arch.apic; > > - if (vector == -1) > + /* Detect interrupt nesting and disable EOI optimization */ > + if (eoi_enabled(vcpu) && vector == -2) > + eoi_clr_pending(vcpu); > + > + if (vector < 0) > return -1; > > + if (eoi_enabled(vcpu)) { > + if (kvm_ioapic_handles_vector(vcpu->kvm, vector)) > + eoi_clr_pending(vcpu); > + else > + eoi_set_pending(vcpu); > + } > + > apic_set_vector(vector, apic->regs + APIC_ISR); > apic_update_ppr(apic); > apic_clear_irr(vector, apic); > @@ -1261,6 +1340,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) > MSR_IA32_APICBASE_BASE; > kvm_apic_set_version(vcpu); > > + if (eoi_enabled(apic->vcpu)) > + apic->vcpu->arch.eoi.pending = eoi_get_pending(apic->vcpu); > apic_update_ppr(apic); > hrtimer_cancel(&apic->lapic_timer.timer); > update_divide_count(apic); > @@ -1392,3 +1473,18 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) > > return 0; > } > + > +int kvm_pv_enable_apic_eoi(struct kvm_vcpu *vcpu, u64 data) > +{ > + if (data == MSR_KVM_EOI_DISABLED) { > + 
struct kvm_lapic *apic = vcpu->arch.apic; > + if (apic && apic_enabled(apic)) > + apic_update_isr(apic); > + } else if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.eoi.data, > + data)) > + return 1; > + > + vcpu->arch.eoi.msr_val = data; > + vcpu->arch.eoi.pending = false; > + return 0; > +} > diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h > index 6f4ce25..042dace 100644 > --- a/arch/x86/kvm/lapic.h > +++ b/arch/x86/kvm/lapic.h > @@ -60,4 +60,6 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) > { > return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; > } > + > +int kvm_pv_enable_apic_eoi(struct kvm_vcpu *vcpu, u64 data); > #endif > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index 4044ce0..31d6762 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -1627,6 +1627,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) > if (kvm_pv_enable_async_pf(vcpu, data)) > return 1; > break; > + case MSR_KVM_EOI_EN: > + if (kvm_pv_enable_apic_eoi(vcpu, data)) > + return 1; > + break; > case MSR_KVM_STEAL_TIME: > > if (unlikely(!sched_info_on())) > -- > To unsubscribe from this list: send the line "unsubscribe kvm" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html