On Tue, Apr 10, 2012 at 08:59:21PM +0300, Gleb Natapov wrote: > Heh, I was working on that too. > > On Tue, Apr 10, 2012 at 05:26:18PM +0300, Michael S. Tsirkin wrote: > > On Tue, Apr 10, 2012 at 05:03:22PM +0300, Avi Kivity wrote: > > > On 04/10/2012 04:27 PM, Michael S. Tsirkin wrote: > > > > I took a stub at implementing PV EOI using shared memory. > > > > This should reduce the number of exits an interrupt > > > > causes as much as by half. > > > > > > > > A partially complete draft for both host and guest parts > > > > is below. > > > > > > > > The idea is simple: there's a bit, per APIC, in guest memory, > > > > that tells the guest that it does not need EOI. > > > > We set it before injecting an interrupt and clear > > > > before injecting a nested one. Guest tests it using > > > > a test and clear operation - this is necessary > > > > so that host can detect interrupt nesting - > > > > and if set, it can skip the EOI MSR. > > > > > > > > There's a new MSR to set the address of said register > > > > in guest memory. Otherwise not much changes: > > > > - Guest EOI is not required > > > > - ISR is automatically cleared before injection > > > > > > > > Some things are incomplete: add feature negotiation > > > > options, qemu support for said options. > > > > No testing was done beyond compiling the kernel. > > > > > > > > I would appreciate early feedback. > > > > > > > > Signed-off-by: Michael S. Tsirkin <mst@xxxxxxxxxx> > > > > > > > > -- > > > > > > > > diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h > > > > index d854101..8430f41 100644 > > > > --- a/arch/x86/include/asm/apic.h > > > > +++ b/arch/x86/include/asm/apic.h > > > > @@ -457,8 +457,13 @@ static inline u32 safe_apic_wait_icr_idle(void) { return 0; } > > > > > > > > #endif /* CONFIG_X86_LOCAL_APIC */ > > > > > > > > +DECLARE_EARLY_PER_CPU(unsigned long, apic_eoi); > > > > + > > > > static inline void ack_APIC_irq(void) > > > > { > > > > + if (__test_and_clear_bit(0, &__get_cpu_var(apic_eoi))) > > > > + return; > > > > + > > > > > > While __test_and_clear_bit() is implemented in a single instruction, > > > it's not required to be. Better have the instruction there explicitly. > > > > > > > /* > > > > * ack_APIC_irq() actually gets compiled as a single instruction > > > > * ... yummie. > > > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > > > > index e216ba0..0ee1472 100644 > > > > --- a/arch/x86/include/asm/kvm_host.h > > > > +++ b/arch/x86/include/asm/kvm_host.h > > > > @@ -481,6 +481,12 @@ struct kvm_vcpu_arch { > > > > u64 length; > > > > u64 status; > > > > } osvw; > > > > + > > > > + struct { > > > > + u64 msr_val; > > > > + struct gfn_to_hva_cache data; > > > > + int vector; > > > > + } eoi; > > > > }; > > > > > > Needs to be cleared on INIT. > > > > You mean kvm_arch_vcpu_reset? > > > > > > > > > > > > > > @@ -307,6 +308,9 @@ void __cpuinit kvm_guest_cpu_init(void) > > > > smp_processor_id()); > > > > } > > > > > > > > + wrmsrl(MSR_KVM_EOI_EN, __pa(this_cpu_ptr(apic_eoi)) | > > > > + MSR_KVM_EOI_ENABLED); > > > > + > > > > > > Clear on kexec. > > > > With register_reboot_notifier? > > > > > > if (has_steal_clock) > > > > kvm_register_steal_time(); > > > > } > > > > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c > > > > index 8584322..9e38e12 100644 > > > > --- a/arch/x86/kvm/lapic.c > > > > +++ b/arch/x86/kvm/lapic.c > > > > @@ -265,7 +265,61 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq) > > > > irq->level, irq->trig_mode); > > > > } > > > > > > > > -static inline int apic_find_highest_isr(struct kvm_lapic *apic) > > > > +static int eoi_put_user(struct kvm_vcpu *vcpu, u32 val) > > > > +{ > > > > + > > > > + return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.eoi.data, &val, > > > > + sizeof(val)); > > > > +} > > > > + > > > > +static int eoi_get_user(struct kvm_vcpu *vcpu, u32 *val) > > > > +{ > > > > + > > > > + return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.eoi.data, val, > > > > + sizeof(*val)); > > > > +} > > > > + > > > > +static inline bool eoi_enabled(struct kvm_vcpu *vcpu) > > > > +{ > > > > + return (vcpu->arch.eoi.msr_val & MSR_KVM_EOI_ENABLED); > > > > +} > > > > + > > > > +static int eoi_get_pending_vector(struct kvm_vcpu *vcpu) > > > > +{ > > > > + u32 val; > > > > + if (eoi_get_user(vcpu, &val) < 0) > > > > + apic_debug("Can't read EOI MSR value: 0x%llx\n", > > > > + (unsigned long long)vcpi->arch.eoi.msr_val); > > > > + if (!(val & 0x1)) > > > > + vcpu->arch.eoi.vector = -1; > > > > + return vcpu->arch.eoi.vector; > > > > +} > > > > + > > > > +static void eoi_set_pending_vector(struct kvm_vcpu *vcpu, int vector) > > > > +{ > > > > + BUG_ON(vcpu->arch.eoi.vector != -1); > > > > + if (eoi_put_user(vcpu, 0x1) < 0) { > > > > + apic_debug("Can't set EOI MSR value: 0x%llx\n", > > > > + (unsigned long long)vcpi->arch.eoi.msr_val); > > > > + return; > > > > + } > > > > + vcpu->arch.eoi.vector = vector; > > > > +} > > > > + > > > > +static int eoi_clr_pending_vector(struct kvm_vcpu *vcpu) > > > > +{ > > > > + int vector; > > > > + vector = vcpu->arch.eoi.vector; > > > > + if (vector != -1 && eoi_put_user(vcpu, 0x0) < 0) { > > > > + apic_debug("Can't clear EOI MSR value: 0x%llx\n", > > > > + (unsigned long long)vcpi->arch.eoi.msr_val); > > > > + return -1; > > > > + } > > > > + vcpu->arch.eoi.vector = -1; > > > > + return vector; > > > > +} > > > > > > > > > > > > > + > > > > +static inline int __apic_find_highest_isr(struct kvm_lapic *apic) > > > > { > > > > int result; > > > > > > > > @@ -275,6 +329,17 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic) > > > > return result; > > > > } > > > > > > > > +static inline int apic_find_highest_isr(struct kvm_lapic *apic) > > > > +{ > > > > + int vector; > > > > + if (eoi_enabled(apic->vcpu)) { > > > > + vector = eoi_get_pending_vector(apic->vcpu); > > > > + if (vector != -1) > > > > + return vector; > > > > + } > > > > + return __apic_find_highest_isr(apic); > > > > +} > > > > > > Why aren't you modifying the ISR unconfitionally? > > > > ISR is not set if there won't be an EOI > > since it's EOI that normally clears it. > > > > > > + > > > > static void apic_update_ppr(struct kvm_lapic *apic) > > > > { > > > > u32 tpr, isrv, ppr, old_ppr; > > > > @@ -488,6 +553,8 @@ static void apic_set_eoi(struct kvm_lapic *apic) > > > > if (vector == -1) > > > > return; > > > > > > > > + if (eoi_enabled(apic->vcpu)) > > > > + eoi_clr_pending_vector(apic->vcpu); > > > > apic_clear_vector(vector, apic->regs + APIC_ISR); > > > > apic_update_ppr(apic); > > > > > > > > @@ -1236,11 +1303,25 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) > > > > { > > > > int vector = kvm_apic_has_interrupt(vcpu); > > > > struct kvm_lapic *apic = vcpu->arch.apic; > > > > + bool set_isr = true; > > > > > > > > if (vector == -1) > > > > return -1; > > > > > > > > - apic_set_vector(vector, apic->regs + APIC_ISR); > > > > + if (eoi_enabled(vcpu)) { > > > > + /* Anything pending? If yes disable eoi optimization. */ > > > > + if (unlikely(apic_find_highest_isr(apic) >= 0)) { > > > > + int v = eoi_clr_pending_vector(vcpu); > > > > > > ISR != pending, that's IRR. If ISR vector has lower priority than the > > > new vector, then we don't need to disable eoi avoidance. > > > > Yes. But we can and it's easier than figuring out priorities. > > I am guessing such collisions are rare, right? > > I'll add a trace to make sure. > > > > > > + if (v != -1) > > > > + apic_set_vector(v, apic->regs + APIC_ISR); > > > > + } else { > > > > + eoi_set_pending_vector(vcpu, vector); > > > > + set_isr = false; > > > > > > Weird. Just set it normally. Remember that reading the ISR needs to > > > return the correct value. > > > > Marcelo said linux does not normally read ISR - not true? > > Note this has no effect if the PV optimization is not enabled. > > > > > We need to process the avoided EOI before any APIC read/writes, to be > > > sure the guest sees the updated values. Same for IOAPIC, EOI affects > > > remote_irr. That may been we need to sample it after every exit, or > > > perhaps disable the feature for level-triggered interrupts. > > > > Disabling would be very sad. Can we sample on remote irr read? > > > Nothing sad about it, just correct. MS requires this to be disabled for > level triggered interrupts. We don't try to match what HV does 100% anyway. > We have to notify IOAPIC about EOI ASAP. It > may hold another interrupt for us that has to be delivered. You mean the ack notifiers? We can skip just for the vectors which have ack notifiers or only if there are no notifiers. > I was going to avoid most of the trickery in apic code and just check if > avoided EOI should be processed on each exit. This adds one if on exit > path instead of couple on interrupt injection path though. > > > > > + } > > > > + } > > > > + > > > > + if (set_isr) > > > > + apic_set_vector(vector, apic->regs + APIC_ISR); > > > > apic_update_ppr(apic); > > > > apic_clear_irr(vector, apic); > > > > return vector; > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > > > > > > > > > -- > > > error compiling committee.c: too many arguments to function > > -- > > To unsubscribe from this list: send the line "unsubscribe kvm" in > > the body of a message to majordomo@xxxxxxxxxxxxxxx > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > -- > Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html