On Tuesday 03 March 2009 19:20:37 Marcelo Tosatti wrote:
> On Mon, Mar 02, 2009 at 04:19:33PM +0800, Sheng Yang wrote:
> > Gleb fixed the bitmap ops usage in kvm_ioapic_get_delivery_bitmask.
> >
> > Sheng merged the two functions and fixed several issues in
> > kvm_get_intr_delivery_bitmask:
> > 1. deliver_bitmask is a bitmap rather than an unsigned long integer.
> > 2. The lowest-priority target bitmap was calculated incorrectly.
> > 3. Prevent a potential NULL dereference.
> > 4. The declaration in include/linux/kvm_host.h caused a powerpc
> >    compilation warning.
> >
> > Signed-off-by: Gleb Natapov <gleb@xxxxxxxxxx>
> > Signed-off-by: Sheng Yang <sheng@xxxxxxxxxxxxxxx>
> > ---
> >  include/linux/kvm_host.h |    3 ---
> >  virt/kvm/ioapic.c        |   39 ---------------------------------------
> >  virt/kvm/ioapic.h        |    5 +++--
> >  virt/kvm/irq_comm.c      |   42 ++++++++++++++++++++++++++++++++++++++----
> >  4 files changed, 41 insertions(+), 48 deletions(-)
> >
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 3832243..fb60f31 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -363,9 +363,6 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
> >                                        struct kvm_irq_mask_notifier *kimn);
> >  void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask);
> >
> > -void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
> > -                                   union kvm_ioapic_redirect_entry *entry,
> > -                                   unsigned long *deliver_bitmask);
> >  int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level);
> >  void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
> >  void kvm_register_irq_ack_notifier(struct kvm *kvm,
> > diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
> > index 7c2cb2b..2c40ff8 100644
> > --- a/virt/kvm/ioapic.c
> > +++ b/virt/kvm/ioapic.c
> > @@ -161,45 +161,6 @@ static void ioapic_inj_nmi(struct kvm_vcpu *vcpu)
> >          kvm_vcpu_kick(vcpu);
> >  }
> >
> > -void kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
> > -                                    u8 dest_mode, unsigned long *mask)
> > -{
> > -        int i;
> > -        struct kvm *kvm = ioapic->kvm;
> > -        struct kvm_vcpu *vcpu;
> > -
> > -        ioapic_debug("dest %d dest_mode %d\n", dest, dest_mode);
> > -
> > -        *mask = 0;
> > -        if (dest_mode == 0) {        /* Physical mode. */
> > -                if (dest == 0xFF) {        /* Broadcast. */
> > -                        for (i = 0; i < KVM_MAX_VCPUS; ++i)
> > -                                if (kvm->vcpus[i] && kvm->vcpus[i]->arch.apic)
> > -                                        *mask |= 1 << i;
> > -                        return;
> > -                }
> > -                for (i = 0; i < KVM_MAX_VCPUS; ++i) {
> > -                        vcpu = kvm->vcpus[i];
> > -                        if (!vcpu)
> > -                                continue;
> > -                        if (kvm_apic_match_physical_addr(vcpu->arch.apic, dest)) {
> > -                                if (vcpu->arch.apic)
> > -                                        *mask = 1 << i;
> > -                                break;
> > -                        }
> > -                }
> > -        } else if (dest != 0)        /* Logical mode, MDA non-zero. */
> > -                for (i = 0; i < KVM_MAX_VCPUS; ++i) {
> > -                        vcpu = kvm->vcpus[i];
> > -                        if (!vcpu)
> > -                                continue;
> > -                        if (vcpu->arch.apic &&
> > -                            kvm_apic_match_logical_addr(vcpu->arch.apic, dest))
> > -                                *mask |= 1 << vcpu->vcpu_id;
> > -                }
> > -        ioapic_debug("mask %x\n", *mask);
> > -}
> > -
> >  static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
> >  {
> >          union kvm_ioapic_redirect_entry entry = ioapic->redirtbl[irq];
> > diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
> > index 7275f87..c8032ab 100644
> > --- a/virt/kvm/ioapic.h
> > +++ b/virt/kvm/ioapic.h
> > @@ -70,7 +70,8 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
> >  int kvm_ioapic_init(struct kvm *kvm);
> >  int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
> >  void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
> > -void kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
> > -                                    u8 dest_mode, unsigned long *mask);
> > +void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
> > +                                   union kvm_ioapic_redirect_entry *entry,
> > +                                   unsigned long *deliver_bitmask);
> >
> >  #endif
> > diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
> > index d165e05..bb6c5d0 100644
> > --- a/virt/kvm/irq_comm.c
> > +++ b/virt/kvm/irq_comm.c
> > @@ -47,15 +47,49 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
> >                                     union kvm_ioapic_redirect_entry *entry,
> >                                     unsigned long *deliver_bitmask)
> >  {
> > +        int i;
> > +        struct kvm *kvm = ioapic->kvm;
> >          struct kvm_vcpu *vcpu;
> >
> > -        kvm_ioapic_get_delivery_bitmask(ioapic, entry->fields.dest_id,
> > -                                        entry->fields.dest_mode,
> > -                                        deliver_bitmask);
> > +        bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
>
> I think you can drop the zeroing from the callers?

OK...

> > +
> > +        if (entry->fields.dest_mode == 0) {        /* Physical mode. */
> > +                if (entry->fields.dest_id == 0xFF) {        /* Broadcast. */
> > +                        for (i = 0; i < KVM_MAX_VCPUS; ++i)
> > +                                if (kvm->vcpus[i] && kvm->vcpus[i]->arch.apic)
> > +                                        __set_bit(i, deliver_bitmask);
> > +                        return;
> > +                }
>
> The spec says broadcast is not supported with lowest priority delivery
> mode, and that it "must not be configured by software". I wonder what
> happens in HW if you do that.

Um... So you mean we should prohibit this kind of configuration? OK.

> > +                for (i = 0; i < KVM_MAX_VCPUS; ++i) {
> > +                        vcpu = kvm->vcpus[i];
> > +                        if (!vcpu)
> > +                                continue;
> > +                        if (kvm_apic_match_physical_addr(vcpu->arch.apic,
> > +                                        entry->fields.dest_id)) {
> > +                                if (vcpu->arch.apic)
> > +                                        __set_bit(i, deliver_bitmask);
> > +                                break;
> > +                        }
> > +                }
> > +        } else if (entry->fields.dest_id != 0)        /* Logical mode, MDA non-zero. */
> > +                for (i = 0; i < KVM_MAX_VCPUS; ++i) {
> > +                        vcpu = kvm->vcpus[i];
> > +                        if (!vcpu)
> > +                                continue;
> > +                        if (vcpu->arch.apic &&
> > +                            kvm_apic_match_logical_addr(vcpu->arch.apic,
> > +                                        entry->fields.dest_id))
> > +                                __set_bit(i, deliver_bitmask);
> > +                }
> > +
> >          switch (entry->fields.delivery_mode) {
> >          case IOAPIC_LOWEST_PRIORITY:
> > +                /* Select one in deliver_bitmask */
> >                  vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm,
> >                                  entry->fields.vector, deliver_bitmask);
> > +                bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
> > +                if (!vcpu)
> > +                        return;
> >                  __set_bit(vcpu->vcpu_id, deliver_bitmask);
> >                  break;
> >          case IOAPIC_FIXED:
> > @@ -65,7 +99,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
> >                  if (printk_ratelimit())
> >                          printk(KERN_INFO "kvm: unsupported delivery mode %d\n",
> >                                 entry->fields.delivery_mode);
> > -                *deliver_bitmask = 0;
> > +                bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
> >          }
> >  }
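By the way, to make point 1 of the changelog concrete: deliver_bitmask
is passed around as "unsigned long *", but it is a multi-word bitmap,
so it has to be cleared and set through the bitmap helpers instead of
plain integer assignment. A minimal user-space sketch (not the kernel
code; MAX_VCPUS = 128 and set_bit_in_map() are made-up stand-ins for
KVM_MAX_VCPUS and the <linux/bitmap.h> primitives):

#include <stdio.h>
#include <string.h>
#include <limits.h>

#define MAX_VCPUS       128
#define BITS_PER_LONG   (sizeof(unsigned long) * CHAR_BIT)
#define BITMAP_WORDS    ((MAX_VCPUS + BITS_PER_LONG - 1) / BITS_PER_LONG)

/* stand-in for the kernel's __set_bit() */
static void set_bit_in_map(unsigned int nr, unsigned long *map)
{
        map[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

int main(void)
{
        unsigned long map[BITMAP_WORDS];

        /*
         * The old style, "*mask = 0" and "*mask |= 1 << i", only works
         * while MAX_VCPUS <= BITS_PER_LONG: assigning 0 clears just the
         * first word, and "1 << 80" is undefined behaviour.
         *
         * The bitmap style, bitmap_zero(map, MAX_VCPUS) followed by
         * __set_bit(80, map), touches the right word:
         */
        memset(map, 0, sizeof(map));    /* ~ bitmap_zero() */
        set_bit_in_map(80, map);        /* ~ __set_bit()   */

        printf("word 1 = %#lx\n", map[1]);  /* bit 16 of word 1 with 64-bit longs */
        return 0;
}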
> Unrelated question, what was the issue (in detail) which caused
> this change again:

I have dug out my old post (I thought you were also involved :) ):

>[kvm-devel] The SMP RHEL 5.1 PAE guest can't boot up issue
>From: "Yang, Sheng" <sheng.yang@xxxxxxxxx> (Intel)
>To: kvm-devel@xxxxxxxxxxxxxxxxxxxxx
>Date: 2008-02-22 16:57
>
>I believe I have found the root cause of the SMP RHEL5.1 PAE guest
>boot failure. The problem was caused by
>kvm: 6685637b211ad67bdce21bfd9f91bc888b3acb4f
>"KVM: VMX: Ensure vcpu time stamp counter is monotonous". (It didn't
>take me much time to find the fix, but a lot of time to find the
>proper explanation... :( )
>
>As we guessed, the problem was the monotonicity of the TSC. I traced
>into the 2.6.18 PAE guest kernel and finally found that the hang is
>caused by an overflow in the loop in update_wall_time()
>(kernel/timer.c) when TSC is used as the default clocksource.
>
>The reason is that the patch "KVM: VMX: Ensure vcpu time stamp counter
>is monotonous" introduces a big gap between the TSCs of different
>VCPUs (error between their TSC_OFFSETs). Though I have verified that
>the patch does ensure monotonicity on each VCPU (which rejected my
>first theory...), it has two problems:
>
>1. It accumulates the error. Each vcpu's TSC is monotonous, but gets
>slower and slower compared to the host. That's because the TSC is very
>accurate and the interval between the two TSC reads is big. But this
>is not very critical.
>
>2. The critical one. Under normal conditions, VCPU0 is migrated much
>more frequently than the other VCPUs, and the patch adds another
>"delta" (always negative if the host TSC is stable) to TSC_OFFSET on
>each migration. After booting for a while, VCPU0 becomes much slower
>than the others (in my test, VCPU0 was migrated about twice as often
>as the others, and easily ended up more than 100k cycles slower). In
>the guest kernel, the TSC clocksource state is global: the variable
>"cycle_last" may be written with VCPU1's TSC value while the next tick
>runs on VCPU0. Since VCPU0's TSC_OFFSET is smaller than VCPU1's, it's
>possible for "cycle_last" (from VCPU1) to be bigger than the current
>TSC value (from VCPU0) on the next tick. Then "u64 offset =
>clocksource_read() - cycle_last" overflows and causes the "infinite"
>loop. This also explains why Marcelo's patch didn't work - it only
>reduced the rate at which the gap grows.
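To make that overflow concrete, here is a minimal user-space sketch of
the guest-side subtraction (this is not the actual 2.6.18 code; the
cycle values are made up, standing in for TSC reads on two vcpus whose
TSC_OFFSETs have drifted ~100k cycles apart):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* cycle_last was written on vcpu1; the next tick runs on vcpu0,
         * whose TSC (via its smaller TSC_OFFSET) reads ~100k cycles behind. */
        uint64_t cycle_last = 5000000;
        uint64_t now        = 4900000;  /* clocksource_read() on vcpu0 */

        /* guest: u64 offset = clocksource_read() - cycle_last; */
        uint64_t offset = now - cycle_last;

        /* Unsigned subtraction wraps to an enormous value, so the loop
         * that consumes "offset" one tick at a time effectively never
         * terminates - the "infinite" loop described above. */
        printf("offset = %llu\n", (unsigned long long)offset);
        return 0;
}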
>The freezing didn't happen with the userspace IOAPIC, just because the
>qemu APIC doesn't implement a real LOWPRI (or round_robin) selection
>of the CPU for delivery. It chooses VCPU0 every time if possible, so
>CPU1 in the guest never updates cycle_last. :(
>
>This freezing only occurred on RHEL5/5.1 PAE (kernel 2.6.18), because
>they set IO-APIC IRQ0's dest_mask to 0x3 (with 2 vcpus) and dest_mode
>to LOWEST_PRIORITY, so the other vcpus had a chance to modify
>"cycle_last". In contrast, RHEL5/5.1 32e sets IRQ0's dest_mode to
>FIXED, targeting CPU0, and doesn't have this problem. Neither does
>RHEL4 (kernel 2.6.9).
>
>I don't know if the patch is still needed now, since it was posted
>long ago (I don't know which issue it solved). I'd like to post a
>revert patch if necessary.
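For what it's worth, here is a user-space sketch (not the actual qemu
code, nor kvm_get_lowest_prio_vcpu()) of why a round-robin style
lowest-priority pick exposes the TSC skew while qemu's always-VCPU0
behaviour hides it: with round-robin, vcpu1 also handles timer ticks
and writes cycle_last, so a later tick on the slower vcpu0 can read a
TSC value behind cycle_last.

#include <stdio.h>

#define NR_VCPUS 2

static int last_cpu = NR_VCPUS - 1;

/* rotate through the destination mask, the way a lowest-priority
 * arbitration behaves when all vcpus have equal priority */
static int pick_round_robin(unsigned int mask)
{
        int i;

        for (i = 1; i <= NR_VCPUS; i++) {
                int cpu = (last_cpu + i) % NR_VCPUS;
                if (mask & (1u << cpu))
                        return last_cpu = cpu;
        }
        return -1;
}

/* old qemu APIC behaviour: vcpu0 whenever it is a possible target */
static int pick_vcpu0_style(unsigned int mask)
{
        return (mask & 1u) ? 0 : -1;
}

int main(void)
{
        unsigned int mask = 0x3;  /* RHEL5.1 PAE: IRQ0 dest_mask = vcpu0|vcpu1 */
        int tick;

        for (tick = 0; tick < 4; tick++)
                printf("tick %d: round-robin -> vcpu%d, qemu-style -> vcpu%d\n",
                       tick, pick_round_robin(mask), pick_vcpu0_style(mask));
        return 0;
}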
--
regards
Yang, Sheng