>> Currently, we call ioapic_service() immediately when we find the irq is
>> still active during eoi broadcast. But on real hardware there is some delay
>> between the EOI write and irq delivery (system bus latency?), so we need to
>> emulate this behavior. Otherwise, a guest that hasn't registered a proper
>> irq handler would stay in the interrupt routine, as the irq would be
>> re-injected immediately after the guest enables interrupts. The guest then
>> can't move forward and may miss the chance to get a proper irq handler
>> registered (one example is a Windows guest resuming from hibernation).
>>
>> As there is no way to distinguish an unhandled irq from newly raised ones,
>> this patch solves the problem by scheduling a delayed work when the count of
>> irqs injected during eoi broadcast exceeds a threshold value. After this
>> patch, the guest can move forward a little when there is no suitable irq
>> handler yet, in case it registers one very soon, and for a guest with a
>> bad-irq detection routine (such as note_interrupt() in Linux), the bad irq
>> is still recognized as quickly as before.
>>
>> Cc: Michael S. Tsirkin <mst@xxxxxxxxxx>
>> Signed-off-by: Jason Wang <jasowang@xxxxxxxxxx>
>> Signed-off-by: Zhang Haoyu <zhanghy@xxxxxxxxxxx>
>> ---
>>  include/trace/events/kvm.h | 20 ++++++++++++++++++
>>  virt/kvm/ioapic.c          | 51 ++++++++++++++++++++++++++++++++++++++++++++--
>>  virt/kvm/ioapic.h          |  6 ++++++
>>  3 files changed, 75 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
>> index 908925a..b05f688 100644
>> --- a/include/trace/events/kvm.h
>> +++ b/include/trace/events/kvm.h
>> @@ -95,6 +95,26 @@ TRACE_EVENT(kvm_ioapic_set_irq,
>>  		  __entry->coalesced ? " (coalesced)" : "")
>>  );
>>  
>> +TRACE_EVENT(kvm_ioapic_delayed_eoi_inj,
>> +	TP_PROTO(__u64 e),
>> +	TP_ARGS(e),
>> +
>> +	TP_STRUCT__entry(
>> +		__field(	__u64,		e	)
>> +	),
>> +
>> +	TP_fast_assign(
>> +		__entry->e = e;
>> +	),
>> +
>> +	TP_printk("dst %x vec=%u (%s|%s|%s%s)",
>> +		  (u8)(__entry->e >> 56), (u8)__entry->e,
>> +		  __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
>> +		  (__entry->e & (1<<11)) ? "logical" : "physical",
>> +		  (__entry->e & (1<<15)) ? "level" : "edge",
>> +		  (__entry->e & (1<<16)) ? "|masked" : "")
>> +);
>> +
>>  TRACE_EVENT(kvm_msi_set_irq,
>>  	TP_PROTO(__u64 address, __u64 data),
>>  	TP_ARGS(address, data),
>> diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
>> index e8ce34c..a36c4c4 100644
>> --- a/virt/kvm/ioapic.c
>> +++ b/virt/kvm/ioapic.c
>> @@ -405,6 +405,24 @@ void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id)
>>  	spin_unlock(&ioapic->lock);
>>  }
>>  
>> +static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
>> +{
>> +	int i;
>> +	struct kvm_ioapic *ioapic = container_of(work, struct kvm_ioapic,
>> +						 eoi_inject.work);
>> +	spin_lock(&ioapic->lock);
>> +	for (i = 0; i < IOAPIC_NUM_PINS; i++) {
>> +		union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
>> +
>> +		if (ent->fields.trig_mode != IOAPIC_LEVEL_TRIG)
>> +			continue;
>> +
>> +		if (ioapic->irr & (1 << i) && !ent->fields.remote_irr)
>> +			ioapic_service(ioapic, i, false);
>> +	}
>> +	spin_unlock(&ioapic->lock);
>> +}
>> +
>>  static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
>>  		struct kvm_ioapic *ioapic, int vector, int trigger_mode)
>>  {
>> @@ -435,8 +453,33 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
>>  
>>  		ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
>>  		ent->fields.remote_irr = 0;
>> -		if (ioapic->irr & (1 << i))
>> -			ioapic_service(ioapic, i, false);
>> +		if (!ent->fields.mask && (ioapic->irr & (1 << i))) {
>
> The mask check is new - why now? You don't check it in the work handler
> as well.
>
The mask check is there to avoid incrementing ioapic->irq_eoi[i] when the irq
is masked; in that case the count should be zeroed instead. There is no need
to repeat the check in the work handler, because it is already performed in
ioapic_service().
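For reference, the guard at the top of ioapic_service() already looks roughly
like this (a sketch that elides the delivery logic; see virt/kvm/ioapic.c for
the real thing):

	static int ioapic_service(struct kvm_ioapic *ioapic, int irq,
				  bool line_status)
	{
		union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq];

		/* a masked entry is never delivered */
		if (entry->fields.mask)
			return -1;

		/* ... build the kvm_lapic_irq from the entry, deliver it,
		 * and return the delivery result ...
		 */
	}

So the worst case for the work handler is a wasted call on a masked pin.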
"|masked" : "") >> +); >> + >> TRACE_EVENT(kvm_msi_set_irq, >> TP_PROTO(__u64 address, __u64 data), >> TP_ARGS(address, data), >> diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c >> index e8ce34c..a36c4c4 100644 >> --- a/virt/kvm/ioapic.c >> +++ b/virt/kvm/ioapic.c >> @@ -405,6 +405,24 @@ void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id) >> spin_unlock(&ioapic->lock); >> } >> >> +static void kvm_ioapic_eoi_inject_work(struct work_struct *work) >> +{ >> + int i; >> + struct kvm_ioapic *ioapic = container_of(work, struct kvm_ioapic, >> + eoi_inject.work); >> + spin_lock(&ioapic->lock); >> + for (i = 0; i < IOAPIC_NUM_PINS; i++) { >> + union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i]; >> + >> + if (ent->fields.trig_mode != IOAPIC_LEVEL_TRIG) >> + continue; >> + >> + if (ioapic->irr & (1 << i) && !ent->fields.remote_irr) >> + ioapic_service(ioapic, i, false); >> + } >> + spin_unlock(&ioapic->lock); >> +} >> + >> static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, >> struct kvm_ioapic *ioapic, int vector, int trigger_mode) >> { >> @@ -435,8 +453,33 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, >> >> ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); >> ent->fields.remote_irr = 0; >> - if (ioapic->irr & (1 << i)) >> - ioapic_service(ioapic, i, false); >> + if (!ent->fields.mask && (ioapic->irr & (1 << i))) { > >The mask check is new - why now? You don't check it in the work handler >as well. > The mask check is to avoid incrementing ioapic->irq_eoi[i] when this irq is masked, the count should be zeroed, but needless to check it in the work handler, the check will be performed in ioapic_service(). >> + ++ioapic->irq_eoi[i]; >> + if (ioapic->irq_eoi[i] == IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) { >> + /* >> + * Real hardware does not deliver the irq so >> + * immediately during eoi broadcast, so we need >> + * to emulate this behavior. Otherwise, for >> + * guests who has not registered handler of a >> + * level irq, this irq would be injected >> + * immediately after guest enables interrupt >> + * (which happens usually at the end of the >> + * common interrupt routine). This would lead >> + * guest can't move forward and may miss the >> + * possibility to get proper irq handler >> + * registered. So we need to give some breath to >> + * guest. TODO: 1 is too long? > >TODO should be clarify. Maybe also translate the "1" into something >HZ-derived. > I will delete the TODO comment, now the risk of "1 is too long" can be ignored, because "if (ioapic->irq_eoi[i] == IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT)" cannot be true except interrupt storm happened from the the angle of probability. >> + */ >> + schedule_delayed_work(&ioapic->eoi_inject, 1); >> + ioapic->irq_eoi[i] = 0; >> + trace_kvm_ioapic_delayed_eoi_inj(ent->bits); >> + } >> + else { > >I think it's the desired kernel style to do "} else {" here, but I'm not >totally sure if that rule exists. > I read CodingStyle, you are right. >> + ioapic_service(ioapic, i, false); >> + } >> + else { > >I strongly suspect this version of the patch wasn't built... ;) > I backported it on my host linux-3.10, and built it okay. And this time I build it on upstream, failed, no matched "}" for if (ioapic->irq_eoi[i] == IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT). So sorry for my careless. I will post a new patch. 
>> +			ioapic->irq_eoi[i] = 0;
>> +		}
>>  	}
>>  }
>>  
>> @@ -565,12 +608,14 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
>>  {
>>  	int i;
>>  
>> +	cancel_delayed_work_sync(&ioapic->eoi_inject);
>>  	for (i = 0; i < IOAPIC_NUM_PINS; i++)
>>  		ioapic->redirtbl[i].fields.mask = 1;
>>  	ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
>>  	ioapic->ioregsel = 0;
>>  	ioapic->irr = 0;
>>  	ioapic->id = 0;
>> +	memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS);
>>  	rtc_irq_eoi_tracking_reset(ioapic);
>>  	update_handled_vectors(ioapic);
>>  }
>> @@ -589,6 +634,7 @@ int kvm_ioapic_init(struct kvm *kvm)
>>  	if (!ioapic)
>>  		return -ENOMEM;
>>  	spin_lock_init(&ioapic->lock);
>> +	INIT_DELAYED_WORK(&ioapic->eoi_inject, kvm_ioapic_eoi_inject_work);
>>  	kvm->arch.vioapic = ioapic;
>>  	kvm_ioapic_reset(ioapic);
>>  	kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
>> @@ -609,6 +655,7 @@ void kvm_ioapic_destroy(struct kvm *kvm)
>>  {
>>  	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
>>  
>> +	cancel_delayed_work_sync(&ioapic->eoi_inject);
>>  	if (ioapic) {
>>  		kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
>>  		kvm->arch.vioapic = NULL;
>> diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
>> index 90d43e9..260ac0a 100644
>> --- a/virt/kvm/ioapic.h
>> +++ b/virt/kvm/ioapic.h
>> @@ -34,6 +34,10 @@ struct kvm_vcpu;
>>  #define	IOAPIC_INIT		0x5
>>  #define	IOAPIC_EXTINT		0x7
>>  
>> +/* max successive count of the same irq during ioapic eoi broadcast; if this
>> + * limit is reached, an interrupt storm is assumed to have happened */
>> +#define IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT 10000
>> +
>>  #ifdef CONFIG_X86
>>  #define RTC_GSI 8
>>  #else
>> @@ -59,6 +63,8 @@ struct kvm_ioapic {
>>  	spinlock_t lock;
>>  	DECLARE_BITMAP(handled_vectors, 256);
>>  	struct rtc_status rtc_status;
>> +	struct delayed_work eoi_inject;
>> +	u32 irq_eoi[IOAPIC_NUM_PINS];
>>  };
>>  
>>  #ifdef DEBUG
>>
>
> The general approach looks sane to me. I was first concerned the delay
> could affect non-broken guests as well, but the loop counter prevents
> this effectively.
>
> Jan