On Mon, Sep 10, 2012 at 04:09:15PM +0300, Gleb Natapov wrote: > Most interrupt are delivered to only one vcpu. Use pre-build tables to > find interrupt destination instead of looping through all vcpus. > > Signed-off-by: Gleb Natapov <gleb@xxxxxxxxxx> Looks good overall. I think I see some bugs, with rcu use and others. the most suspecious thing is that code seems to use rcu but there are no calls to rcu_dereference anywhere. Pls see below. Thanks for looking into this, once ready I intend to rework direct injection to work on top of this. > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > index 64adb61..121f308 100644 > --- a/arch/x86/include/asm/kvm_host.h > +++ b/arch/x86/include/asm/kvm_host.h > @@ -511,6 +511,14 @@ struct kvm_arch_memory_slot { > struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; > }; > > +struct kvm_apic_map { > + struct rcu_head rcu; > + bool flat; > + u8 ldr_bits; > + struct kvm_lapic *phys_map[255]; Not 256? apic id is sometimes used in the lookup - is is guaranteed that guest can not set it to 0xff? If yes this will overflow. > + struct kvm_lapic *logical_map[16][16]; These are large arrays: 2Kbyte each one, which is bad for cache. Is it not better to have vcpu index stored there? that would reduce cache footprint by x8. > +}; > + > struct kvm_arch { > unsigned int n_used_mmu_pages; > unsigned int n_requested_mmu_pages; > @@ -528,6 +536,8 @@ struct kvm_arch { > struct kvm_ioapic *vioapic; > struct kvm_pit *vpit; > int vapics_in_nmi_mode; > + struct kvm_apic_map *apic_map; > + struct mutex apic_map_lock; > > unsigned int tss_addr; > struct page *apic_access_page; > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c > index 07ad628..d18ddd2 100644 > --- a/arch/x86/kvm/lapic.c > +++ b/arch/x86/kvm/lapic.c > @@ -139,11 +139,101 @@ static inline int apic_enabled(struct kvm_lapic *apic) > (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ > APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) > > +static inline int apic_x2apic_mode(struct kvm_lapic *apic) > +{ > + return apic->vcpu->arch.apic_base & X2APIC_ENABLE; > +} > + > + two emoty lines not needed > static inline int kvm_apic_id(struct kvm_lapic *apic) > { > return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff; > } > > +static void rcu_free_apic_map(struct rcu_head *head) > +{ > + struct kvm_apic_map *p = container_of(head, > + struct kvm_apic_map, rcu); why break this line? it is not too long as is. > + > + kfree(p); > +} > + > +static void kvm_apic_get_logical_id(u32 ldr, bool flat, u8 ldr_bits, > + u16 *cid, u16 *lid) > +{ > + if (ldr_bits == 32) { > + *cid = ldr >> 16; > + *lid = ldr & 0xffff; > + } else { > + ldr = GET_APIC_LOGICAL_ID(ldr); > + > + if (flat) { > + *cid = 0; > + *lid = ldr; > + } else { > + *cid = ldr >> 4; > + *lid = ldr & 0xf; > + } > + } > +} > + > +static inline int recalculate_apic_map(struct kvm *kvm) __must_check? > +{ > + struct kvm_apic_map *new, *old = NULL; > + struct kvm_vcpu *vcpu; > + int i; > + > + new = kzalloc(sizeof(struct kvm_apic_map), GFP_KERNEL); > + empty line not needed here :) > + if (!new) > + return -ENOMEM; > + > + new->ldr_bits = 8; > + new->flat = true; > + kvm_for_each_vcpu(i, vcpu, kvm) { > + u16 cid, lid; > + struct kvm_lapic *apic = vcpu->arch.apic; > + > + if (!kvm_apic_present(vcpu)) > + continue; > + > + if (apic_x2apic_mode(apic)) { > + new->ldr_bits = 32; > + new->flat = false; > + } else if (kvm_apic_sw_enabled(apic) && new->flat && > + kvm_apic_get_reg(apic, APIC_DFR) == APIC_DFR_CLUSTER) > + new->flat = false; > + > + new->phys_map[kvm_apic_id(apic)] = apic; > + kvm_apic_get_logical_id(kvm_apic_get_reg(apic, APIC_LDR), > + new->flat, new->ldr_bits, &cid, &lid); > + > + if (lid) > + new->logical_map[cid][ffs(lid) - 1] = apic; > + } > + mutex_lock(&kvm->arch.apic_map_lock); > + old = kvm->arch.apic_map; rcu_dereference_protected? > + rcu_assign_pointer(kvm->arch.apic_map, new); > + mutex_unlock(&kvm->arch.apic_map_lock); > + > + if (old) > + call_rcu(&old->rcu, rcu_free_apic_map); What guarantees rcu_free_apic_map is called before module goes away? I suspect we need at least rcu_barrier here: https://lwn.net/Articles/217484/ Also, this is done upon guest request? This allocates 2K and delays free until next rcu grace period. Guest can buffer up *a lot of* host kernel memory before this happens - looks like a DOS potential? Maybe simply synchronize_rcu here? Or if that looks too aggressive, adding this to some per-kvm pending list, and call rcu_barrier once that list is too large. > + > + return 0; > +} > + > +static inline int kvm_apic_set_id(struct kvm_lapic *apic, u8 id) > +{ > + apic_set_reg(apic, APIC_ID, id << 24); > + return recalculate_apic_map(apic->vcpu->kvm); > +} > + > +static inline int kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id) > +{ > + apic_set_reg(apic, APIC_LDR, id); > + return recalculate_apic_map(apic->vcpu->kvm); > +} > + return value of these functions seems never checked. > static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type) > { > return !(kvm_apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED); > @@ -193,11 +283,6 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) > apic_set_reg(apic, APIC_LVR, v); > } > > -static inline int apic_x2apic_mode(struct kvm_lapic *apic) > -{ > - return apic->vcpu->arch.apic_base & X2APIC_ENABLE; > -} > - > static const unsigned int apic_lvt_mask[APIC_LVT_NUM] = { > LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */ > LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ > @@ -477,6 +562,67 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, > return result; > } > > +bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, > + struct kvm_lapic_irq *irq, int *r) > +{ > + unsigned long bitmap = 1; > + struct kvm_lapic **dst; > + int i; > + > + if (irq->shorthand == APIC_DEST_SELF) { > + *r = kvm_apic_set_irq(src->vcpu, irq); > + return true; > + } > + > + if (irq->shorthand) > + return false; > + > + if (irq->dest_mode == 0) { /* physical mode */ > + if (irq->delivery_mode == APIC_DM_LOWEST || > + irq->dest_id == 0xff) > + return false; > + rcu_read_lock(); > + dst = &kvm->arch.apic_map rcu_dereference? Also, how do we know apic_map is not NULL at this point? >->phys_map[irq->dest_id & 0xff]; I went back and forth and I think I see what is going on here. bitmap is 1, this way for_each_set_bit below triggers with value 0. and we get &dst[0] which is same as dst. And read lock here is pared with unlock outside if. So it looks correct. But IMHO this is just trying too hard to reduce duplication, it confused at least this reader. Is this erquivalent to: rcu_read_lock(); dst = &kvm->arch.apic_map->phys_map[irq->dest_id & 0xff]; *r += kvm_apic_set_irq(dst->vcpu, irq); rcu_read_unlock(); return true; ? If yes would be much clearer this way. And the long code section below will also cleaner because nesting is more shallow. > + } else { > + u16 cid, lid; > + u32 mda = irq->dest_id; > + > + rcu_read_lock(); > + if (kvm->arch.apic_map->ldr_bits == 8) rcu_dereference? Also, how do we know apic_map is not NULL at this point? > + mda <<= 24; > + > + kvm_apic_get_logical_id(mda, kvm->arch.apic_map->flat, > + kvm->arch.apic_map->ldr_bits, &cid, &lid); > + dst = kvm->arch.apic_map->logical_map[cid]; > + > + bitmap = lid; > + if (irq->delivery_mode == APIC_DM_LOWEST) { > + int l = -1; > + for_each_set_bit(i, &bitmap, 16) { constant 16 repeated here and below. to make it a bit prettier: #define foreach_bit_in_apic_lid(bit, lid) for_each_set_bit(bit, &lid, 16) ? > + if (!dst[i]) > + continue; > + if (l < 0) > + l = i; > + else if (kvm_apic_compare_prio(dst[i]->vcpu, dst[l]->vcpu) < 0) > + l = i; > + } > + > + bitmap = (l >= 0) ? 1 << l : 0; We already know the destination here. Why encode it in bitmap and rescan below? Rewriting bitmap like this confused at least this reader. This will be equivalent, right? if (l >= 0) *r = kvm_apic_set_irq(dst[i]->vcpu, irq); goto unlock; If yes IMHO this is much more readable. > + } > + } > + > + for_each_set_bit(i, &bitmap, 16) { I note that *r is never set if bitmap is 0 (this became apparent after rewrite as suggested above). Looks like this will leak some kernel stack info to userspace. > + if (!dst[i]) > + continue; > + if (*r < 0) > + *r = 0; > + *r += kvm_apic_set_irq(dst[i]->vcpu, irq); > + } > + > + rcu_read_unlock(); > + return true; > +} > + > /* > * Add a pending IRQ into lapic. > * Return 1 if successfully added and 0 if discarded. > @@ -880,7 +1026,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) > switch (reg) { > case APIC_ID: /* Local APIC ID */ > if (!apic_x2apic_mode(apic)) > - apic_set_reg(apic, APIC_ID, val); > + kvm_apic_set_id(apic, val >> 24); return value discarded here. > else > ret = 1; > break; > @@ -896,15 +1042,16 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) > > case APIC_LDR: > if (!apic_x2apic_mode(apic)) > - apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK); > + kvm_apic_set_ldr(apic, val & APIC_LDR_MASK); and here > else > ret = 1; > break; > > case APIC_DFR: > - if (!apic_x2apic_mode(apic)) > + if (!apic_x2apic_mode(apic)) { > apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); > - else > + recalculate_apic_map(apic->vcpu->kvm); return value discarded here. > + } else > ret = 1; > break; > > @@ -1135,6 +1282,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) > static_key_slow_dec_deferred(&apic_hw_disabled); > else > static_key_slow_inc(&apic_hw_disabled.key); > + recalculate_apic_map(vcpu->kvm); and here > } > > if (!kvm_vcpu_is_bsp(apic->vcpu)) > @@ -1144,7 +1292,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) > if (apic_x2apic_mode(apic)) { > u32 id = kvm_apic_id(apic); > u32 ldr = ((id & ~0xf) << 16) | (1 << (id & 0xf)); > - apic_set_reg(apic, APIC_LDR, ldr); > + kvm_apic_set_ldr(apic, ldr); and here > } > apic->base_address = apic->vcpu->arch.apic_base & > MSR_IA32_APICBASE_BASE; > @@ -1169,7 +1317,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) > /* Stop the timer in case it's a reset to an active apic */ > hrtimer_cancel(&apic->lapic_timer.timer); > > - apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); > + kvm_apic_set_id(apic, (u8)vcpu->vcpu_id); and here > kvm_apic_set_version(apic->vcpu); > > for (i = 0; i < APIC_LVT_NUM; i++) > @@ -1180,7 +1328,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) > apic_set_reg(apic, APIC_DFR, 0xffffffffU); > apic_set_spiv(apic, 0xff); > apic_set_reg(apic, APIC_TASKPRI, 0); > - apic_set_reg(apic, APIC_LDR, 0); > + kvm_apic_set_ldr(apic, 0); and here > apic_set_reg(apic, APIC_ESR, 0); > apic_set_reg(apic, APIC_ICR, 0); > apic_set_reg(apic, APIC_ICR2, 0); > @@ -1398,6 +1546,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, > /* set SPIV separately to get count of SW disabled APICs right */ > apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV))); > memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); > + /* call kvm_apic_set_id() to put apic into apic_map */ > + kvm_apic_set_id(apic, kvm_apic_id(apic)); and here > kvm_apic_set_version(vcpu); > > apic_update_ppr(apic); > diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h > index 615a8b0..5ce084e 100644 > --- a/arch/x86/kvm/lapic.h > +++ b/arch/x86/kvm/lapic.h > @@ -52,6 +52,9 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); > int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq); > int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); > > +bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, > + struct kvm_lapic_irq *irq, int *r); > + > u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); > void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); > void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, > @@ -120,5 +123,4 @@ static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu) > { > return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic); > } > - > #endif > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index f91e2c9..db6db8f 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -6269,6 +6269,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) > set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); > > raw_spin_lock_init(&kvm->arch.tsc_write_lock); > + mutex_init(&kvm->arch.apic_map_lock); > > return 0; > } > @@ -6319,6 +6320,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) > put_page(kvm->arch.apic_access_page); > if (kvm->arch.ept_identity_pagetable) > put_page(kvm->arch.ept_identity_pagetable); > + kfree(kvm->arch.apic_map); rcu_dereference_check(..., 1) since apic_map is around until vm is destroyed maybe really put it inline in arch? > } > > void kvm_arch_free_memslot(struct kvm_memory_slot *free, > diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c > index 7118be0..3ca89c4 100644 > --- a/virt/kvm/irq_comm.c > +++ b/virt/kvm/irq_comm.c > @@ -68,8 +68,13 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, > struct kvm_vcpu *vcpu, *lowest = NULL; > > if (irq->dest_mode == 0 && irq->dest_id == 0xff && > - kvm_is_dm_lowest_prio(irq)) > + kvm_is_dm_lowest_prio(irq)) { > printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); > + irq->delivery_mode = APIC_DM_FIXED; > + } > + > + if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r)) > + return r; > > kvm_for_each_vcpu(i, vcpu, kvm) { > if (!kvm_apic_present(vcpu)) > -- > Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html