On 22.11.2012, at 10:28, Paul Mackerras wrote:

> When we change or remove a HPT (hashed page table) entry, we can do
> either a global TLB invalidation (tlbie) that works across the whole
> machine, or a local invalidation (tlbiel) that only affects this core.
> Currently we do local invalidations if the VM has only one vcpu or if
> the guest requests it with the H_LOCAL flag, though the guest Linux
> kernel currently doesn't ever use H_LOCAL. Then, to cope with the
> possibility that vcpus moving around to different physical cores might
> expose stale TLB entries, there is some code in kvmppc_hv_entry to
> flush the whole TLB of entries for this VM if either this vcpu is now
> running on a different physical core from where it last ran, or if this
> physical core last ran a different vcpu.
> 
> There are a number of problems on POWER7 with this as it stands:
> 
> - The TLB invalidation is done per thread, whereas it only needs to be
>   done per core, since the TLB is shared between the threads.
> - With the possibility of the host paging out guest pages, the use of
>   H_LOCAL by an SMP guest is dangerous since the guest could possibly
>   retain and use a stale TLB entry pointing to a page that had been
>   removed from the guest.

I don't understand this part. Don't we flush the TLB when the page gets
evicted from the shadow HTAB?

Alex

> - The TLB invalidations that we do when a vcpu moves from one physical
>   core to another are unnecessary in the case of an SMP guest that isn't
>   using H_LOCAL.
> - The optimization of using local invalidations rather than global should
>   apply to guests with one virtual core, not just one vcpu.
> 
> (None of this applies on PPC970, since there we always have to
> invalidate the whole TLB when entering and leaving the guest, and we
> can't support paging out guest memory.)
> 
> To fix these problems and simplify the code, we now maintain a simple
> cpumask of which cpus need to flush the TLB on entry to the guest.
> (This is indexed by cpu, though we only ever use the bits for thread
> 0 of each core.) Whenever we do a local TLB invalidation, we set the
> bits for every cpu except the bit for thread 0 of the core that we're
> currently running on. Whenever we enter a guest, we test and clear the
> bit for our core, and flush the TLB if it was set.
> 
> On initial startup of the VM, and when resetting the HPT, we set all the
> bits in the need_tlb_flush cpumask, since any core could potentially have
> stale TLB entries from the previous VM to use the same LPID, or the
> previous contents of the HPT.
> 
> Then, we maintain a count of the number of online virtual cores, and use
> that when deciding whether to use a local invalidation rather than the
> number of online vcpus. The code to make that decision is extracted out
> into a new function, global_invalidates(). For multi-core guests on
> POWER7 (i.e. when we are using mmu notifiers), we now never do local
> invalidations regardless of the H_LOCAL flag.
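
To make sure I'm reading the scheme above correctly, here is a rough C
sketch of it (mine, not from the patch; note_local_invalidation(),
flush_tlb_if_needed() and flush_this_core_tlb() are invented names, and
the entry-side check is really done in real-mode assembly in
kvmppc_hv_entry):

#include <linux/cpumask.h>
#include <linux/kvm_host.h>

/* placeholder for the per-core TLB flush (see the sketch at the end) */
static void flush_this_core_tlb(void);

/* after a tlbiel-based (local) invalidation for this guest */
static void note_local_invalidation(struct kvm *kvm, int this_core_cpu)
{
	/* any other core might now have stale TLB entries for this lpid */
	smp_wmb();
	cpumask_setall(&kvm->arch.need_tlb_flush);
	cpumask_clear_cpu(this_core_cpu, &kvm->arch.need_tlb_flush);
}

/* on entry to the guest, per core (only thread 0's bit is ever used) */
static void flush_tlb_if_needed(struct kvm *kvm, int this_core_cpu)
{
	if (cpumask_test_and_clear_cpu(this_core_cpu,
				       &kvm->arch.need_tlb_flush))
		flush_this_core_tlb();
}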
> 
> Signed-off-by: Paul Mackerras <paulus@xxxxxxxxx>
> ---
>  arch/powerpc/include/asm/kvm_host.h     |    5 +--
>  arch/powerpc/kernel/asm-offsets.c       |    4 +--
>  arch/powerpc/kvm/book3s_64_mmu_hv.c     |    7 ++--
>  arch/powerpc/kvm/book3s_hv.c            |    9 ++++-
>  arch/powerpc/kvm/book3s_hv_rm_mmu.c     |   37 +++++++++++++++++---
>  arch/powerpc/kvm/book3s_hv_rmhandlers.S |   56 ++++++++++++++-----------------
>  6 files changed, 73 insertions(+), 45 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
> index 58c7264..62fbd38 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -246,11 +246,12 @@ struct kvm_arch {
>  	int using_mmu_notifiers;
>  	u32 hpt_order;
>  	atomic_t vcpus_running;
> +	u32 online_vcores;
>  	unsigned long hpt_npte;
>  	unsigned long hpt_mask;
>  	atomic_t hpte_mod_interest;
>  	spinlock_t slot_phys_lock;
> -	unsigned short last_vcpu[NR_CPUS];
> +	cpumask_t need_tlb_flush;
>  	struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
>  	struct kvmppc_linear_info *hpt_li;
>  #endif /* CONFIG_KVM_BOOK3S_64_HV */
> @@ -275,6 +276,7 @@ struct kvmppc_vcore {
>  	int nap_count;
>  	int napping_threads;
>  	u16 pcpu;
> +	u16 last_cpu;
>  	u8 vcore_state;
>  	u8 in_guest;
>  	struct list_head runnable_threads;
> @@ -523,7 +525,6 @@ struct kvm_vcpu_arch {
>  	u64 dec_jiffies;
>  	u64 dec_expires;
>  	unsigned long pending_exceptions;
> -	u16 last_cpu;
>  	u8 ceded;
>  	u8 prodded;
>  	u32 last_inst;
> diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
> index 7523539..4e23ba2 100644
> --- a/arch/powerpc/kernel/asm-offsets.c
> +++ b/arch/powerpc/kernel/asm-offsets.c
> @@ -441,8 +441,7 @@ int main(void)
>  	DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr));
>  	DEFINE(KVM_HOST_SDR1, offsetof(struct kvm, arch.host_sdr1));
>  	DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock));
> -	DEFINE(KVM_ONLINE_CPUS, offsetof(struct kvm, online_vcpus.counter));
> -	DEFINE(KVM_LAST_VCPU, offsetof(struct kvm, arch.last_vcpu));
> +	DEFINE(KVM_NEED_FLUSH, offsetof(struct kvm, arch.need_tlb_flush.bits));
>  	DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr));
>  	DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor));
>  	DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v));
> @@ -470,7 +469,6 @@ int main(void)
>  	DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb));
>  	DEFINE(VCPU_SLB_MAX, offsetof(struct kvm_vcpu, arch.slb_max));
>  	DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr));
> -	DEFINE(VCPU_LAST_CPU, offsetof(struct kvm_vcpu, arch.last_cpu));
>  	DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr));
>  	DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
>  	DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> index 1029e22..2d61e01 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> @@ -148,11 +148,8 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp)
>  		 * Reset all the reverse-mapping chains for all memslots
>  		 */
>  		kvmppc_rmap_reset(kvm);
> -		/*
> -		 * Set the whole last_vcpu array to an invalid vcpu number.
> -		 * This ensures that each vcpu will flush its TLB on next entry.
> -		 */
> -		memset(kvm->arch.last_vcpu, 0xff, sizeof(kvm->arch.last_vcpu));
> +		/* Ensure that each vcpu will flush its TLB on next entry. */
> +		cpumask_setall(&kvm->arch.need_tlb_flush);
>  		*htab_orderp = order;
>  		err = 0;
>  	} else {
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index a4f59db..ddbec60 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -853,7 +853,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
>  		goto free_vcpu;
>  
>  	vcpu->arch.shared = &vcpu->arch.shregs;
> -	vcpu->arch.last_cpu = -1;
>  	vcpu->arch.mmcr[0] = MMCR0_FC;
>  	vcpu->arch.ctrl = CTRL_RUNLATCH;
>  	/* default to host PVR, since we can't spoof it */
> @@ -880,6 +879,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
>  			vcore->preempt_tb = TB_NIL;
>  		}
>  		kvm->arch.vcores[core] = vcore;
> +		kvm->arch.online_vcores++;
>  	}
>  	mutex_unlock(&kvm->lock);
>  
> @@ -1802,6 +1802,13 @@ int kvmppc_core_init_vm(struct kvm *kvm)
>  		return -ENOMEM;
>  	kvm->arch.lpid = lpid;
>  
> +	/*
> +	 * Since we don't flush the TLB when tearing down a VM,
> +	 * and this lpid might have previously been used,
> +	 * make sure we flush on each core before running the new VM.
> +	 */
> +	cpumask_setall(&kvm->arch.need_tlb_flush);
> +
>  	INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
>  
>  	kvm->arch.rma = NULL;
> diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> index fc3da32..7e1f7e2 100644
> --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> @@ -35,6 +35,37 @@ static void *real_vmalloc_addr(void *x)
>  	return __va(addr);
>  }
>  
> +/* Return 1 if we need to do a global tlbie, 0 if we can use tlbiel */
> +static int global_invalidates(struct kvm *kvm, unsigned long flags)
> +{
> +	int global;
> +
> +	/*
> +	 * If there is only one vcore, and it's currently running,
> +	 * we can use tlbiel as long as we mark all other physical
> +	 * cores as potentially having stale TLB entries for this lpid.
> +	 * If we're not using MMU notifiers, we never take pages away
> +	 * from the guest, so we can use tlbiel if requested.
> +	 * Otherwise, don't use tlbiel.
> +	 */
> +	if (kvm->arch.online_vcores == 1 && local_paca->kvm_hstate.kvm_vcore)
> +		global = 0;
> +	else if (kvm->arch.using_mmu_notifiers)
> +		global = 1;
> +	else
> +		global = !(flags & H_LOCAL);
> +
> +	if (!global) {
> +		/* any other core might now have stale TLB entries... */
> +		smp_wmb();
> +		cpumask_setall(&kvm->arch.need_tlb_flush);
> +		cpumask_clear_cpu(local_paca->kvm_hstate.kvm_vcore->pcpu,
> +			&kvm->arch.need_tlb_flush);
> +	}
> +
> +	return global;
> +}
> +
>  /*
>   * Add this HPTE into the chain for the real page.
>   * Must be called with the chain locked; it unlocks the chain.
> @@ -390,7 +421,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
>  	if (v & HPTE_V_VALID) {
>  		hpte[0] &= ~HPTE_V_VALID;
>  		rb = compute_tlbie_rb(v, hpte[1], pte_index);
> -		if (!(flags & H_LOCAL) && atomic_read(&kvm->online_vcpus) > 1) {
> +		if (global_invalidates(kvm, flags)) {
>  			while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
>  				cpu_relax();
>  			asm volatile("ptesync" : : : "memory");
> @@ -565,8 +596,6 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
>  		return H_NOT_FOUND;
>  	}
>  
> -	if (atomic_read(&kvm->online_vcpus) == 1)
> -		flags |= H_LOCAL;
>  	v = hpte[0];
>  	bits = (flags << 55) & HPTE_R_PP0;
>  	bits |= (flags << 48) & HPTE_R_KEY_HI;
> @@ -587,7 +616,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
>  	if (v & HPTE_V_VALID) {
>  		rb = compute_tlbie_rb(v, r, pte_index);
>  		hpte[0] = v & ~HPTE_V_VALID;
> -		if (!(flags & H_LOCAL)) {
> +		if (global_invalidates(kvm, flags)) {
>  			while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
>  				cpu_relax();
>  			asm volatile("ptesync" : : : "memory");
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> index b31fb4f..dbb5ced 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -313,7 +313,33 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
>  	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
>  	mtspr	SPRN_LPID,r7
>  	isync
> +
> +	/* See if we need to flush the TLB */
> +	lhz	r6,PACAPACAINDEX(r13)	/* test_bit(cpu, need_tlb_flush) */
> +	clrldi	r7,r6,64-6		/* extract bit number (6 bits) */
> +	srdi	r6,r6,6			/* doubleword number */
> +	sldi	r6,r6,3			/* address offset */
> +	add	r6,r6,r9
> +	addi	r6,r6,KVM_NEED_FLUSH	/* dword in kvm->arch.need_tlb_flush */
>  	li	r0,1
> +	sld	r0,r0,r7
> +	ld	r7,0(r6)
> +	and.	r7,r7,r0
> +	beq	22f
> +23:	ldarx	r7,0,r6			/* if set, clear the bit */
> +	andc	r7,r7,r0
> +	stdcx.	r7,0,r6
> +	bne	23b
> +	li	r6,128			/* and flush the TLB */
> +	mtctr	r6
> +	li	r7,0x800		/* IS field = 0b10 */
> +	ptesync
> +28:	tlbiel	r7
> +	addi	r7,r7,0x1000
> +	bdnz	28b
> +	ptesync
> +
> +22:	li	r0,1
>  	stb	r0,VCORE_IN_GUEST(r5)	/* signal secondaries to continue */
>  	b	10f
>  
> @@ -336,36 +362,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
>  	mr	r9,r4
>  	blt	hdec_soon
>  
> -	/*
> -	 * Invalidate the TLB if we could possibly have stale TLB
> -	 * entries for this partition on this core due to the use
> -	 * of tlbiel.
> -	 * XXX maybe only need this on primary thread?
> -	 */
> -	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
> -	lwz	r5,VCPU_VCPUID(r4)
> -	lhz	r6,PACAPACAINDEX(r13)
> -	rldimi	r6,r5,0,62		/* XXX map as if threads 1:1 p:v */
> -	lhz	r8,VCPU_LAST_CPU(r4)
> -	sldi	r7,r6,1			/* see if this is the same vcpu */
> -	add	r7,r7,r9		/* as last ran on this pcpu */
> -	lhz	r0,KVM_LAST_VCPU(r7)
> -	cmpw	r6,r8			/* on the same cpu core as last time? */
> -	bne	3f
> -	cmpw	r0,r5			/* same vcpu as this core last ran? */
> -	beq	1f
> -3:	sth	r6,VCPU_LAST_CPU(r4)	/* if not, invalidate partition TLB */
> -	sth	r5,KVM_LAST_VCPU(r7)
> -	li	r6,128
> -	mtctr	r6
> -	li	r7,0x800		/* IS field = 0b10 */
> -	ptesync
> -2:	tlbiel	r7
> -	addi	r7,r7,0x1000
> -	bdnz	2b
> -	ptesync
> -1:
> -
>  	/* Save purr/spurr */
>  	mfspr	r5,SPRN_PURR
>  	mfspr	r6,SPRN_SPURR
> -- 
> 1.7.10.rc3.219.g53414
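
For reference, a rough C rendering (again mine, not part of the patch) of
the flush the new kvmppc_hv_entry code does once it has cleared this
core's bit in need_tlb_flush -- 128 tlbiel instructions with the IS field
set to 0b10, stepping through the TLB sets, bracketed by ptesync:

static void flush_this_core_tlb(void)
{
	unsigned long rb = 0x800;	/* IS field = 0b10, as in the patch */
	int set;

	asm volatile("ptesync" : : : "memory");
	/* one tlbiel per TLB set, rb stepped by 0x1000 each iteration */
	for (set = 0; set < 128; ++set, rb += 0x1000)
		asm volatile("tlbiel %0" : : "r" (rb) : "memory");
	asm volatile("ptesync" : : : "memory");
}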