Re: [PATCH 3/5] KVM: PPC: Book3S HV: Improve handling of local vs. global TLB invalidations

On 22.11.2012, at 10:28, Paul Mackerras wrote:

> When we change or remove a HPT (hashed page table) entry, we can do
> either a global TLB invalidation (tlbie) that works across the whole
> machine, or a local invalidation (tlbiel) that only affects this core.
> Currently we do local invalidations if the VM has only one vcpu or if
> the guest requests it with the H_LOCAL flag, though the guest Linux
> kernel currently doesn't ever use H_LOCAL.  Then, to cope with the
> possibility that vcpus moving around to different physical cores might
> expose stale TLB entries, there is some code in kvmppc_hv_entry to
> flush the whole TLB of entries for this VM if either this vcpu is now
> running on a different physical core from where it last ran, or if this
> physical core last ran a different vcpu.
> 
> There are a number of problems on POWER7 with this as it stands:
> 
> - The TLB invalidation is done per thread, whereas it only needs to be
>  done per core, since the TLB is shared between the threads.
> - With the possibility of the host paging out guest pages, the use of
>  H_LOCAL by an SMP guest is dangerous since the guest could possibly
>  retain and use a stale TLB entry pointing to a page that had been
>  removed from the guest.

I don't understand this part. Don't we flush the TLB when the page gets evicted from the shadow HTAB?

Alex

> - The TLB invalidations that we do when a vcpu moves from one physical
>  core to another are unnecessary in the case of an SMP guest that isn't
>  using H_LOCAL.
> - The optimization of using local invalidations rather than global should
>  apply to guests with one virtual core, not just one vcpu.
> 
> (None of this applies on PPC970, since there we always have to
> invalidate the whole TLB when entering and leaving the guest, and we
> can't support paging out guest memory.)
> 
> To fix these problems and simplify the code, we now maintain a simple
> cpumask of which cpus need to flush the TLB on entry to the guest.
> (This is indexed by cpu, though we only ever use the bits for thread
> 0 of each core.)  Whenever we do a local TLB invalidation, we set the
> bits for every cpu except the bit for thread 0 of the core that we're
> currently running on.  Whenever we enter a guest, we test and clear the
> bit for our core, and flush the TLB if it was set.
> 
> On initial startup of the VM, and when resetting the HPT, we set all the
> bits in the need_tlb_flush cpumask, since any core could potentially have
> stale TLB entries left by the previous VM that used the same LPID, or the
> previous contents of the HPT.
> 
> Then, we maintain a count of the number of online virtual cores, and use
> that when deciding whether to use a local invalidation rather than the
> number of online vcpus.  The code to make that decision is extracted out
> into a new function, global_invalidates().  For multi-core guests on
> POWER7 (i.e. when we are using mmu notifiers), we now never do local
> invalidations regardless of the H_LOCAL flag.
> 
> Signed-off-by: Paul Mackerras <paulus@xxxxxxxxx>
> ---
> arch/powerpc/include/asm/kvm_host.h     |    5 +--
> arch/powerpc/kernel/asm-offsets.c       |    4 +--
> arch/powerpc/kvm/book3s_64_mmu_hv.c     |    7 ++--
> arch/powerpc/kvm/book3s_hv.c            |    9 ++++-
> arch/powerpc/kvm/book3s_hv_rm_mmu.c     |   37 +++++++++++++++++---
> arch/powerpc/kvm/book3s_hv_rmhandlers.S |   56 ++++++++++++++-----------------
> 6 files changed, 73 insertions(+), 45 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
> index 58c7264..62fbd38 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -246,11 +246,12 @@ struct kvm_arch {
> 	int using_mmu_notifiers;
> 	u32 hpt_order;
> 	atomic_t vcpus_running;
> +	u32 online_vcores;
> 	unsigned long hpt_npte;
> 	unsigned long hpt_mask;
> 	atomic_t hpte_mod_interest;
> 	spinlock_t slot_phys_lock;
> -	unsigned short last_vcpu[NR_CPUS];
> +	cpumask_t need_tlb_flush;
> 	struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
> 	struct kvmppc_linear_info *hpt_li;
> #endif /* CONFIG_KVM_BOOK3S_64_HV */
> @@ -275,6 +276,7 @@ struct kvmppc_vcore {
> 	int nap_count;
> 	int napping_threads;
> 	u16 pcpu;
> +	u16 last_cpu;
> 	u8 vcore_state;
> 	u8 in_guest;
> 	struct list_head runnable_threads;
> @@ -523,7 +525,6 @@ struct kvm_vcpu_arch {
> 	u64 dec_jiffies;
> 	u64 dec_expires;
> 	unsigned long pending_exceptions;
> -	u16 last_cpu;
> 	u8 ceded;
> 	u8 prodded;
> 	u32 last_inst;
> diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
> index 7523539..4e23ba2 100644
> --- a/arch/powerpc/kernel/asm-offsets.c
> +++ b/arch/powerpc/kernel/asm-offsets.c
> @@ -441,8 +441,7 @@ int main(void)
> 	DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr));
> 	DEFINE(KVM_HOST_SDR1, offsetof(struct kvm, arch.host_sdr1));
> 	DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock));
> -	DEFINE(KVM_ONLINE_CPUS, offsetof(struct kvm, online_vcpus.counter));
> -	DEFINE(KVM_LAST_VCPU, offsetof(struct kvm, arch.last_vcpu));
> +	DEFINE(KVM_NEED_FLUSH, offsetof(struct kvm, arch.need_tlb_flush.bits));
> 	DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr));
> 	DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor));
> 	DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v));
> @@ -470,7 +469,6 @@ int main(void)
> 	DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb));
> 	DEFINE(VCPU_SLB_MAX, offsetof(struct kvm_vcpu, arch.slb_max));
> 	DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr));
> -	DEFINE(VCPU_LAST_CPU, offsetof(struct kvm_vcpu, arch.last_cpu));
> 	DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr));
> 	DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
> 	DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> index 1029e22..2d61e01 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> @@ -148,11 +148,8 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp)
> 		 * Reset all the reverse-mapping chains for all memslots
> 		 */
> 		kvmppc_rmap_reset(kvm);
> -		/*
> -		 * Set the whole last_vcpu array to an invalid vcpu number.
> -		 * This ensures that each vcpu will flush its TLB on next entry.
> -		 */
> -		memset(kvm->arch.last_vcpu, 0xff, sizeof(kvm->arch.last_vcpu));
> +		/* Ensure that each vcpu will flush its TLB on next entry. */
> +		cpumask_setall(&kvm->arch.need_tlb_flush);
> 		*htab_orderp = order;
> 		err = 0;
> 	} else {
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index a4f59db..ddbec60 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -853,7 +853,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
> 		goto free_vcpu;
> 
> 	vcpu->arch.shared = &vcpu->arch.shregs;
> -	vcpu->arch.last_cpu = -1;
> 	vcpu->arch.mmcr[0] = MMCR0_FC;
> 	vcpu->arch.ctrl = CTRL_RUNLATCH;
> 	/* default to host PVR, since we can't spoof it */
> @@ -880,6 +879,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
> 			vcore->preempt_tb = TB_NIL;
> 		}
> 		kvm->arch.vcores[core] = vcore;
> +		kvm->arch.online_vcores++;
> 	}
> 	mutex_unlock(&kvm->lock);
> 
> @@ -1802,6 +1802,13 @@ int kvmppc_core_init_vm(struct kvm *kvm)
> 		return -ENOMEM;
> 	kvm->arch.lpid = lpid;
> 
> +	/*
> +	 * Since we don't flush the TLB when tearing down a VM,
> +	 * and this lpid might have previously been used,
> +	 * make sure we flush on each core before running the new VM.
> +	 */
> +	cpumask_setall(&kvm->arch.need_tlb_flush);
> +
> 	INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
> 
> 	kvm->arch.rma = NULL;
> diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> index fc3da32..7e1f7e2 100644
> --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> @@ -35,6 +35,37 @@ static void *real_vmalloc_addr(void *x)
> 	return __va(addr);
> }
> 
> +/* Return 1 if we need to do a global tlbie, 0 if we can use tlbiel */
> +static int global_invalidates(struct kvm *kvm, unsigned long flags)
> +{
> +	int global;
> +
> +	/*
> +	 * If there is only one vcore, and it's currently running,
> +	 * we can use tlbiel as long as we mark all other physical
> +	 * cores as potentially having stale TLB entries for this lpid.
> +	 * If we're not using MMU notifiers, we never take pages away
> +	 * from the guest, so we can use tlbiel if requested.
> +	 * Otherwise, don't use tlbiel.
> +	 */
> +	if (kvm->arch.online_vcores == 1 && local_paca->kvm_hstate.kvm_vcore)
> +		global = 0;
> +	else if (kvm->arch.using_mmu_notifiers)
> +		global = 1;
> +	else
> +		global = !(flags & H_LOCAL);
> +
> +	if (!global) {
> +		/* any other core might now have stale TLB entries... */
> +		smp_wmb();
> +		cpumask_setall(&kvm->arch.need_tlb_flush);
> +		cpumask_clear_cpu(local_paca->kvm_hstate.kvm_vcore->pcpu,
> +				  &kvm->arch.need_tlb_flush);
> +	}
> +
> +	return global;
> +}
> +
> /*
>  * Add this HPTE into the chain for the real page.
>  * Must be called with the chain locked; it unlocks the chain.
> @@ -390,7 +421,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
> 	if (v & HPTE_V_VALID) {
> 		hpte[0] &= ~HPTE_V_VALID;
> 		rb = compute_tlbie_rb(v, hpte[1], pte_index);
> -		if (!(flags & H_LOCAL) && atomic_read(&kvm->online_vcpus) > 1) {
> +		if (global_invalidates(kvm, flags)) {
> 			while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
> 				cpu_relax();
> 			asm volatile("ptesync" : : : "memory");
> @@ -565,8 +596,6 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
> 		return H_NOT_FOUND;
> 	}
> 
> -	if (atomic_read(&kvm->online_vcpus) == 1)
> -		flags |= H_LOCAL;
> 	v = hpte[0];
> 	bits = (flags << 55) & HPTE_R_PP0;
> 	bits |= (flags << 48) & HPTE_R_KEY_HI;
> @@ -587,7 +616,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
> 	if (v & HPTE_V_VALID) {
> 		rb = compute_tlbie_rb(v, r, pte_index);
> 		hpte[0] = v & ~HPTE_V_VALID;
> -		if (!(flags & H_LOCAL)) {
> +		if (global_invalidates(kvm, flags)) {
> 			while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
> 				cpu_relax();
> 			asm volatile("ptesync" : : : "memory");
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> index b31fb4f..dbb5ced 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -313,7 +313,33 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
> 	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
> 	mtspr	SPRN_LPID,r7
> 	isync
> +
> +	/* See if we need to flush the TLB */
> +	lhz	r6,PACAPACAINDEX(r13)	/* test_bit(cpu, need_tlb_flush) */
> +	clrldi	r7,r6,64-6		/* extract bit number (6 bits) */
> +	srdi	r6,r6,6			/* doubleword number */
> +	sldi	r6,r6,3			/* address offset */
> +	add	r6,r6,r9
> +	addi	r6,r6,KVM_NEED_FLUSH	/* dword in kvm->arch.need_tlb_flush */
> 	li	r0,1
> +	sld	r0,r0,r7
> +	ld	r7,0(r6)
> +	and.	r7,r7,r0
> +	beq	22f
> +23:	ldarx	r7,0,r6			/* if set, clear the bit */
> +	andc	r7,r7,r0
> +	stdcx.	r7,0,r6
> +	bne	23b
> +	li	r6,128			/* and flush the TLB */
> +	mtctr	r6
> +	li	r7,0x800		/* IS field = 0b10 */
> +	ptesync
> +28:	tlbiel	r7
> +	addi	r7,r7,0x1000
> +	bdnz	28b
> +	ptesync
> +
> +22:	li	r0,1
> 	stb	r0,VCORE_IN_GUEST(r5)	/* signal secondaries to continue */
> 	b	10f
> 
> @@ -336,36 +362,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
> 	mr	r9,r4
> 	blt	hdec_soon
> 
> -	/*
> -	 * Invalidate the TLB if we could possibly have stale TLB
> -	 * entries for this partition on this core due to the use
> -	 * of tlbiel.
> -	 * XXX maybe only need this on primary thread?
> -	 */
> -	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
> -	lwz	r5,VCPU_VCPUID(r4)
> -	lhz	r6,PACAPACAINDEX(r13)
> -	rldimi	r6,r5,0,62		/* XXX map as if threads 1:1 p:v */
> -	lhz	r8,VCPU_LAST_CPU(r4)
> -	sldi	r7,r6,1			/* see if this is the same vcpu */
> -	add	r7,r7,r9		/* as last ran on this pcpu */
> -	lhz	r0,KVM_LAST_VCPU(r7)
> -	cmpw	r6,r8			/* on the same cpu core as last time? */
> -	bne	3f
> -	cmpw	r0,r5			/* same vcpu as this core last ran? */
> -	beq	1f
> -3:	sth	r6,VCPU_LAST_CPU(r4)	/* if not, invalidate partition TLB */
> -	sth	r5,KVM_LAST_VCPU(r7)
> -	li	r6,128
> -	mtctr	r6
> -	li	r7,0x800		/* IS field = 0b10 */
> -	ptesync
> -2:	tlbiel	r7
> -	addi	r7,r7,0x1000
> -	bdnz	2b
> -	ptesync
> -1:
> -
> 	/* Save purr/spurr */
> 	mfspr	r5,SPRN_PURR
> 	mfspr	r6,SPRN_SPURR
> -- 
> 1.7.10.rc3.219.g53414
> 
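
To make the new guest-entry hunk in book3s_hv_rmhandlers.S easier to follow, here is a rough C restatement of what that assembly does. This is a sketch only: the function name is invented, and the real code is hand-written real-mode assembly running on the primary thread, so it open-codes the cpumask test-and-clear with ldarx/andc/stdcx. rather than calling a generic helper.

/*
 * Rough C sketch of the TLB-flush-on-entry check added to
 * kvmppc_hv_entry (book3s_hv_rmhandlers.S).  Illustrative only;
 * the real code cannot call cpumask_test_and_clear_cpu() in real mode.
 */
static void kvmppc_check_need_tlb_flush_sketch(struct kvm *kvm, int pcpu)
{
	/* the ldarx/andc/stdcx. sequence in the asm does this test-and-clear */
	if (cpumask_test_and_clear_cpu(pcpu, &kvm->arch.need_tlb_flush)) {
		unsigned long rb = 0x800;	/* IS field = 0b10 */
		int set;

		asm volatile("ptesync" : : : "memory");
		for (set = 0; set < 128; ++set) {
			/* one tlbiel per TLB set for this LPID */
			asm volatile("tlbiel %0" : : "r" (rb));
			rb += 0x1000;		/* advance to the next set */
		}
		asm volatile("ptesync" : : : "memory");
	}
}

The bookkeeping on the invalidation side is the global_invalidates() hunk above: together they replace the old last_vcpu/last_cpu tracking.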
