Processors that implement ISA v3.0 or later don't necessarily have threads in a core sharing all translations, and/or TLBIEL does not necessarily invalidate translations on all other threads (the architecture talks only about the effect on translations for "the thread executing the tlbiel instruction"). While this optimisation worked for POWER9, it may not for future implementations, so remove it. A POWER9-specific optimisation would have to have a specific CPU feature to check, if it were to be re-added. Signed-off-by: Nicholas Piggin <npiggin@xxxxxxxxx> --- arch/powerpc/kvm/book3s_hv.c | 32 +++++++++++++++++++--------- arch/powerpc/kvm/book3s_hv_builtin.c | 9 -------- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 6 ------ 3 files changed, 22 insertions(+), 25 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 2d8627dbd9f6..752daf43f780 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2588,22 +2588,34 @@ static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu) { struct kvm_nested_guest *nested = vcpu->arch.nested; cpumask_t *cpu_in_guest; + cpumask_t *need_tlb_flush; int i; - cpu = cpu_first_thread_sibling(cpu); if (nested) { - cpumask_set_cpu(cpu, &nested->need_tlb_flush); + need_tlb_flush = &nested->need_tlb_flush; cpu_in_guest = &nested->cpu_in_guest; } else { - cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush); + need_tlb_flush = &kvm->arch.need_tlb_flush; cpu_in_guest = &kvm->arch.cpu_in_guest; } + + cpu = cpu_first_thread_sibling(cpu); + for (i = 0; i < threads_per_core; ++i) + cpumask_set_cpu(cpu + i, need_tlb_flush); + /* - * Make sure setting of bit in need_tlb_flush precedes + * Make sure setting of bits in need_tlb_flush precedes * testing of cpu_in_guest bits. The matching barrier on * the other side is the first smp_mb() in kvmppc_run_core().
*/ smp_mb(); + + /* + * Pull vcpus out of guests if necessary, such that they'll notice + * the need_tlb_flush bit when they re-enter the guest. If this was + * ever a performance concern, it would be interesting to compare + * with performance of using TLBIE. + */ for (i = 0; i < threads_per_core; ++i) if (cpumask_test_cpu(cpu + i, cpu_in_guest)) smp_call_function_single(cpu + i, do_nothing, NULL, 1); @@ -2632,18 +2644,18 @@ static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu) * can move around between pcpus. To cope with this, when * a vcpu moves from one pcpu to another, we need to tell * any vcpus running on the same core as this vcpu previously - * ran to flush the TLB. The TLB is shared between threads, - * so we use a single bit in .need_tlb_flush for all 4 threads. + * ran to flush the TLB. */ if (prev_cpu != pcpu) { - if (prev_cpu >= 0 && - cpu_first_thread_sibling(prev_cpu) != - cpu_first_thread_sibling(pcpu)) - radix_flush_cpu(kvm, prev_cpu, vcpu); if (nested) nested->prev_cpu[vcpu->arch.nested_vcpu_id] = pcpu; else vcpu->arch.prev_cpu = pcpu; + + if (prev_cpu < 0) + return; /* first run */ + + radix_flush_cpu(kvm, prev_cpu, vcpu); } } diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index f3d3183249fe..dad118760a4e 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -789,15 +789,6 @@ void kvmppc_check_need_tlb_flush(struct kvm *kvm, int pcpu, { cpumask_t *need_tlb_flush; - /* - * On POWER9, individual threads can come in here, but the - * TLB is shared between the 4 threads in a core, hence - * invalidating on one thread invalidates for all. - * Thus we make all 4 threads use the same bit. 
- */ - if (cpu_has_feature(CPU_FTR_ARCH_300)) - pcpu = cpu_first_thread_sibling(pcpu); - if (nested) need_tlb_flush = &nested->need_tlb_flush; else diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 88da2764c1bb..f87237927096 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -62,12 +62,6 @@ static int global_invalidates(struct kvm *kvm) smp_wmb(); cpumask_setall(&kvm->arch.need_tlb_flush); cpu = local_paca->kvm_hstate.kvm_vcore->pcpu; - /* - * On POWER9, threads are independent but the TLB is shared, - * so use the bit for the first thread to represent the core. - */ - if (cpu_has_feature(CPU_FTR_ARCH_300)) - cpu = cpu_first_thread_sibling(cpu); cpumask_clear_cpu(cpu, &kvm->arch.need_tlb_flush); } -- 2.23.0