The tlbie instruction is used to invalidate caching of translation
information derived from the partition and process scoped page tables.
This instruction is hypervisor privileged when specifying partition
scoped translations to be invalidated, so the interface is
paravirtualised with the H_TLB_INVALIDATE hcall, which a pseries
hypervisor uses to perform partition scoped invalidations. The hcall is
handled in the hypervisor in kvmhv_emulate_priv_tlbie().

When handling this hcall in the hypervisor it is necessary to
invalidate caching of partition scoped translations held in the shadow
page table (radix) or the shadow hpt (hash page table). This
functionality already exists for radix, so implement it for a hash
guest.

LPID wide invalidations are already handled and don't differ from the
radix case. However, invalidation of a single tlb entry corresponding
to a virtual address still needs to be implemented here. The
information needed to find the entry is provided as an hcall argument
which is expected to match the layout of rb as specified for the tlbie
instruction. The rb value provides an abbreviated virtual address, the
base and actual page sizes, and the segment size used to search for
the corresponding tlb entry.

A hash is computed from these and used to find the pteg which needs to
be searched. However, only 64 bits of the virtual address are supplied,
while depending on the segment and hpt sizes up to all 78 bits of the
virtual address may be needed to compute the hash. We therefore mask
out the bits which can't be determined and iterate through all possible
combinations, looking for a matching entry in each of the ptegs
addressed by the possible hash values. Although there is in theory a
1-to-1 relationship between ptes in the shadow hpt and the hpt
maintained by the guest hypervisor, we need to invalidate all matches
since they can't be differentiated.

When a matching entry is found it is invalidated if it was valid and
the corresponding tlbie is issued by the hypervisor; in either case the
pte is then zeroed. An optimisation here would be to just make the pte
absent and extend the rev map to store the host real doubleword, since
it is still valid.

Note: ric == 3 (cluster bombs) is not supported even though the ISA
technically allows it, since the encoding is implementation dependent
and Linux doesn't use it.

Signed-off-by: Suraj Jitindar Singh <sjitindarsingh@xxxxxxxxx>
---
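For reference, the pteg search space described above can be illustrated
with a small userspace sketch (not part of the patch). The 256M-segment
hash expression and the bit positions mirror the ones added in
kvmppc_hv_get_hash_value_va() and kvmhv_tlbie_hpt_addr(); the hpt size
and virtual address are made-up example values:

  #include <stdio.h>

  /* hash for a 256M segment, as in kvmppc_hv_get_hash_value_va() */
  static unsigned long hash_256m(unsigned long va, int pshift,
                                 unsigned long hpt_mask)
  {
          unsigned long somask = (1UL << 28) - 1;
          unsigned long hash = va >> 28;

          hash ^= (va & somask) >> pshift;
          return hash & hpt_mask;
  }

  int main(void)
  {
          unsigned long hpt_mask = (1UL << 24) - 1;   /* e.g. a 2GB hpt */
          unsigned long va = 0x123456789000UL;        /* example abbreviated va */
          unsigned long hash = hash_256m(va, 12, hpt_mask);
          /* hash bits which depend only on the missing top va bits */
          unsigned long unknown = (0x7UL << 36) & hpt_mask;
          unsigned long num = unknown >> 36;          /* extra ptegs to search */

          hash &= ~unknown;
          printf("first pteg index 0x%lx, %lu extra candidate pteg(s)\n",
                 hash, num);
          return 0;
  }

For this hpt size the unknown hash bits are masked away by
kvmppc_hpt_mask(), so a single candidate hash remains and only its
primary and secondary pteg need to be searched; sufficiently large hpts
(and 1T segments, whose unknown bits start at hash bit 24) are what
force the loop over several ptegs.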
 arch/powerpc/include/asm/book3s/64/mmu-hash.h |  11 +
 arch/powerpc/kvm/book3s_hv_nested.c           | 293 ++++++++++++++++++++++----
 2 files changed, 258 insertions(+), 46 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index f33dcb84a0bf..70f4545fecbb 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -129,6 +129,17 @@
 #define TLBIEL_INVAL_SET_MASK	0xfff000	/* set number to inval. */
 #define TLBIEL_INVAL_SET_SHIFT	12
 
+/* Fields in the rb registers for the tlbie instruction */
+#define TLBIE_RB_AVA_4K		ASM_CONST(0xfffffffffffff000)
+#define TLBIE_RB_AVA_L		ASM_CONST(0xfffffffffff00000)
+#define TLBIE_RB_LP		ASM_CONST(0x00000000000ff000)
+#define TLBIE_RB_B		ASM_CONST(0x0000000000000300)
+#define TLBIE_RB_B_1T		ASM_CONST(0x0000000000000100)
+#define TLBIE_RB_B_SHIFT	50	/* Shift to match the pte location */
+#define TLBIE_RB_AVAL		ASM_CONST(0x00000000000000fe)
+#define TLBIE_RB_AVAL_SHIFT	12
+#define TLBIE_RB_L		ASM_CONST(0x0000000000000001)
+
 #define POWER7_TLB_SETS		128	/* # sets in POWER7 TLB */
 #define POWER8_TLB_SETS		512	/* # sets in POWER8 TLB */
 #define POWER9_TLB_SETS_HASH	256	/* # sets in POWER9 TLB Hash mode */
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
index 463745e535c5..57add167115e 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -931,10 +931,10 @@ void kvmhv_release_all_nested(struct kvm *kvm)
 }
 
 /* caller must hold gp->tlb_lock */
-static void kvmhv_flush_nested(struct kvm_nested_guest *gp)
+static void kvmhv_flush_nested(struct kvm *kvm, struct kvm_nested_guest *gp,
+			       bool invalidate_ptbl)
 {
-	struct kvm *kvm = gp->l1_host;
-
+	/* Invalidate (zero) all entries in the shadow pgtable or shadow hpt */
 	spin_lock(&kvm->mmu_lock);
 	if (gp->radix) {
 		kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable,
@@ -947,10 +947,15 @@ static void kvmhv_flush_nested(struct kvm_nested_guest *gp)
 				  sizeof(struct revmap_entry));
 	}
 	spin_unlock(&kvm->mmu_lock);
+	/* remove all nested rmap entries and perform global invalidation */
+	kvmhv_remove_all_nested_rmap_lpid(kvm, gp->l1_lpid);
 	kvmhv_flush_lpid(gp->shadow_lpid, gp->radix);
-	kvmhv_update_ptbl_cache(gp);
-	if (gp->l1_gr_to_hr == 0)
-		kvmhv_remove_nested(gp);
+	/* was caching of the partition table entries also invalidated? */
+	if (invalidate_ptbl) {
+		kvmhv_update_ptbl_cache(gp);
+		if (gp->l1_gr_to_hr == 0)
+			kvmhv_remove_nested(gp);
+	}
 }
 
 struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid,
@@ -1296,6 +1301,158 @@ static bool kvmhv_invalidate_shadow_pte_radix(struct kvm_vcpu *vcpu,
 	return ret;
 }
 
+/* Called with the hpte locked */
+static void kvmhv_invalidate_shadow_pte_hash(struct kvm_hpt_info *hpt,
+					     unsigned int lpid, __be64 *hptep,
+					     unsigned long index)
+{
+	hpt->rev[index].guest_rpte = 0UL;
+	if (hptep[0] & cpu_to_be64(HPTE_V_VALID)) {
+		/* HPTE was previously valid, so we need to invalidate it */
+		hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
+		kvmppc_invalidate_hpte(lpid, hptep, index);
+	}
+	hptep[1] = 0ULL;
+	eieio();
+	__unlock_hpte(hptep, 0UL);
+}
+
+/* Calculate hash given a virtual address, base page shift, and segment size */
+static unsigned long kvmppc_hv_get_hash_value_va(struct kvm_hpt_info *hpt,
+						 unsigned long va, int pshift,
+						 unsigned long b)
+{
+	unsigned long hash, somask;
+
+	if (b & HPTE_R_B_1T) {		/* 1T segment */
+		somask = (1UL << 40) - 1;
+		hash = va >> 40;
+		hash ^= hash << 25;
+	} else {			/* 256M segment */
+		somask = (1UL << 28) - 1;
+		hash = va >> 28;
+	}
+	hash ^= ((va & somask) >> pshift);
+	hash &= kvmppc_hpt_mask(hpt);
+
+	return hash;
+}
+
+/* called with gp->tlb_lock held */
+static void kvmhv_tlbie_hpt_addr(struct kvm_nested_guest *gp, unsigned long va,
+				 int base_pshift, int actual_pshift,
+				 unsigned long b)
+{
+	unsigned long mask, hash_incr, num, i;
+	struct kvm_hpt_info *hpt = &gp->shadow_hpt;
+	__be64 *hptep;
+	unsigned long hash, v, v_mask, v_match, r, r_mask, r_match;
+
+	hash = kvmppc_hv_get_hash_value_va(hpt, va, base_pshift, b);
+
+	/*
+	 * The virtual address provided to us in the rb register for tlbie is
+	 * bits 14:77 of the virtual address, however we support a 68 bit
+	 * virtual address on P9. This means that we actually need bits 10:77 of
+	 * the virtual address to calculate all possible hash values for a 68
+	 * bit virtual address space. This means that dependent on the size of
+	 * the hpt (and thus the number of hash bits we actually use to find
+	 * the pteg index) we might have to search up to 16 ptegs (1TB segs) or
+	 * 8 ptegs (256M segs) for a match.
+	 */
+	if (b & HPTE_R_B_1T) {		/* 1T segment */
+		/*
+		 * The hash when using 1T segments uses bits 0:37 of the VA.
+		 * Thus to cover the missing bits of the VA (bits 0:13) we need
+		 * to zero any of these bits being used (as determined by
+		 * kvmppc_hpt_mask()) and then search all possible values.
+		 */
+		hash_incr = 1UL << 24;
+		mask = (0x3ffUL << 24) & kvmppc_hpt_mask(hpt);
+		hash &= ~mask;
+		num = mask >> 24;
+	} else {			/* 256M segment */
+		/*
+		 * The hash when using 256M segments uses bits 11:49 of the VA.
+		 * Thus to cover the missing bits of the VA (bits 11:13) we need
+		 * to zero any of these bits being used (as determined by
+		 * kvmppc_hpt_mask()) and then search all possible values.
+		 */
+		hash_incr = 1UL << 36;
+		mask = (0x7UL << 36) & kvmppc_hpt_mask(hpt);
+		hash &= ~mask;
+		num = mask >> 36;
+	}
+
+	/* Calculate what we're going to match the hpte on */
+	v_match = va >> 16;		/* Align va to ava in the hpte */
+	if (base_pshift >= 24)
+		v_match &= ~((1UL << (base_pshift - 16)) - 1);
+	else
+		v_match &= ~0x7fUL;
+	if (actual_pshift > 12)
+		v_match |= HPTE_V_LARGE;
+	r_match = b;
+	/* We don't have the top 4 bits of the ava to match on */
+	v_mask = (TLBIE_RB_AVA_4K >> 16) & HPTE_V_AVPN_3_0;
+	v_mask |= HPTE_V_LARGE | HPTE_V_SECONDARY;
+	r_mask = HPTE_R_B;
+
+	/* Iterate through the ptegs which we have to search */
+	for (i = 0; i <= num; i++, hash += hash_incr) {
+		unsigned long pteg_addr = hash << 7;
+		v_match &= ~HPTE_V_SECONDARY;
+
+		/* Try both the primary and the secondary hash */
+		while (true) {
+			int j;
+			hptep = (__be64 *)(hpt->virt + pteg_addr);
+
+			/* There are 8 entries in the pteg to search */
+			for (j = 0; j < 16; j += 2) {
+				preempt_disable();
+				/* Lock the pte */
+				while (!try_lock_hpte(&hptep[j], HPTE_V_HVLOCK))
+					cpu_relax();
+				v = be64_to_cpu(hptep[j]) & ~HPTE_V_HVLOCK;
+				r = be64_to_cpu(hptep[j + 1]);
+
+				/*
+				 * Check for a match under the lock
+				 * NOTE: the entry might be valid or absent
+				 */
+				if ((v & (HPTE_V_VALID | HPTE_V_ABSENT)) &&
+				    !((v ^ v_match) & v_mask) &&
+				    !((r ^ r_match) & r_mask) &&
+				    (kvmppc_hpte_base_page_shift(v, r) ==
+				     base_pshift) &&
+				    (kvmppc_hpte_actual_page_shift(v, r) ==
+				     actual_pshift))
+					kvmhv_invalidate_shadow_pte_hash(hpt,
+						gp->shadow_lpid, &hptep[j],
+						(pteg_addr >> 4) + (j >> 1));
+				else
+					__unlock_hpte(&hptep[j], v);
+				preempt_enable();
+				/*
+				 * In theory there is a 1-to-1 mapping between
+				 * entries in the L1 hpt and our shadow hpt,
+				 * however since L1 can't exactly specify a
+				 * hpte (since we're missing some va bits) we
+				 * must invalidate any match which we find and
+				 * continue the search.
+				 */
+			}
+
+			if (v_match & HPTE_V_SECONDARY)
+				break;
+			/* try the secondary hash */
+			v_match |= HPTE_V_SECONDARY;
+			pteg_addr ^= (kvmppc_hpt_mask(hpt) << 7);
+		}
+	}
+}
+
 static inline int get_ric(unsigned int instr)
 {
 	return (instr >> 18) & 0x3;
@@ -1331,44 +1488,82 @@ static inline long get_epn(unsigned long r_val)
 	return r_val >> 12;
 }
 
+/* SLB[lp] encodings for base page shifts */
+static int slb_base_page_shift[4] = {
+	24,	/* 16M */
+	16,	/* 64k */
+	34,	/* 16G */
+	20,	/* 1M, unsupported */
+};
+
 static int kvmhv_emulate_tlbie_tlb_addr(struct kvm_vcpu *vcpu, int lpid,
-					int ap, long epn)
+					bool radix, unsigned long rbval)
 {
 	struct kvm *kvm = vcpu->kvm;
 	struct kvm_nested_guest *gp;
-	long npages;
-	int shift, shadow_shift;
-	unsigned long addr;
 	int rc = 0;
 
-	shift = ap_to_shift(ap);
-	addr = epn << 12;
-	if (shift < 0)
-		/* Invalid ap encoding */
-		return -EINVAL;
-
-	addr &= ~((1UL << shift) - 1);
-	npages = 1UL << (shift - PAGE_SHIFT);
-
 	gp = kvmhv_get_nested(kvm, lpid, false);
 	if (!gp) /* No such guest -> nothing to do */
 		return 0;
 	mutex_lock(&gp->tlb_lock);
 
-	/* XXX TODO hpt */
-	if (!gp->radix) {
-		rc = -EINVAL;
-		goto out_unlock;
-	}
+	if (radix) {			/* Radix Invalidation */
+		int shift, shadow_shift;
+		unsigned long addr;
+		long npages;
 
-	/* There may be more than one host page backing this single guest pte */
-	do {
-		kvmhv_invalidate_shadow_pte_radix(vcpu, gp, addr,
-						  &shadow_shift);
+		/* Radix invalidation but this is a hpt guest, nothing to do */
+		if (!gp->radix)
+			goto out_unlock;
+
+		shift = ap_to_shift(get_ap(rbval));
+		addr = get_epn(rbval) << 12;
+		if (shift < 0) {	/* Invalid ap encoding */
+			rc = -EINVAL;
+			goto out_unlock;
+		}
+
+		addr &= ~((1UL << shift) - 1);
+		npages = 1UL << (shift - PAGE_SHIFT);
+		/* There may be more than one host page backing this single guest pte */
+		do {
+			kvmhv_invalidate_shadow_pte_radix(vcpu, gp, addr,
+							  &shadow_shift);
+
+			npages -= 1UL << (shadow_shift - PAGE_SHIFT);
+			addr += 1UL << shadow_shift;
+		} while (npages > 0);
+	} else {			/* Hash Invalidation */
+		int base_pshift = 12, actual_pshift = 12;
+		unsigned long ava, b = (rbval & TLBIE_RB_B) << TLBIE_RB_B_SHIFT;
+
+		/* HPT invalidation but this is a radix guest, nothing to do */
+		if (gp->radix)
+			goto out_unlock;
+
+		/* Decode the rbval into ava, b, and base and actual pshifts */
+		if (rbval & TLBIE_RB_L) {	/* large base page size */
+			unsigned long lp = rbval & TLBIE_RB_LP;
+			ava = (rbval & TLBIE_RB_AVA_L) |
+			      ((rbval & TLBIE_RB_AVAL) << TLBIE_RB_AVAL_SHIFT);
+
+			/* base and actual page size encoded in lp field */
+			base_pshift = kvmppc_hpte_base_page_shift(HPTE_V_LARGE,
+								  lp);
+			actual_pshift = kvmppc_hpte_actual_page_shift(HPTE_V_LARGE,
+								      lp);
+		} else {			/* !large base page size */
+			int ap = get_ap(rbval);
+			ava = rbval & TLBIE_RB_AVA_4K;
+
+			/* actual page size encoded in ap field */
+			if (ap & 0x4)
+				actual_pshift = slb_base_page_shift[ap & 0x3];
+		}
 
-		npages -= 1UL << (shadow_shift - PAGE_SHIFT);
-		addr += 1UL << shadow_shift;
-	} while (npages > 0);
+		kvmhv_tlbie_hpt_addr(gp, ava, base_pshift, actual_pshift, b);
+	}
 
 out_unlock:
 	mutex_unlock(&gp->tlb_lock);
@@ -1381,16 +1576,11 @@ static void kvmhv_emulate_tlbie_lpid(struct kvm_vcpu *vcpu,
 {
 	struct kvm *kvm = vcpu->kvm;
 
-	/* XXX TODO hpt */
 	mutex_lock(&gp->tlb_lock);
 	switch (ric) {
 	case 0:
 		/* Invalidate TLB */
-		spin_lock(&kvm->mmu_lock);
-		kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable,
-					  gp->shadow_lpid);
-		spin_unlock(&kvm->mmu_lock);
-		kvmhv_flush_lpid(gp->shadow_lpid, gp->radix);
+		kvmhv_flush_nested(kvm, gp, false);
 		break;
 	case 1:
 		/*
@@ -1400,7 +1590,7 @@ static void kvmhv_emulate_tlbie_lpid(struct kvm_vcpu *vcpu,
 		break;
 	case 2:
 		/* Invalidate TLB, PWC and caching of partition table entries */
-		kvmhv_flush_nested(gp);
+		kvmhv_flush_nested(kvm, gp, true);
 		break;
 	default:
 		break;
@@ -1431,9 +1621,8 @@ static int kvmhv_emulate_priv_tlbie(struct kvm_vcpu *vcpu, unsigned int instr,
 {
 	struct kvm *kvm = vcpu->kvm;
 	struct kvm_nested_guest *gp;
-	int r, ric, prs, is, ap;
+	int r, ric, prs, is;
 	int lpid;
-	long epn;
 	int ret = 0;
 
 	ric = get_ric(instr);
@@ -1444,14 +1633,28 @@ static int kvmhv_emulate_priv_tlbie(struct kvm_vcpu *vcpu, unsigned int instr,
 
 	/*
 	 * These cases are invalid and are not handled:
-	 * r != 1 -> Only radix supported
+	 *
+	 * Radix:
 	 * prs == 1 -> Not HV privileged
 	 * ric == 3 -> No cluster bombs for radix
 	 * is == 1 -> Partition scoped translations not associated with pid
 	 * (!is) && (ric == 1 || ric == 2) -> Not supported by ISA
+	 *
+	 * HPT:
+	 * prs == 1 && ric != 2 -> Only process scoped caching is the process table
+	 * ric == 1 -> No page walk cache for HPT
+	 * (!is) && ric == 2 -> Not supported by ISA
+	 * ric == 3 -> Although cluster bombs are technically
+	 *             supported for is == 0, their encoding is
+	 *             implementation specific and linux doesn't
+	 *             use them, so we don't handle them for now.
+	 * is == 1 -> HPT translations not associated with pid
 	 */
-	if ((!r) || (prs) || (ric == 3) || (is == 1) ||
-	    ((!is) && (ric == 1 || ric == 2)))
+	if (r && ((prs) || (ric == 3) || (is == 1) ||
+		  ((!is) && (ric == 1 || ric == 2))))
+		return -EINVAL;
+	else if (!r && ((prs && (ric != 2)) || (ric == 1) ||
+			(!is && (ric == 2)) || (is == 1) || (ric == 3)))
 		return -EINVAL;
 
 	switch (is) {
@@ -1460,9 +1663,7 @@ static int kvmhv_emulate_priv_tlbie(struct kvm_vcpu *vcpu, unsigned int instr,
 		/*
 		 * We know ric == 0
 		 * Invalidate TLB for a given target address
 		 */
-		epn = get_epn(rbval);
-		ap = get_ap(rbval);
-		ret = kvmhv_emulate_tlbie_tlb_addr(vcpu, lpid, ap, epn);
+		ret = kvmhv_emulate_tlbie_tlb_addr(vcpu, lpid, r, rbval);
 		break;
 	case 2:
 		/* Invalidate matching LPID */
-- 
2.13.6