On 08.09.2010, at 11:40, Liu Yu wrote: > Current guest TLB1 is mapped to host TLB1. > As host kernel only provides 4K uncontinuous pages, > we have to break guest large mapping into 4K shadow mappings. > These 4K shadow mappings are then mapped into host TLB1 on fly. > As host TLB1 only has 13 free entries, there's serious tlb miss. > > Since e500v2 has a big number of TLB0 entries, > it should be help to map those 4K shadow mappings to host TLB0. > To achieve this, we need to unlink guest tlb and host tlb, > So that guest TLB1 mappings can route to any host TLB0 entries freely. > > Pages/mappings are considerred in the same kind as host tlb entry. > This patch remove the link between pages and guest tlb entry to do the unlink. > And keep host_tlb0_ref in each vcpu to trace pages. > Then it's easy to map guest TLB1 to host TLB0. > > In guest ramdisk boot test(guest mainly uses TLB1), > with this patch, the tlb miss number get down 90%. > > Signed-off-by: Liu Yu <yu.liu@xxxxxxxxxxxxx> > --- > arch/powerpc/include/asm/kvm_e500.h | 7 +- > arch/powerpc/kvm/e500.c | 4 + > arch/powerpc/kvm/e500_tlb.c | 280 ++++++++++++----------------------- > arch/powerpc/kvm/e500_tlb.h | 1 + > 4 files changed, 104 insertions(+), 188 deletions(-) > > diff --git a/arch/powerpc/include/asm/kvm_e500.h b/arch/powerpc/include/asm/kvm_e500.h > index cb785f9..16c0ed0 100644 > --- a/arch/powerpc/include/asm/kvm_e500.h > +++ b/arch/powerpc/include/asm/kvm_e500.h > @@ -37,13 +37,10 @@ struct tlbe_ref { > struct kvmppc_vcpu_e500 { > /* Unmodified copy of the guest's TLB. */ > struct tlbe *guest_tlb[E500_TLB_NUM]; > - /* TLB that's actually used when the guest is running. */ > - struct tlbe *shadow_tlb[E500_TLB_NUM]; > - /* Pages which are referenced in the shadow TLB. */ > - struct tlbe_ref *shadow_refs[E500_TLB_NUM]; > + /* Pages which are referenced in host TLB. */ > + struct tlbe_ref *host_tlb0_ref; > > unsigned int guest_tlb_size[E500_TLB_NUM]; > - unsigned int shadow_tlb_size[E500_TLB_NUM]; > unsigned int guest_tlb_nv[E500_TLB_NUM]; > > u32 host_pid[E500_PID_NUM]; > diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c > index e8a00b0..14af6d7 100644 > --- a/arch/powerpc/kvm/e500.c > +++ b/arch/powerpc/kvm/e500.c > @@ -146,6 +146,10 @@ static int __init kvmppc_e500_init(void) > if (r) > return r; > > + r = kvmppc_e500_mmu_init(); > + if (r) > + return r; > + > /* copy extra E500 exception handlers */ > ivor[0] = mfspr(SPRN_IVOR32); > ivor[1] = mfspr(SPRN_IVOR33); > diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c > index 0b657af..a6c2320 100644 > --- a/arch/powerpc/kvm/e500_tlb.c > +++ b/arch/powerpc/kvm/e500_tlb.c > @@ -25,9 +25,15 @@ > #include "e500_tlb.h" > #include "trace.h" > > -#define to_htlb1_esel(esel) (tlb1_entry_num - (esel) - 1) > +static unsigned int host_tlb0_entry_num; > +static unsigned int host_tlb0_assoc; > +static unsigned int host_tlb0_assoc_bit; bits. > > -static unsigned int tlb1_entry_num; > +static inline unsigned int get_tlb0_entry_offset(u32 eaddr, u32 esel) > +{ > + return ((eaddr & 0x7F000) >> (12 - host_tlb0_assoc_bit) | > + (esel & (host_tlb0_assoc - 1))); > +} > > void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu) > { > @@ -62,11 +68,6 @@ static inline unsigned int tlb0_get_next_victim( > return victim; > } > > -static inline unsigned int tlb1_max_shadow_size(void) > -{ > - return tlb1_entry_num - tlbcam_index; > -} > - > static inline int tlbe_is_writable(struct tlbe *tlbe) > { > return tlbe->mas3 & (MAS3_SW|MAS3_UW); > @@ -100,7 +101,7 @@ static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode) > /* > * writing shadow tlb entry to host TLB > */ > -static inline void __write_host_tlbe(struct tlbe *stlbe) > +static inline void __host_tlbe_write(struct tlbe *stlbe) > { > mtspr(SPRN_MAS1, stlbe->mas1); > mtspr(SPRN_MAS2, stlbe->mas2); > @@ -109,25 +110,22 @@ static inline void __write_host_tlbe(struct tlbe *stlbe) > __asm__ __volatile__ ("tlbwe\n" : : ); > } > > -static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500, > - int tlbsel, int esel, struct tlbe *stlbe) > +static inline u32 host_tlb0_write(struct kvmppc_vcpu_e500 *vcpu_e500, > + u32 gvaddr, struct tlbe *stlbe) > { > - local_irq_disable(); > - if (tlbsel == 0) { > - __write_host_tlbe(stlbe); > - } else { > - unsigned register mas0; > + unsigned register mas0; > > - mas0 = mfspr(SPRN_MAS0); > + local_irq_disable(); Do you have to disable interrupts for a tlb write? If you get preempted before the write, the entry you overwrite could be different. But you don't protect against that either way. And if you get preempted afterwards, you could lose the entry. But since you enable interrupts beyond that, that could happen either way too. So what's the reason for the disable here? > > - mtspr(SPRN_MAS0, MAS0_TLBSEL(1) | MAS0_ESEL(to_htlb1_esel(esel))); > - __write_host_tlbe(stlbe); > + mas0 = mfspr(SPRN_MAS0); > + __host_tlbe_write(stlbe); > > - mtspr(SPRN_MAS0, mas0); > - } > local_irq_enable(); > - trace_kvm_stlb_write(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2, > + > + trace_kvm_stlb_write(mas0, stlbe->mas1, stlbe->mas2, > stlbe->mas3, stlbe->mas7); > + > + return (mas0 & 0x0FFF0000) >> 16; > } > > void kvmppc_e500_tlb_load(struct kvm_vcpu *vcpu, int cpu) > @@ -172,41 +170,17 @@ static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500, > return -1; > } > > -static void kvmppc_e500_shadow_release(struct kvmppc_vcpu_e500 *vcpu_e500, > - int stlbsel, int sesel) > +static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref) > { > - struct tlbe_ref *ref; > - struct page *page; > - > - ref = &vcpu_e500->shadow_refs[stlbsel][sesel]; > - page = ref->page; > - > - if (page) { > - ref->page = NULL; > + struct page *pg = ref->page; > > - if (get_tlb_v(ref->gtlbe)) { > - if (tlbe_is_writable(ref->gtlbe)) > - kvm_release_page_dirty(page); > - else > - kvm_release_page_clean(page); > - } > + if (pg) { > + if (get_tlb_v(ref->gtlbe) && tlbe_is_writable(ref->gtlbe)) > + kvm_release_page_dirty(pg); > + else > + kvm_release_page_clean(pg); > } > -} > > -static void kvmppc_e500_tlb1_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500, > - int esel) > -{ > - struct tlbe stlbe; > - unsigned int i; > - > - stlbe.mas1 = 0; > - for (i = 0; i < KVM_E500_TLB1_SIZE; i++) { > - struct tlbe_ref *ref = > - &vcpu_e500->shadow_refs[1][i]; > - > - if (ref->gtlbe == &vcpu_e500->guest_tlb[1][esel]) > - write_host_tlbe(vcpu_e500, 1, i, &stlbe); > - } > } > > static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, > @@ -236,77 +210,6 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, > vcpu_e500->mas7 = 0; > } > > -static inline void kvmppc_e500_setup_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500, > - struct tlbe *gtlbe, struct tlbe_ref *ref, > - u64 gvaddr, struct tlbe *stlbe) > -{ > - hpa_t hpaddr = page_to_phys(ref->page); > - > - /* Force TS=1 IPROT=0 TSIZE=4KB for all guest mappings. */ > - stlbe->mas1 = MAS1_TSIZE(BOOK3E_PAGESZ_4K) > - | MAS1_TID(get_tlb_tid(gtlbe)) | MAS1_TS | MAS1_VALID; > - stlbe->mas2 = (gvaddr & MAS2_EPN) > - | e500_shadow_mas2_attrib(gtlbe->mas2, > - vcpu_e500->vcpu.arch.msr & MSR_PR); > - stlbe->mas3 = (hpaddr & MAS3_RPN) > - | e500_shadow_mas3_attrib(gtlbe->mas3, > - vcpu_e500->vcpu.arch.msr & MSR_PR); > - stlbe->mas7 = (hpaddr >> 32) & MAS7_RPN; > -} > - > -static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, > - gfn_t gfn, struct tlbe *gtlbe, int stlbsel, int sesel) > -{ > - struct page *new_page; > - struct tlbe_ref *ref; > - > - /* Get reference to new page. */ > - new_page = gfn_to_page(vcpu_e500->vcpu.kvm, gfn); > - if (is_error_page(new_page)) { > - printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", gfn); > - kvm_release_page_clean(new_page); > - return; > - } > - > - /* Drop reference to old page. */ > - kvmppc_e500_shadow_release(vcpu_e500, stlbsel, sesel); > - > - ref = &vcpu_e500->shadow_refs[stlbsel][sesel]; > - ref->page = new_page; > - ref->gtlbe = gtlbe; > -} > - > -/* XXX only map the one-one case, for now use TLB0 */ > -static int kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500, int esel) > -{ > - struct tlbe *gtlbe; > - > - gtlbe = &vcpu_e500->guest_tlb[0][esel]; > - > - kvmppc_e500_shadow_map(vcpu_e500, get_tlb_raddr(gtlbe) >> PAGE_SHIFT, > - gtlbe, 0, esel); > - > - return esel; > -} > - > -/* Caller must ensure that the specified guest TLB entry is safe to insert into > - * the shadow TLB. */ > -/* XXX for both one-one and one-to-many , for now use TLB1 */ > -static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500, > - u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe) > -{ > - unsigned int victim; > - > - victim = vcpu_e500->guest_tlb_nv[1]++; > - > - if (unlikely(vcpu_e500->guest_tlb_nv[1] >= tlb1_max_shadow_size())) > - vcpu_e500->guest_tlb_nv[1] = 0; > - > - kvmppc_e500_shadow_map(vcpu_e500, gfn, gtlbe, 1, victim); > - > - return victim; > -} > - > /* Invalidate all guest kernel mappings when enter usermode, > * so that when they fault back in they will get the > * proper permission bits. */ > @@ -445,6 +348,52 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb) > return EMULATE_DONE; > } > > +static int kvmppc_e500_mmu_map_tlb0(struct kvmppc_vcpu_e500 *vcpu_e500, > + struct tlbe *gtlbe, > + gfn_t gfn, > + u64 gvaddr) > +{ > + struct page *new_page; > + struct tlbe_ref *ref; > + u32 esel; > + hpa_t hpaddr; > + struct tlbe stlbe; > + > + /* Find related page. */ > + new_page = gfn_to_page(vcpu_e500->vcpu.kvm, gfn); > + if (is_error_page(new_page)) { > + printk(KERN_ERR "Couldn't get guest page for gfn %llx!\n", gfn); > + kvm_release_page_clean(new_page); > + return -1; > + } > + > + hpaddr = page_to_phys(new_page); > + > + /* Setup the shadow tlb entry */ > + /* Force TS=1 IPROT=0 TSIZE=4KB for all mappings go to TLB0. */ > + stlbe.mas1 = MAS1_TSIZE(BOOK3E_PAGESZ_4K) > + | MAS1_TID(get_tlb_tid(gtlbe)) | MAS1_TS | MAS1_VALID; > + stlbe.mas2 = (gvaddr & MAS2_EPN) > + | e500_shadow_mas2_attrib(gtlbe->mas2, > + vcpu_e500->vcpu.arch.msr & MSR_PR); > + stlbe.mas3 = (hpaddr & MAS3_RPN) > + | e500_shadow_mas3_attrib(gtlbe->mas3, > + vcpu_e500->vcpu.arch.msr & MSR_PR); > + stlbe.mas7 = (hpaddr >> 32) & MAS7_RPN; > + > + esel = host_tlb0_write(vcpu_e500, gvaddr, &stlbe); > + > + ref = &vcpu_e500->host_tlb0_ref[get_tlb0_entry_offset(gvaddr, esel)]; > + > + /* Release old page and setup new page */ > + kvmppc_e500_ref_release(ref); > + > + ref->page = new_page; > + ref->gtlbe = gtlbe; > + > + return 0; > +} > + > int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) > { > struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); > @@ -457,7 +406,7 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) > gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel]; > > if (get_tlb_v(gtlbe) && tlbsel == 1) > - kvmppc_e500_tlb1_invalidate(vcpu_e500, esel); > + _tlbil_all(); > > gtlbe->mas1 = vcpu_e500->mas1; > gtlbe->mas2 = vcpu_e500->mas2; > @@ -469,27 +418,15 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) > > /* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */ > if (tlbe_is_host_safe(vcpu, gtlbe)) { > - struct tlbe stlbe; > - int stlbsel, sesel; > - struct tlbe_ref *ref; > - u64 eaddr; > - > switch (tlbsel) { > case 0: > /* TLB0 */ > gtlbe->mas1 &= ~MAS1_TSIZE(~0); > gtlbe->mas1 |= MAS1_TSIZE(BOOK3E_PAGESZ_4K); > > - eaddr = get_tlb_eaddr(gtlbe); > - stlbsel = 0; > - sesel = kvmppc_e500_tlb0_map(vcpu_e500, esel); > - > - ref = &vcpu_e500->shadow_refs[stlbsel][sesel]; > - > - kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, > - ref, eaddr, &stlbe); > - write_host_tlbe(vcpu_e500, stlbsel, sesel, &stlbe); > - > + kvmppc_e500_mmu_map_tlb0(vcpu_e500, gtlbe, > + get_tlb_raddr(gtlbe) >> PAGE_SHIFT, > + get_tlb_eaddr(gtlbe)); > break; > > case 1: > @@ -552,37 +489,15 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr, > unsigned int index) > { > struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); > - struct tlbe_ref *ref; > - struct tlbe *gtlbe, stlbe; > + struct tlbe *gtlbe; > int tlbsel = tlbsel_of(index); > int esel = esel_of(index); > - int stlbsel, sesel; > + gfn_t gfn; > > gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel]; > + gfn = gpaddr >> PAGE_SHIFT; > > - switch (tlbsel) { > - case 0: > - stlbsel = 0; > - sesel = esel; > - break; > - > - case 1: { > - gfn_t gfn = gpaddr >> PAGE_SHIFT; > - > - stlbsel = 1; > - sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr, gfn, gtlbe); > - break; > - } > - > - default: > - BUG(); > - break; > - } > - > - ref = &vcpu_e500->shadow_refs[stlbsel][sesel]; > - > - kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, ref, eaddr, &stlbe); > - write_host_tlbe(vcpu_e500, stlbsel, sesel, &stlbe); > + kvmppc_e500_mmu_map_tlb0(vcpu_e500, gtlbe, gfn, eaddr); > } > > int kvmppc_e500_tlb_search(struct kvm_vcpu *vcpu, > @@ -621,8 +536,6 @@ void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500) > > int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) > { > - tlb1_entry_num = mfspr(SPRN_TLB1CFG) & 0xFFF; > - > vcpu_e500->guest_tlb_size[0] = KVM_E500_TLB0_SIZE; > vcpu_e500->guest_tlb[0] = > kzalloc(sizeof(struct tlbe) * KVM_E500_TLB0_SIZE, GFP_KERNEL); > @@ -635,16 +548,10 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) > if (vcpu_e500->guest_tlb[1] == NULL) > goto err_out_guest0; > > - vcpu_e500->shadow_refs[0] = (struct tlbe_ref *) > - kzalloc(sizeof(struct tlbe_ref) * KVM_E500_TLB0_SIZE, GFP_KERNEL); > - if (vcpu_e500->shadow_refs[0] == NULL) > + vcpu_e500->host_tlb0_ref = kzalloc(sizeof(struct tlbe_ref) * host_tlb0_entry_num, GFP_KERNEL); > + if (vcpu_e500->host_tlb0_ref == NULL) > goto err_out_guest1; > > - vcpu_e500->shadow_refs[1] = (struct tlbe_ref *) > - kzalloc(sizeof(struct tlbe_ref) * KVM_E500_TLB1_SIZE, GFP_KERNEL); > - if (vcpu_e500->shadow_refs[1] == NULL) > - goto err_out_ref0; > - > /* Init TLB configuration register */ > vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) & ~0xfffUL; > vcpu_e500->tlb0cfg |= vcpu_e500->guest_tlb_size[0]; > @@ -653,8 +560,6 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) > > return 0; > > -err_out_ref0: > - kfree(vcpu_e500->shadow_refs[0]); > err_out_guest1: > kfree(vcpu_e500->guest_tlb[1]); > err_out_guest0: > @@ -665,18 +570,27 @@ err_out: > > void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500) > { > - int stlbsel, i; > + int i; > > /* release all pages */ > - for (stlbsel = 0; stlbsel < 2; stlbsel++) > - for (i = 0; i < vcpu_e500->guest_tlb_size[stlbsel]; i++) > - kvmppc_e500_shadow_release(vcpu_e500, stlbsel, i); > + for (i = 0; i < host_tlb0_entry_num; i++) { > + struct tlbe_ref *ref = &vcpu_e500->host_tlb0_ref[i]; > + > + kvmppc_e500_ref_release(ref); > + } > > /* discard all guest mapping */ > _tlbil_all(); > > - kfree(vcpu_e500->shadow_refs[1]); > - kfree(vcpu_e500->shadow_refs[0]); > kfree(vcpu_e500->guest_tlb[1]); > kfree(vcpu_e500->guest_tlb[0]); > } > + > +int kvmppc_e500_mmu_init(void) > +{ > + host_tlb0_entry_num = mfspr(SPRN_TLB0CFG) & 0xFFF; > + host_tlb0_assoc = (mfspr(SPRN_TLB0CFG) >> 24) & 0xFF; > + for (; host_tlb0_assoc >> (host_tlb0_assoc_bit + 1); host_tlb0_assoc_bit++); Ugh. While loop please. The rest looks good from what I can tell - not being a BookE MMU expert that is. I'm still worried about the userspace switch which results in a complete MMU flush. Couldn't we just allocate a new PID for that? Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html