Re: [PATCH 2/4] add e500 tlb implementation

Liu Yu wrote:
Signed-off-by: Liu Yu <yu.liu@xxxxxxxxxxxxx>
---
 arch/powerpc/kvm/e500_tlb.c |  638 +++++++++++++++++++++++++++++++++++++++++++
 arch/powerpc/kvm/e500_tlb.h |  172 ++++++++++++
 2 files changed, 810 insertions(+), 0 deletions(-)
 create mode 100644 arch/powerpc/kvm/e500_tlb.c
 create mode 100644 arch/powerpc/kvm/e500_tlb.h

diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
new file mode 100644
index 0000000..e89a952
--- /dev/null
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -0,0 +1,638 @@
+/*
+ * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved.
+ *
+ * Author: Yu Liu, yu.liu@xxxxxxxxxxxxx, Aug, 2008
+ *
+ * Description:
+ * This file is based on arch/powerpc/kvm/44x_tlb.c,
+ * by Hollis Blanchard <hollisb@xxxxxxxxxx>.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <asm/kvm_ppc.h>
+
+#include "e500_tlb.h"
+#include "inst.h"
+
+extern unsigned int tlbcam_index;
+extern unsigned int num_tlbcam_entries;
+
+#define get_tlbe(tlb, index) ((struct tlbe *)((tlb).base) + (index))
+
+void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu)
+{
+	struct tlbe *tlbe;
+	int i;
+
+	printk("| %2s | %3s | %8s | %8s | %8s |\n",
+			"nr", "mas1", "mas2", "mas3", "mas7");
+
+	printk("Guest TLB1:\n");
+	for (i = 0; i < KVM_FSL_BOOK_TLB1_SIZE; i++) {
+		tlbe = get_tlbe(vcpu->arch.guest_tlb[1], i);
+		if (tlbe->mas1 & MAS1_VALID)
+			printk(" G1%2d |  %08X | %08X | %08X | %08X |\n",
+			       i, tlbe->mas1, tlbe->mas2, tlbe->mas3,
+			       tlbe->mas7);
+	}
+	printk("Shadow TLB1:\n");
+	for (i = 0; i < KVM_FSL_BOOK_TLB1_SIZE; i++) {
+		tlbe = get_tlbe(vcpu->arch.shadow_tlb[1], i);
+		if (tlbe->mas1 & MAS1_VALID)
+			printk(" S1%2d |  %08X | %08X | %08X | %08X |\n",
+			       i, tlbe->mas1, tlbe->mas2, tlbe->mas3,
+			       tlbe->mas7);
+	}
+
+	printk("Guest TLB0:\n");
+	for (i = 0; i < KVM_FSL_BOOK_TLB0_SIZE; i++) {
+		tlbe = get_tlbe(vcpu->arch.guest_tlb[0], i);
+		if (tlbe->mas1 & MAS1_VALID)
+			printk(" G0%2d |  %08X | %08X | %08X | %08X |\n",
+			       i, tlbe->mas1, tlbe->mas2, tlbe->mas3,
+			       tlbe->mas7);
+	}
+	printk("Shadow TLB0:\n");
+	for (i = 0; i < KVM_FSL_BOOK_TLB0_SIZE; i++) {
+		tlbe = get_tlbe(vcpu->arch.shadow_tlb[0], i);
+		if (tlbe->mas1 & MAS1_VALID)
+			printk(" S0%2d |  %08X | %08X | %08X | %08X |\n",
+			       i, tlbe->mas1, tlbe->mas2, tlbe->mas3,
+			       tlbe->mas7);
+	}
+}

Not the most important function, but you might consider merging the TLB1 and TLB0 variants using an array index, like:

+	printk("Guest TLB%d:\n", tlbidx);
+	for (i = 0; i < KVM_FSL_BOOK_TLB_SIZES[tlbidx]; i++)

with everything else staying the same.
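A fuller sketch of what I mean (KVM_FSL_BOOK_TLB_SIZES and dump_one_tlb are made-up names, the sizes would need to go into such an array first):

static const unsigned int KVM_FSL_BOOK_TLB_SIZES[2] = {
	KVM_FSL_BOOK_TLB0_SIZE,
	KVM_FSL_BOOK_TLB1_SIZE,
};

static void dump_one_tlb(struct kvm_vcpu *vcpu, int guest, int tlbidx)
{
	struct tlbe *tlbe;
	int i;

	printk("%s TLB%d:\n", guest ? "Guest" : "Shadow", tlbidx);
	for (i = 0; i < KVM_FSL_BOOK_TLB_SIZES[tlbidx]; i++) {
		tlbe = guest ? get_tlbe(vcpu->arch.guest_tlb[tlbidx], i)
			     : get_tlbe(vcpu->arch.shadow_tlb[tlbidx], i);
		if (tlbe->mas1 & MAS1_VALID)
			printk(" %c%d%2d |  %08X | %08X | %08X | %08X |\n",
			       guest ? 'G' : 'S', tlbidx, i,
			       tlbe->mas1, tlbe->mas2, tlbe->mas3,
			       tlbe->mas7);
	}
}

called as dump_one_tlb(vcpu, 1, 1), dump_one_tlb(vcpu, 0, 1) and so on.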
And additionally - is the order 1->0 intended?
+static unsigned int kvmppc_tlb1_pos;
+static unsigned int kvmppc_tlb1_bottom;
+static unsigned int kvmppc_tlb_nv[2];
+
+static inline unsigned int tlb_get_next_victim(struct kvm_vcpu *vcpu,
+			int tlbsel)
+{
+	unsigned int victim;
+	victim = kvmppc_tlb_nv[tlbsel]++;
+	if (kvmppc_tlb_nv[tlbsel] >= vcpu->arch.guest_tlb[tlbsel].size)
+		kvmppc_tlb_nv[tlbsel] = 0;
+
+	return victim;
+}
+
+static inline unsigned int tlb_cur_next_victim(struct kvm_vcpu *vcpu,
+			int tlbsel)
+{
+	return kvmppc_tlb_nv[tlbsel];
+}
+
+static inline u32 tlb_shadow_mas3_attrib(u32 mas3, int usermode)
+{
+	/* Mask off reserved bits. */
+	mas3 &= MAS3_ATTRIB_MASK;
+
+	if (!usermode) {
+		/* Guest is in supervisor mode,
+		 * so we need to translate guest
+		 * supervisor permissions into user permissions. */
+		mas3 &= ~E500_TLB_USER_PERM_MASK;
+		mas3 |= (mas3 & E500_TLB_SUPER_PERM_MASK) << 1;
+	}
Doesn't that allow your guest userspace to access these regions if it knows the address? Hmm - I need to fetch an e500 spec to check the purpose of all those bits.

I would recommend passing a "struct kvm_vcpu *" and reading the problem-state (PR) bit inside this function instead of passing it in. Some day you will need something other than just a userspace yes/no, and then you can keep the interface.
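Just a sketch of the suggested interface (assuming vcpu->arch.msr stays the authoritative place for the PR bit):

static inline u32 tlb_shadow_mas3_attrib(u32 mas3, struct kvm_vcpu *vcpu)
{
	int usermode = !!(vcpu->arch.msr & MSR_PR);

	/* Mask off reserved bits. */
	mas3 &= MAS3_ATTRIB_MASK;

	if (!usermode) {
		/* Guest is in supervisor mode, so translate guest
		 * supervisor permissions into user permissions. */
		mas3 &= ~E500_TLB_USER_PERM_MASK;
		mas3 |= (mas3 & E500_TLB_SUPER_PERM_MASK) << 1;
	}

	return mas3 | E500_TLB_SUPER_PERM_MASK;
}

The callers in shadow_map() then just pass vcpu instead of vcpu->arch.msr & MSR_PR.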

+
+	return mas3 | E500_TLB_SUPER_PERM_MASK;
+}
+
+static inline u32 tlb_shadow_mas2_attrib(u32 mas2, int usermode)
+{
+	return mas2 & MAS2_ATTRIB_MASK;
+}

Hmm, the naming of both functions confuses me.
I don't think there is such a thing as "to attrib" something, so I expect the intent is "get me the virtualization-modified mas[23] attrib value, which is only part of that 32-bit value". Maybe there is no better name, but I would prefer something like "tlb_mask_mas2_attrib".

And btw - "int usermode" is not used in that function at all, so you might want to drop it.
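i.e. just:

static inline u32 tlb_mask_mas2_attrib(u32 mas2)
{
	return mas2 & MAS2_ATTRIB_MASK;
}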

+
+/* Search the guest TLB for a matching entry. */
+static int kvmppc_tlb_index(struct kvm_vcpu *vcpu, gva_t eaddr,
+		int tlbsel, unsigned int pid, int as)
+{
+	int i;
+
+	/* XXX Replace loop with fancy data structures. */
+	for (i = 0; i < vcpu->arch.guest_tlb[tlbsel].size; i++) {
+		struct tlbe *tlbe = get_tlbe(vcpu->arch.guest_tlb[tlbsel], i);
+		unsigned int tid;
+
+		if (eaddr < get_tlb_eaddr(tlbe))
+			continue;
+
+		if (eaddr > get_tlb_end(tlbe))
+			continue;
+
+		tid = get_tlb_tid(tlbe);
+		if (tid && (tid != pid))
+			continue;
+
+		if (!get_tlb_v(tlbe))
+			continue;
+
+		if (get_tlb_ts(tlbe) != as && as != -1)
+			continue;
+
+		return i;
+	}
+
+	return -1;
+}
+
+struct tlbe *kvmppc_tlb_search(struct kvm_vcpu *vcpu, gva_t eaddr,
+		unsigned int pid, int as)
+{
+	int esel, tlbsel;
+
+	for (tlbsel = 0; tlbsel < 2; tlbsel++) {
+		esel = kvmppc_tlb_index(vcpu, eaddr, tlbsel, pid, as);
+		if (esel >= 0)
+			return get_tlbe(vcpu->arch.guest_tlb[tlbsel], esel);
+	}
+
+	return NULL;
+}
+
+static int tlbe_is_writable(struct tlbe *tlbe)
+{
+	return tlbe->mas3 & (MAS3_SW|MAS3_UW);
+}
+
+/*
+ * just for writing host TLB0 currently
+ */
+static void host_tlb_write_entry(int tlbsel, struct tlbe *stlbe)
+{
+	mtspr(SPRN_MAS1, stlbe->mas1);
+	mtspr(SPRN_MAS2, stlbe->mas2);
+	mtspr(SPRN_MAS3, stlbe->mas3);
+	mtspr(SPRN_MAS7, stlbe->mas7);
+	__asm__ __volatile__ ("tlbwe\n" : : );
+}

I expect this not to be irq safe: if e.g. a dtlb miss occurs between the mtspr's, the miss handler can clobber the MAS registers. You might disable interrupts in this function, or at least state "must be called with interrupts disabled" in the comment.
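An untested sketch of making it self-contained:

static void host_tlb_write_entry(int tlbsel, struct tlbe *stlbe)
{
	unsigned long flags;

	local_irq_save(flags);
	mtspr(SPRN_MAS1, stlbe->mas1);
	mtspr(SPRN_MAS2, stlbe->mas2);
	mtspr(SPRN_MAS3, stlbe->mas3);
	mtspr(SPRN_MAS7, stlbe->mas7);
	__asm__ __volatile__ ("tlbwe\n" : : );
	local_irq_restore(flags);
}

(this only guards against interrupt handlers clobbering the MAS registers; a tlb miss taken on the stlbe access itself would still be a problem, but stlbe should already be mapped at that point)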


+
+/* Must be called with mmap_sem locked for writing. */
+static void kvmppc_shadow_release(struct kvm_vcpu *vcpu,
+		int tlbsel, int esel)
+{
+	struct tlbe *stlbe = get_tlbe(vcpu->arch.shadow_tlb[tlbsel], esel);
+	struct page *page = vcpu->arch.shadow_pages[tlbsel][esel];
+
+	if (page) {
+		vcpu->arch.shadow_pages[tlbsel][esel] = NULL;
+
+		if (get_tlb_v(stlbe)) {
+			if (tlbe_is_writable(stlbe))
+				kvm_release_page_dirty(page);
+			else
+				kvm_release_page_clean(page);
+		}
+	}
+}
+
+/* Must be called with mmap_sem locked for writing. */
+static void kvmppc_tlbe_invalidate(struct kvm_vcpu *vcpu,
+		int tlbsel, int esel)
+{
+	kvmppc_shadow_release(vcpu, tlbsel, esel);
+	get_tlbe(vcpu->arch.shadow_tlb[tlbsel], esel)->mas1 = 0;
+}
+
+static inline void shadow_map(struct kvm_vcpu *vcpu, u64 gvaddr,
+		gfn_t gfn, struct tlbe *gtlbe, int tlbsel, int esel)
+{
+	struct page *new_page;
+	struct tlbe *stlbe;
+	hpa_t hpaddr;
+
+	stlbe = get_tlbe(vcpu->arch.shadow_tlb[tlbsel], esel);
+
+	/* Get reference to new page. */
+	down_write(&current->mm->mmap_sem);
+	new_page = gfn_to_page(vcpu->kvm, gfn);
+	if (is_error_page(new_page)) {
+		printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", gfn);

You miss an up_write of current->mm->mmap_sem in this error case.
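i.e. (sketch):

	if (is_error_page(new_page)) {
		printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", gfn);
		kvm_release_page_clean(new_page);
		up_write(&current->mm->mmap_sem);
		return;
	}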


+		kvm_release_page_clean(new_page);
+		return;
+	}
+	hpaddr = page_to_phys(new_page);
+
+	/* Drop reference to old page. */
+	kvmppc_shadow_release(vcpu, tlbsel, esel);
+	up_write(&current->mm->mmap_sem);
+
+	vcpu->arch.shadow_pages[tlbsel][esel] = new_page;
+
+	/* Force TS=1 IPROT=0 TSIZE=4KB for all guest mappings. */
+	stlbe->mas1 = MAS1_TSIZE(BOOKE_PAGESZ_4K)
+		| MAS1_TID(get_tlb_tid(gtlbe)) | MAS1_TS | MAS1_VALID;
+	stlbe->mas2 = (gvaddr & MAS2_EPN)
+		| tlb_shadow_mas2_attrib(gtlbe->mas2, vcpu->arch.msr & MSR_PR);
As mentioned above, you might want to pass vcpu and read PR internally to keep the interface safe - except if you want to use it somewhere where it differs from the current vcpu->arch.msr state.
+	stlbe->mas3 = (hpaddr & MAS3_RPN)
+		| tlb_shadow_mas3_attrib(gtlbe->mas3, vcpu->arch.msr & MSR_PR);
+	stlbe->mas7 = (hpaddr >> 32) & MAS7_RPN;
+}
+
+/* XXX Map TLB0, just map the one-one case */
+void kvmppc_tlbe_map(struct kvm_vcpu *vcpu,
+		int tlbsel, int esel)
+{
+	struct tlbe *gtlbe;
+
+	gtlbe = get_tlbe(vcpu->arch.guest_tlb[tlbsel], esel);
+
+	shadow_map(vcpu, get_tlb_eaddr(gtlbe),
+			get_tlb_raddr(gtlbe) >> PAGE_SHIFT,
+			gtlbe, tlbsel, esel);
+}
+
+/* Caller must ensure that the specified guest TLB entry is safe to insert into
+ * the shadow TLB. */
+/* XXX Map TLB1, for both one-one and one-to-many */
+void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn,
+		struct tlbe *gtlbe)
+{
+	unsigned int victim;
+
+	victim = kvmppc_tlb1_pos++;
+	if (kvmppc_tlb1_pos >= vcpu->arch.shadow_tlb[1].size)
+		kvmppc_tlb1_pos = kvmppc_tlb1_bottom;
+
+	shadow_map(vcpu, gvaddr, gfn, gtlbe, 1, victim);
+}
+
+void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
+		gva_t eend, u32 tid)
+{
+	unsigned int pid = tid & 0xff;
+	unsigned int i;
+
+	/* XXX Replace loop with fancy data structures. */
+	down_write(&current->mm->mmap_sem);
+	for (i = 0; i < vcpu->arch.guest_tlb[1].size; i++) {
+		struct tlbe *stlbe =
+			get_tlbe(vcpu->arch.shadow_tlb[1], i);
+		unsigned int tid;
+
+		if (!get_tlb_v(stlbe))
+			continue;
+
+		if (eend < get_tlb_eaddr(stlbe))
+			continue;
+
+		if (eaddr > get_tlb_end(stlbe))
+			continue;
+
+		tid = get_tlb_tid(stlbe);
+		if (tid && (tid != pid))
+			continue;
+
+		kvmppc_tlbe_invalidate(vcpu, 1, i);
+	}
+	up_write(&current->mm->mmap_sem);
+}
+
+/* Invalidate all guest kernel mappings when enter usermode,
+ * so that when they fault back in they will get the
+ * proper permission bits. */
+void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode)
+{
+	int i;
+
+	if (usermode) {
+		/* XXX Replace loop with fancy data structures. */
+		down_write(&current->mm->mmap_sem);
+		for (i = 0; i < KVM_FSL_BOOK_TLB1_SIZE; i++)
+			kvmppc_tlbe_invalidate(vcpu, 1, i);
+		up_write(&current->mm->mmap_sem);
+		_tlbia();
+	}
+}
+
+void kvmppc_tlbe_set_modified(struct kvm_vcpu *vcpu, unsigned int i)
+{
+}
+
+/* Must be called with mmap_sem locked for writing. */
+static int guest_tlbe_invalidate(struct kvm_vcpu *vcpu, int tlbsel, int esel)
+{
+	struct tlbe *gtlbe = get_tlbe(vcpu->arch.guest_tlb[tlbsel], esel);
+
+	if (unlikely(get_tlb_iprot(gtlbe)))
+		return -1;
+
+	if (tlbsel == 1) {
+		kvmppc_mmu_invalidate(vcpu, get_tlb_eaddr(gtlbe),
+				get_tlb_end(gtlbe),
+				get_tlb_tid(gtlbe));
+	} else {
+		down_write(&current->mm->mmap_sem);
Your comment above mentions that mmap_sem needs to be locked for writing when calling this function.
If that holds, this down_write should deadlock, or at least be a needless double lock.
I expect either the comment or the code is wrong due to copy/paste.
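Assuming the comment is the one that's right, this branch would just become (sketch - the callers then have to take the lock):

	} else {
		/* caller already holds mmap_sem for writing */
		kvmppc_tlbe_invalidate(vcpu, tlbsel, esel);
	}

Note the tlbsel == 1 branch would need the same treatment, since kvmppc_mmu_invalidate() takes mmap_sem internally as well.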

+		kvmppc_tlbe_invalidate(vcpu, tlbsel, esel);
+		up_write(&current->mm->mmap_sem);
+	}
+
+	gtlbe->mas1 = 0;
+
+	return 0;
+}
+int kvmppc_emul_tlbivax(struct kvm_vcpu *vcpu, u32 inst)
+{
+	unsigned int ra, rb, ia;
+	int esel, tlbsel;
+	gva_t ea;
+
+	ra = get_ra(inst);
+	rb = get_rb(inst);
+	ea = ((ra) ? vcpu->arch.gpr[ra] : 0) + vcpu->arch.gpr[rb];
+
+	ia = (ea >> 2) & 0x1;
+	tlbsel = (ea >> 3) & 0x3;
+
+	if (ia) {
+		/* invalidate all entries */
+		for (esel = 0; esel < vcpu->arch.guest_tlb[tlbsel].size; esel++)
+			guest_tlbe_invalidate(vcpu, tlbsel, esel);
+	} else {
+		ea &= 0xfffff000;
+		esel = kvmppc_tlb_index(vcpu, ea, tlbsel,
+				vcpu->arch.pid, -1);
+		if (esel >= 0)
+			guest_tlbe_invalidate(vcpu, tlbsel, esel);
+	}
+
+	_tlbia();
+
+	return EMULATE_DONE;
+}
+
+int kvmppc_emul_tlbre(struct kvm_vcpu *vcpu, u32 inst)
+{
+	int tlbsel, esel;
+	struct tlbe *gtlbe;
+
+	tlbsel = get_tlb_tlbsel(vcpu);
+	esel = get_tlb_esel(vcpu);
+
+	if (esel >= vcpu->arch.guest_tlb[tlbsel].size) {
+		printk(KERN_ERR "%s: esel %d\n", __func__, esel);
+		kvmppc_dump_vcpu(vcpu);
+		return EMULATE_FAIL;
+	}
+
+	gtlbe = get_tlbe(vcpu->arch.guest_tlb[tlbsel], esel);
+	vcpu->arch.mas0 &= MAS0_NV(0);
+	vcpu->arch.mas0 |= MAS0_NV(tlb_cur_next_victim(vcpu, tlbsel));
+	vcpu->arch.mas1 = gtlbe->mas1;
+	vcpu->arch.mas2 = gtlbe->mas2;
+	vcpu->arch.mas3 = gtlbe->mas3;
+	vcpu->arch.mas7 = gtlbe->mas7;
+
+	return EMULATE_DONE;
+}
+
+int kvmppc_emul_tlbsx(struct kvm_vcpu *vcpu, u32 inst)
+{
+	int as = !!get_cur_sas(vcpu);
+	unsigned int pid = get_cur_spid(vcpu);
+	unsigned int rb;
+	int esel, tlbsel;
+	struct tlbe *gtlbe = NULL;
+	gva_t ea;
+
+	rb = get_rb(inst);
+
+	ea = vcpu->arch.gpr[rb];
+
+	for (tlbsel = 0; tlbsel < 2; tlbsel++) {
+		esel = kvmppc_tlb_index(vcpu, ea, tlbsel, pid, as);
+		if (esel >= 0) {
+			gtlbe = get_tlbe(vcpu->arch.guest_tlb[tlbsel], esel);
+			break;
+		}
+	}
+
+	if (gtlbe) {
+		vcpu->arch.mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(esel)
+			| MAS0_NV(get_tlb_nv(vcpu));
+		vcpu->arch.mas1 = gtlbe->mas1;
+		vcpu->arch.mas2 = gtlbe->mas2;
+		vcpu->arch.mas3 = gtlbe->mas3;
+		vcpu->arch.mas7 = gtlbe->mas7;
+	} else {
+		int esel;
+		unsigned int nv;
+
+		tlbsel = vcpu->arch.mas4 >> 28 & 0x3;
+		esel = tlb_get_next_victim(vcpu, tlbsel);
+		nv = tlb_cur_next_victim(vcpu, tlbsel);
+
+		vcpu->arch.mas0 = MAS0_TLBSEL(tlbsel)
+			| MAS0_ESEL(esel) | MAS0_NV(nv);
+		vcpu->arch.mas1 = (vcpu->arch.mas6 & MAS6_SPID0)
+			| (vcpu->arch.mas6 & (MAS6_SAS ? MAS1_TS : 0))
+			| (vcpu->arch.mas4 & MAS4_TSIZED(~0));
+		vcpu->arch.mas2 &= MAS2_EPN;
+		vcpu->arch.mas2 |= vcpu->arch.mas4 & MAS2_ATTRIB_MASK;
+		vcpu->arch.mas3 &= MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3;
+		vcpu->arch.mas7 = 0;
+	}
+
+	return EMULATE_DONE;
+}
+
+int kvmppc_emul_tlbwe(struct kvm_vcpu *vcpu, u32 inst)
+{
+	u64 eaddr;
+	u64 raddr;
+	u32 tid;
+	struct tlbe *gtlbe;
+	int esel, tlbsel;
+
+	tlbsel = get_tlb_tlbsel(vcpu);
+	esel = get_tlb_esel(vcpu);
+
+	if (esel >= vcpu->arch.guest_tlb[tlbsel].size) {
+		printk(KERN_ERR "%s: esel %d\n", __func__, esel);
+		kvmppc_dump_vcpu(vcpu);
+		return EMULATE_FAIL;
+	}
+
+	gtlbe = get_tlbe(vcpu->arch.guest_tlb[tlbsel], esel);
+
+	if (get_tlb_v(gtlbe) && tlbsel == 1) {
+		eaddr = get_tlb_eaddr(gtlbe);
+		tid = get_tlb_tid(gtlbe);
+		kvmppc_mmu_invalidate(vcpu, eaddr, get_tlb_end(gtlbe), tid);
+	}
+
+	gtlbe->mas1 = vcpu->arch.mas1;
+	gtlbe->mas2 = vcpu->arch.mas2;
+	gtlbe->mas3 = vcpu->arch.mas3;
+	gtlbe->mas7 = vcpu->arch.mas7;
+
+	/* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */
+	if (tlbe_is_host_safe(vcpu, gtlbe)) {
+		switch (tlbsel) {
+		case 0: {
+#ifdef KVMPPC_E500_TLB0_ENABLE
I would expect that putting the ifdef around the whole case is preferred (not sure).
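i.e. (sketch - note that this changes behaviour when KVMPPC_E500_TLB0_ENABLE is not defined: case 0 then no longer falls through into the TLB1 handling, so if that fall-through is intended, the current placement may be right after all):

		switch (tlbsel) {
#ifdef KVMPPC_E500_TLB0_ENABLE
		case 0:
			/* TLB0 */
			gtlbe->mas1 &= ~MAS1_TSIZE(~0);
			gtlbe->mas1 |= MAS1_TSIZE(BOOKE_PAGESZ_4K);

			kvmppc_tlbe_map(vcpu, 0, esel);
			host_tlb_write_entry(0,
				get_tlbe(vcpu->arch.shadow_tlb[0], esel));
			break;
#endif
		case 1:
			/* ... unchanged ... */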
+			/* TLB0 */
+			gtlbe->mas1 &= ~MAS1_TSIZE(~0);
+			gtlbe->mas1 |= MAS1_TSIZE(BOOKE_PAGESZ_4K);
+
+			kvmppc_tlbe_map(vcpu, 0, esel);
+
+			host_tlb_write_entry(0,
+				get_tlbe(vcpu->arch.shadow_tlb[0], esel));
+			break;
+#endif
+		}
+		case 1:
+			/* TLB1 */
+			eaddr = get_tlb_eaddr(gtlbe);
+			raddr = get_tlb_raddr(gtlbe);
+
+			/* Create a 4KB mapping on the host.
+			 * If the guest wanted a large page,
+			 * only the first 4KB is mapped here and the rest
+			 * are mapped on the fly. */
+			kvmppc_mmu_map(vcpu, eaddr, raddr >> PAGE_SHIFT, gtlbe);
+			break;
+		}
Is everything other than 0/1 a no-op here?
If not, a default case with a BUG() or something like that would be nice.
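Just a sketch of what I mean - since tlbsel comes from the guest-writable MAS0 bits, failing the emulation may even be safer than a BUG():

		default:
			printk(KERN_ERR "%s: bad tlbsel %d\n",
					__func__, tlbsel);
			return EMULATE_FAIL;
		}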

+	}
+
+	return EMULATE_DONE;
+}
+
+int kvmppc_handle_tlb_miss(struct kvm_vcpu *vcpu , gva_t eaddr, int eas)
+{
+	struct tlbe *gtlbe = NULL;
+	gfn_t gfn;
+	int as = !!(vcpu->arch.msr & eas);
+	int esel, tlbsel;
+
+	/* Check the guest TLB. */
+	/* XXX tlb1 first to fast it */
+	for (tlbsel = 0; tlbsel < 2; tlbsel++) {
+		esel = kvmppc_tlb_index(vcpu, eaddr, tlbsel,
+				get_cur_pid(vcpu), as);
+		if (esel >= 0) {
+			gtlbe = get_tlbe(vcpu->arch.guest_tlb[tlbsel], esel);
+			break;
+		}
+	}
+
+	if (!gtlbe) {
+		/* The guest didn't have a mapping for it.
+		 * Update mas.
+		 */
+		unsigned int nv, pidsel, tsized;
+
+		tlbsel = (vcpu->arch.mas4 >> 28) & 0x3;
+		esel = tlb_get_next_victim(vcpu, tlbsel);
+		nv = tlb_cur_next_victim(vcpu, tlbsel);
+		pidsel = (vcpu->arch.mas4 >> 16) & 0xf;
+		tsized = (vcpu->arch.mas4 >> 8) & 0xf;
+
+		vcpu->arch.mas0 = MAS0_TLBSEL(tlbsel)
+			| MAS0_ESEL(esel) | MAS0_NV(nv);
+		vcpu->arch.mas1 = MAS1_VALID | (as ? MAS1_TS : 0)
+			| MAS1_TID(vcpu->arch.pid_array[pidsel])
+			| MAS1_TSIZE(tsized);
+		vcpu->arch.mas2 = (eaddr & MAS2_EPN)
+			| (vcpu->arch.mas4 & MAS2_ATTRIB_MASK);
+		vcpu->arch.mas3 &= MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3;
+		vcpu->arch.mas6 = (vcpu->arch.mas6 & MAS6_SPID1)
+			| (vcpu->arch.pid << 16)
+			| (as ? MAS6_SAS : 0);
+		vcpu->arch.mas7 = 0;

I think I need the e500 spec again, but is it right that you do not deliver an interrupt to the guest when a dtlb miss occurs here? This is the case where the dtlb miss is legitimate, because the guest did not have a mapping for it. I would expect either the equivalent of the hardware tlb handling (fix it up) or irq delivery to the guest.


+		return -1;
+	}
+
+	vcpu->arch.paddr_accessed = tlb_xlate(gtlbe, eaddr);
+	gfn = vcpu->arch.paddr_accessed >> PAGE_SHIFT;
+
+	if (kvm_is_visible_gfn(vcpu->kvm, gfn)) {
+		/*
+		 * The guest TLB had a mapping, but the shadow TLB
+		 * didn't, and it is RAM. This could be because:
+		 * a) tlb1: the entry is mapping the host kernel
+		 * b) tlb1: the guest used a large mapping
+		 *    which we're faking
+		 * c) tlb0: the entry has been overwritten by host
+		 * Either way, we need to satisfy the fault without
+		 * invoking the guest.
+		 */
+		switch (tlbsel) {
+		case 0: {
+#ifdef KVMPPC_E500_TLB0_ENABLE
Same as above - put the ifdef around the whole case if it covers the entire case body, provided another case remains so the compiler doesn't complain about a switch without any case statement.
+			struct tlbe *stlbe
+				= get_tlbe(vcpu->arch.shadow_tlb[tlbsel], esel);
+			host_tlb_write_entry(tlbsel, stlbe);
+			break;
+#endif
+		}
+		case 1:
+			kvmppc_mmu_map(vcpu, eaddr, gfn, gtlbe);
+			break;
+		}
+	} else {
+		/* Guest has mapped and accessed a page which is not
+		 * actually RAM. */
+		return 1;
+	}
+
+	return 0;
+}

You return 0, -1 and 1 in this function. I assume the 1 should be -1 too.

+void kvmppc_tlb_setup(struct kvm_vcpu *vcpu)
+{
+	struct tlbe *tlbe;
+
+	/* Insert large initial mapping for guest. */
+	tlbe = get_tlbe(vcpu->arch.guest_tlb[1], 0);
+	tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOKE_PAGESZ_256M);
+	tlbe->mas2 = 0;
+	tlbe->mas3 = E500_TLB_SUPER_PERM_MASK;
+	tlbe->mas7 = 0;
+
+	/* 4K map for serial output */
+	tlbe = get_tlbe(vcpu->arch.guest_tlb[1], 1);
+	tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOKE_PAGESZ_4K);
+	tlbe->mas2 = (0xe0004500 & 0xFFFFF000) | MAS2_I | MAS2_G;
+	tlbe->mas3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK;
+	tlbe->mas7 = 0;
+}

Is that serial output mapping something a bootloader inserts on bare-metal e500 cores?
If it is not, then either:
-> who inserts this mapping on bare-metal e500?
or
-> why do you need to insert it for a guest?

+void kvmppc_tlb_init(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.guest_tlb[0].size = KVM_FSL_BOOK_TLB0_SIZE;
+	vcpu->arch.guest_tlb[0].base =
+	    kzalloc(sizeof(struct tlbe) * KVM_FSL_BOOK_TLB0_SIZE, GFP_KERNEL);
+	vcpu->arch.guest_tlb[1].size = KVM_FSL_BOOK_TLB1_SIZE;
+	vcpu->arch.guest_tlb[1].base =
+	    kzalloc(sizeof(struct tlbe) * KVM_FSL_BOOK_TLB1_SIZE, GFP_KERNEL);
+	vcpu->arch.shadow_tlb[0].size = KVM_FSL_BOOK_TLB0_SIZE;
+	vcpu->arch.shadow_tlb[0].base =
+	    kzalloc(sizeof(struct tlbe) * KVM_FSL_BOOK_TLB0_SIZE, GFP_KERNEL);
+	vcpu->arch.shadow_tlb[1].size = KVM_FSL_BOOK_TLB1_SIZE;
+	vcpu->arch.shadow_tlb[1].base =
+	    kzalloc(sizeof(struct tlbe) * KVM_FSL_BOOK_TLB1_SIZE, GFP_KERNEL);
+	vcpu->arch.shadow_pages[0] = (struct page **)
+	    kzalloc(sizeof(struct page *) * KVM_FSL_BOOK_TLB0_SIZE, GFP_KERNEL);
+	vcpu->arch.shadow_pages[1] = (struct page **)
+	    kzalloc(sizeof(struct page *) * KVM_FSL_BOOK_TLB1_SIZE, GFP_KERNEL);
+
+	if (num_tlbcam_entries - tlbcam_index > KVM_FSL_BOOK_TLB1_SIZE)
+		kvmppc_tlb1_bottom = 0;
+	else
+		kvmppc_tlb1_bottom = tlbcam_index + KVM_FSL_BOOK_TLB1_SIZE
+			- num_tlbcam_entries;
+
+	kvmppc_tlb1_pos = kvmppc_tlb1_bottom;
+
+	return;
+}
diff --git a/arch/powerpc/kvm/e500_tlb.h b/arch/powerpc/kvm/e500_tlb.h
new file mode 100644
index 0000000..13fcab6
--- /dev/null
+++ b/arch/powerpc/kvm/e500_tlb.h
@@ -0,0 +1,172 @@
+/*
+ * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved.
+ *
+ * Author: Yu Liu, yu.liu@xxxxxxxxxxxxx, Aug, 2008
+ *
+ * Description:
+ * This file is based on arch/powerpc/kvm/44x_tlb.h,
+ * by Hollis Blanchard <hollisb@xxxxxxxxxx>.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __KVM_E500_TLB_H__
+#define __KVM_E500_TLB_H__
+
+#include <linux/kvm_host.h>
+#include <asm/mmu-fsl-booke.h>
+#include <asm/tlb.h>
+
+#define KVM_FSL_BOOK_TLB0_SIZE  128
+#define KVM_FSL_BOOK_TLB1_SIZE  8
+
+#define KVMPPC_E500_TLB0_ENABLE
+
+struct tlbe {
+	u32 mas1;
+	u32 mas2;
+	u32 mas3;
+	u32 mas7;
+};
+
+#define E500_TLB_USER_PERM_MASK (MAS3_UX|MAS3_UR|MAS3_UW)
+#define E500_TLB_SUPER_PERM_MASK (MAS3_SX|MAS3_SR|MAS3_SW)
+#define MAS2_ATTRIB_MASK \
+	  (MAS2_X0 | MAS2_X1 | MAS2_W | MAS2_I | MAS2_M | MAS2_G | MAS2_E)
+#define MAS3_ATTRIB_MASK \
+	  (MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3 \
+	   | E500_TLB_USER_PERM_MASK | E500_TLB_SUPER_PERM_MASK)
+
+#define mtdcr(rn, val)

You define mtdcr to what? As written it expands to nothing.
+#define mfdcr(rn)	0
+
+extern int kvmppc_emul_tlbwe(struct kvm_vcpu *, u32);
+extern int kvmppc_emul_tlbre(struct kvm_vcpu *, u32);
+extern int kvmppc_emul_tlbivax(struct kvm_vcpu *, u32);
+extern int kvmppc_emul_tlbsx(struct kvm_vcpu *, u32);
+extern int kvmppc_handle_tlb_miss(struct kvm_vcpu *, gva_t, int);
+extern struct tlbe *kvmppc_tlb_search(struct kvm_vcpu *,
+		gva_t, unsigned int, int);
+extern void kvmppc_tlb_setup(struct kvm_vcpu *);
+extern void kvmppc_tlb_init(struct kvm_vcpu *);
+extern void kvmppc_dump_tlbs(struct kvm_vcpu *);
+
+/* TLB helper functions */
+static inline unsigned int get_tlb_size(const struct tlbe *tlbe)
+{
+	return (tlbe->mas1 >> 8) & 0xf;
+}
+
+static inline gva_t get_tlb_eaddr(const struct tlbe *tlbe)
+{
+	return tlbe->mas2 & 0xfffff000;
+}
+
+static inline u64 get_tlb_bytes(const struct tlbe *tlbe)
+{
+	unsigned int pgsize = get_tlb_size(tlbe);
+	return 1ULL << 10 << (pgsize << 1);
+}
+
+static inline gva_t get_tlb_end(const struct tlbe *tlbe)
+{
+	u64 bytes = get_tlb_bytes(tlbe);
+	return get_tlb_eaddr(tlbe) + bytes - 1;
+}
+
+static inline u64 get_tlb_raddr(const struct tlbe *tlbe)
+{
+	u64 rpn = tlbe->mas7;
+	return (rpn << 32) | (tlbe->mas3 & 0xfffff000);
+}
+
+static inline unsigned int get_tlb_tid(const struct tlbe *tlbe)
+{
+	return (tlbe->mas1 >> 16) & 0xff;
+}
+
+static inline unsigned int get_tlb_ts(const struct tlbe *tlbe)
+{
+	return (tlbe->mas1 >> 12) & 0x1;
+}
+
+static inline unsigned int get_tlb_v(const struct tlbe *tlbe)
+{
+	return (tlbe->mas1 >> 31) & 0x1;
+}
+
+static inline unsigned int get_tlb_iprot(const struct tlbe *tlbe)
+{
+	return (tlbe->mas1 >> 30) & 0x1;
+}
+
+static inline unsigned int get_cur_pid(const struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.pid & 0xff;
+}
+
+static inline unsigned int get_cur_spid(const struct kvm_vcpu *vcpu)
+{
+	return (vcpu->arch.mas6 >> 16) & 0xff;
+}
+
+static inline unsigned int get_cur_sas(const struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.mas6 & 0x1;
+}
+
+static inline unsigned int get_tlb_tlbsel(const struct kvm_vcpu *vcpu)
+{
+	return (vcpu->arch.mas0 >> 28) & 0x3;
+}
+
+static inline unsigned int get_tlb_nv(const struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.mas0 & 0xfff;
+}
+
+static inline unsigned int get_tlb_esel(const struct kvm_vcpu *vcpu)
+{
+	return (vcpu->arch.mas0 >> 16) & 0xfff;
+}
+
+static inline gpa_t tlb_xlate(struct tlbe *tlbe, gva_t eaddr)
+{
+	u64 pgmask = get_tlb_bytes(tlbe) - 1;
+	return get_tlb_raddr(tlbe) | (eaddr & pgmask);
+}
+
+static inline int tlbe_is_host_safe(const struct kvm_vcpu *vcpu,
+			const struct tlbe *tlbe)
+{
+	gpa_t gpa;
+
+	if (!get_tlb_v(tlbe))
+		return 0;
+
+	/* Does it match current guest AS? */
+	/* XXX what about IS != DS? */
+	if (get_tlb_ts(tlbe) != !!(vcpu->arch.msr & MSR_IS))
+		return 0;
+
+	gpa = get_tlb_raddr(tlbe);
+	if (!gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT))
+		/* Mapping is not for RAM. */
+		return 0;
+
+	return 1;
+}
+
+static inline void kvmppc_tlb_load(struct kvm_vcpu *vcpu)
+{
+}
+
+
+static inline void kvmppc_tlb_put(struct kvm_vcpu *vcpu)
+{
+	_tlbia();
+}
+
+#endif /* __KVM_POWERPC_TLB_H__ */

You defined it as __KVM_E500_TLB_H__ above; this is probably the old 440 comment left over from Hollis' file.
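i.e. the last line should read:

#endif /* __KVM_E500_TLB_H__ */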

--

Grüsse / regards, Christian Ehrhardt
IBM Linux Technology Center, Open Virtualization

