Re: [RFC PATCH 24/32] KVM: PPC: Book3S HV: Handle page fault for a nested guest

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Fri, Sep 21, 2018 at 08:01:55PM +1000, Paul Mackerras wrote:
> From: Suraj Jitindar Singh <sjitindarsingh@xxxxxxxxx>
> 
> Consider a normal (L1) guest running under the main hypervisor (L0),
> and then a nested guest (L2) running under the L1 guest which is acting
> as a nested hypervisor. L0 has page tables to map the address space for
> L1 providing the translation from L1 real address -> L0 real address;
> 
> 	L1
> 	|
> 	| (L1 -> L0)
> 	|
> 	----> L0
> 
> There are also page tables in L1 used to map the address space for L2
> providing the translation from L2 real address -> L1 real address. Since
> the hardware can only walk a single level of page table, we need to
> maintain in L0 a "shadow_pgtable" for L2 which provides the translation
> from L2 real address -> L0 real address. Which looks like;
> 
> 	L2				L2
> 	|				|
> 	| (L2 -> L1)			|
> 	|				|
> 	----> L1			| (L2 -> L0)
> 	      |				|
> 	      | (L1 -> L0)		|
> 	      |				|
> 	      ----> L0			--------> L0
> 
> When a page fault occurs while running a nested (L2) guest we need to
> insert a pte into this "shadow_pgtable" for the L2 -> L0 mapping. To
> do this we need to:
> 
> 1. Walk the pgtable in L1 memory to find the L2 -> L1 mapping, and
>    provide a page fault to L1 if this mapping doesn't exist.
> 2. Use our L1 -> L0 pgtable to convert this L1 address to an L0 address,
>    or try to insert a pte for that mapping if it doesn't exist.
> 3. Now we have a L2 -> L0 mapping, insert this into our shadow_pgtable
> 
> Once this mapping exists we can take rc faults when hardware is unable
> to automatically set the reference and change bits in the pte. On these
> we need to:
> 
> 1. Check the rc bits on the L2 -> L1 pte match, and otherwise reflect
>    the fault down to L1.
> 2. Set the rc bits in the L1 -> L0 pte which corresponds to the same
>    host page.
> 3. Set the rc bits in the L2 -> L0 pte.
> 
> As we reuse a large number of functions in book3s_64_mmu_radix.c for
> this we also needed to refactor a number of these functions to take
> an lpid parameter so that the correct lpid is used for tlb invalidations.
> The functionality however has remained the same.
> 
> Signed-off-by: Suraj Jitindar Singh <sjitindarsingh@xxxxxxxxx>
> Signed-off-by: Paul Mackerras <paulus@xxxxxxxxxx>

This would be easier to read/review with the addition of the lpid
parameters and the rework of kvmppc_mmu_radix_translate_table() split
out.  I'm not sure it's worth the effort of doing so at this point,
though.

> ---
>  .../powerpc/include/asm/book3s/64/tlbflush-radix.h |   1 +
>  arch/powerpc/include/asm/kvm_book3s.h              |  19 ++
>  arch/powerpc/include/asm/kvm_book3s_64.h           |   4 +
>  arch/powerpc/include/asm/kvm_host.h                |   2 +
>  arch/powerpc/kvm/book3s_64_mmu_radix.c             | 196 ++++++------
>  arch/powerpc/kvm/book3s_hv_nested.c                | 328 ++++++++++++++++++++-
>  arch/powerpc/mm/tlb-radix.c                        |   9 +
>  7 files changed, 472 insertions(+), 87 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
> index 1154a6d..671316f 100644
> --- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
> +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
> @@ -53,6 +53,7 @@ extern void radix__flush_tlb_lpid_page(unsigned int lpid,
>  					unsigned long addr,
>  					unsigned long page_size);
>  extern void radix__flush_pwc_lpid(unsigned int lpid);
> +extern void radix__flush_tlb_lpid(unsigned int lpid);
>  extern void radix__local_flush_tlb_lpid(unsigned int lpid);
>  extern void radix__local_flush_tlb_lpid_guest(unsigned int lpid);
>  
> diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
> index a22a501..664e1fb 100644
> --- a/arch/powerpc/include/asm/kvm_book3s.h
> +++ b/arch/powerpc/include/asm/kvm_book3s.h
> @@ -188,17 +188,34 @@ extern int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hc);
>  extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run,
>  			struct kvm_vcpu *vcpu,
>  			unsigned long ea, unsigned long dsisr);
> +extern int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
> +				      struct kvmppc_pte *gpte, u64 root,
> +				      u64 *pte_ret_p);
>  extern int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
>  			struct kvmppc_pte *gpte, u64 table,
>  			int table_index, u64 *pte_ret_p);
>  extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
>  			struct kvmppc_pte *gpte, bool data, bool iswrite);
> +extern bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable,
> +				    bool writing, unsigned long gpa,
> +				    unsigned int lpid);
> +extern int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
> +				unsigned long gpa,
> +				struct kvm_memory_slot *memslot,
> +				bool writing, bool kvm_ro,
> +				pte_t *inserted_pte, unsigned int *levelp);
>  extern int kvmppc_init_vm_radix(struct kvm *kvm);
>  extern void kvmppc_free_radix(struct kvm *kvm);
> +extern void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd,
> +				      unsigned int lpid);
>  extern int kvmppc_radix_init(void);
>  extern void kvmppc_radix_exit(void);
>  extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
>  			unsigned long gfn);
> +extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
> + 			     unsigned long gpa, unsigned int shift,
> +			     struct kvm_memory_slot *memslot,
> +			     unsigned int lpid);
>  extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
>  			unsigned long gfn);
>  extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
> @@ -289,6 +306,8 @@ void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu,
>  long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu);
>  int kvmhv_emulate_priv(struct kvm_run *run, struct kvm_vcpu *vcpu,
>  			unsigned int instr);
> +int kvmhv_handle_nested_trap(struct kvm_run *run, struct kvm_vcpu *vcpu,
> +			     struct task_struct *tsk);
>  
>  void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac);
>  
> diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
> index 0c90d56..16c3a97 100644
> --- a/arch/powerpc/include/asm/kvm_book3s_64.h
> +++ b/arch/powerpc/include/asm/kvm_book3s_64.h
> @@ -533,6 +533,10 @@ static inline void copy_to_checkpoint(struct kvm_vcpu *vcpu)
>  }
>  #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
>  
> +extern int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
> +			     unsigned long gpa, unsigned int level,
> +			     unsigned long mmu_seq, unsigned int lpid);
> +
>  #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
>  
>  #endif /* __ASM_KVM_BOOK3S_64_H__ */
> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
> index ceb9f20..64c4807 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -367,7 +367,9 @@ struct kvmppc_pte {
>  	bool may_write		: 1;
>  	bool may_execute	: 1;
>  	unsigned long wimg;
> +	unsigned long rc;
>  	u8 page_size;		/* MMU_PAGE_xxx */
> +	u16 page_shift;
>  };
>  
>  struct kvmppc_mmu {
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> index d9357e0..778ae87 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> @@ -29,43 +29,16 @@
>   */
>  static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };
>  
> -/*
> - * Used to walk a partition or process table radix tree in guest memory
> - * Note: We exploit the fact that a partition table and a process
> - * table have the same layout, a partition-scoped page table and a
> - * process-scoped page table have the same layout, and the 2nd
> - * doubleword of a partition table entry has the same layout as
> - * the PTCR register.
> - */
> -int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
> -				     struct kvmppc_pte *gpte, u64 table,
> -				     int table_index, u64 *pte_ret_p)
> +int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
> +			       struct kvmppc_pte *gpte, u64 root,
> +			       u64 *pte_ret_p)
>  {
>  	struct kvm *kvm = vcpu->kvm;
>  	int ret, level, ps;
> -	unsigned long ptbl, root;
> -	unsigned long rts, bits, offset;
> -	unsigned long size, index;
> -	struct prtb_entry entry;
> +	unsigned long rts, bits, offset, index;
>  	u64 pte, base, gpa;
>  	__be64 rpte;
>  
> -	if ((table & PRTS_MASK) > 24)
> -		return -EINVAL;
> -	size = 1ul << ((table & PRTS_MASK) + 12);
> -
> -	/* Is the table big enough to contain this entry? */
> -	if ((table_index * sizeof(entry)) >= size)
> -		return -EINVAL;
> -
> -	/* Read the table to find the root of the radix tree */
> -	ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
> -	ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
> -	if (ret)
> -		return ret;
> -
> -	/* Root is stored in the first double word */
> -	root = be64_to_cpu(entry.prtb0);
>  	rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
>  		((root & RTS2_MASK) >> RTS2_SHIFT);
>  	bits = root & RPDS_MASK;
> @@ -79,6 +52,7 @@ int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
>  
>  	/* Walk each level of the radix tree */
>  	for (level = 3; level >= 0; --level) {
> +		u64 addr;
>  		/* Check a valid size */
>  		if (level && bits != p9_supported_radix_bits[level])
>  			return -EINVAL;
> @@ -90,10 +64,13 @@ int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
>  		if (base & ((1UL << (bits + 3)) - 1))
>  			return -EINVAL;
>  		/* Read the entry from guest memory */
> -		ret = kvm_read_guest(kvm, base + (index * sizeof(rpte)),
> -				     &rpte, sizeof(rpte));
> -		if (ret)
> +		addr = base + (index * sizeof(rpte));
> +		ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte));
> +		if (ret) {
> +			if (pte_ret_p)
> +				*pte_ret_p = addr;
>  			return ret;
> +		}
>  		pte = __be64_to_cpu(rpte);
>  		if (!(pte & _PAGE_PRESENT))
>  			return -ENOENT;
> @@ -119,6 +96,7 @@ int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
>  		if (offset == mmu_psize_defs[ps].shift)
>  			break;
>  	gpte->page_size = ps;
> +	gpte->page_shift = offset;
>  
>  	gpte->eaddr = eaddr;
>  	gpte->raddr = gpa;
> @@ -128,12 +106,51 @@ int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
>  	gpte->may_write = !!(pte & _PAGE_WRITE);
>  	gpte->may_execute = !!(pte & _PAGE_EXEC);
>  
> +	gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY);
> +
>  	if (pte_ret_p)
>  		*pte_ret_p = pte;
>  
>  	return 0;
>  }
>  
> +/*
> + * Used to walk a partition or process table radix tree in guest memory
> + * Note: We exploit the fact that a partition table and a process
> + * table have the same layout, a partition-scoped page table and a
> + * process-scoped page table have the same layout, and the 2nd
> + * doubleword of a partition table entry has the same layout as
> + * the PTCR register.
> + */
> +int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
> +				     struct kvmppc_pte *gpte, u64 table,
> +				     int table_index, u64 *pte_ret_p)
> +{
> +	struct kvm *kvm = vcpu->kvm;
> +	int ret;
> +	unsigned long size, ptbl, root;
> +	struct prtb_entry entry;
> +
> +	if ((table & PRTS_MASK) > 24)
> +		return -EINVAL;
> +	size = 1ul << ((table & PRTS_MASK) + 12);
> +
> +	/* Is the table big enough to contain this entry? */
> +	if ((table_index * sizeof(entry)) >= size)
> +		return -EINVAL;
> +
> +	/* Read the table to find the root of the radix tree */
> +	ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
> +	ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
> +	if (ret)
> +		return ret;
> +
> +	/* Root is stored in the first double word */
> +	root = be64_to_cpu(entry.prtb0);
> +
> +	return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p);
> +}
> +
>  int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
>  			   struct kvmppc_pte *gpte, bool data, bool iswrite)
>  {
> @@ -181,7 +198,7 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
>  }
>  
>  static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
> -				    unsigned int pshift)
> +				    unsigned int pshift, unsigned int lpid)
>  {
>  	unsigned long psize = PAGE_SIZE;
>  
> @@ -189,12 +206,12 @@ static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
>  		psize = 1UL << pshift;
>  
>  	addr &= ~(psize - 1);
> -	radix__flush_tlb_lpid_page(kvm->arch.lpid, addr, psize);
> +	radix__flush_tlb_lpid_page(lpid, addr, psize);
>  }
>  
> -static void kvmppc_radix_flush_pwc(struct kvm *kvm)
> +static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned int lpid)
>  {
> -	radix__flush_pwc_lpid(kvm->arch.lpid);
> +	radix__flush_pwc_lpid(lpid);
>  }
>  
>  static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
> @@ -239,16 +256,17 @@ static void kvmppc_pmd_free(pmd_t *pmdp)
>  	kmem_cache_free(kvm_pmd_cache, pmdp);
>  }
>  
> -static void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
> -			     unsigned long gpa, unsigned int shift,
> -			     struct kvm_memory_slot *memslot)
> +void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
> +		      unsigned long gpa, unsigned int shift,
> +		      struct kvm_memory_slot *memslot,
> +		      unsigned int lpid)
>  
>  {
>  	unsigned long old;
>  
>  	old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
> -	kvmppc_radix_tlbie_page(kvm, gpa, shift);
> -	if (old & _PAGE_DIRTY) {
> +	kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);
> +	if ((old & _PAGE_DIRTY) && (lpid == kvm->arch.lpid)) {
>  		unsigned long gfn = gpa >> PAGE_SHIFT;
>  		unsigned long page_size = PAGE_SIZE;
>  
> @@ -272,7 +290,8 @@ static void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
>   * and emit a warning if encountered, but there may already be data
>   * corruption due to the unexpected mappings.
>   */
> -static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full)
> +static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
> +				  unsigned int lpid)
>  {
>  	if (full) {
>  		memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE);
> @@ -286,14 +305,15 @@ static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full)
>  			WARN_ON_ONCE(1);
>  			kvmppc_unmap_pte(kvm, p,
>  					 pte_pfn(*p) << PAGE_SHIFT,
> -					 PAGE_SHIFT, NULL);
> +					 PAGE_SHIFT, NULL, lpid);
>  		}
>  	}
>  
>  	kvmppc_pte_free(pte);
>  }
>  
> -static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full)
> +static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full,
> +				  unsigned int lpid)
>  {
>  	unsigned long im;
>  	pmd_t *p = pmd;
> @@ -308,20 +328,21 @@ static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full)
>  				WARN_ON_ONCE(1);
>  				kvmppc_unmap_pte(kvm, (pte_t *)p,
>  					 pte_pfn(*(pte_t *)p) << PAGE_SHIFT,
> -					 PMD_SHIFT, NULL);
> +					 PMD_SHIFT, NULL, lpid);
>  			}
>  		} else {
>  			pte_t *pte;
>  
>  			pte = pte_offset_map(p, 0);
> -			kvmppc_unmap_free_pte(kvm, pte, full);
> +			kvmppc_unmap_free_pte(kvm, pte, full, lpid);
>  			pmd_clear(p);
>  		}
>  	}
>  	kvmppc_pmd_free(pmd);
>  }
>  
> -static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud)
> +static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
> +				  unsigned int lpid)
>  {
>  	unsigned long iu;
>  	pud_t *p = pud;
> @@ -335,36 +356,42 @@ static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud)
>  			pmd_t *pmd;
>  
>  			pmd = pmd_offset(p, 0);
> -			kvmppc_unmap_free_pmd(kvm, pmd, true);
> +			kvmppc_unmap_free_pmd(kvm, pmd, true, lpid);
>  			pud_clear(p);
>  		}
>  	}
>  	pud_free(kvm->mm, pud);
>  }
>  
> -void kvmppc_free_radix(struct kvm *kvm)
> +void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid)
>  {
>  	unsigned long ig;
> -	pgd_t *pgd;
>  
> -	if (!kvm->arch.pgtable)
> +	if (!pgd)
>  		return;
> -	pgd = kvm->arch.pgtable;
>  	for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
>  		pud_t *pud;
>  
>  		if (!pgd_present(*pgd))
>  			continue;
>  		pud = pud_offset(pgd, 0);
> -		kvmppc_unmap_free_pud(kvm, pud);
> +		kvmppc_unmap_free_pud(kvm, pud, lpid);
>  		pgd_clear(pgd);
>  	}
> -	pgd_free(kvm->mm, kvm->arch.pgtable);
> -	kvm->arch.pgtable = NULL;
> +}
> +
> +void kvmppc_free_radix(struct kvm *kvm)
> +{
> +	if (kvm->arch.pgtable) {
> +		kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable,
> +					  kvm->arch.lpid);
> +		pgd_free(kvm->mm, kvm->arch.pgtable);
> +		kvm->arch.pgtable = NULL;
> +	}
>  }
>  
>  static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
> -					      unsigned long gpa)
> +					unsigned long gpa, unsigned int lpid)
>  {
>  	pte_t *pte = pte_offset_kernel(pmd, 0);
>  
> @@ -374,13 +401,13 @@ static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
>  	 * flushing the PWC again.
>  	 */
>  	pmd_clear(pmd);
> -	kvmppc_radix_flush_pwc(kvm);
> +	kvmppc_radix_flush_pwc(kvm, lpid);
>  
> -	kvmppc_unmap_free_pte(kvm, pte, false);
> +	kvmppc_unmap_free_pte(kvm, pte, false, lpid);
>  }
>  
>  static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
> -					unsigned long gpa)
> +					unsigned long gpa, unsigned int lpid)
>  {
>  	pmd_t *pmd = pmd_offset(pud, 0);
>  
> @@ -390,9 +417,9 @@ static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
>  	 * so can be freed without flushing the PWC again.
>  	 */
>  	pud_clear(pud);
> -	kvmppc_radix_flush_pwc(kvm);
> +	kvmppc_radix_flush_pwc(kvm, lpid);
>  
> -	kvmppc_unmap_free_pmd(kvm, pmd, false);
> +	kvmppc_unmap_free_pmd(kvm, pmd, false, lpid);
>  }
>  
>  /*
> @@ -404,9 +431,9 @@ static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
>   */
>  #define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED))
>  
> -static int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
> -			     unsigned long gpa, unsigned int level,
> -			     unsigned long mmu_seq)
> +int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
> +		      unsigned long gpa, unsigned int level,
> +		      unsigned long mmu_seq, unsigned int lpid)
>  {
>  	pgd_t *pgd;
>  	pud_t *pud, *new_pud = NULL;
> @@ -459,7 +486,7 @@ static int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
>  			WARN_ON_ONCE((pud_val(*pud) ^ pte_val(pte)) &
>  							PTE_BITS_MUST_MATCH);
>  			kvmppc_radix_update_pte(kvm, (pte_t *)pud,
> -					      0, pte_val(pte), hgpa, PUD_SHIFT);
> +					0, pte_val(pte), hgpa, PUD_SHIFT);
>  			ret = 0;
>  			goto out_unlock;
>  		}
> @@ -472,7 +499,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
>  			goto out_unlock;
>  		}
>  		/* Valid 1GB page here already, remove it */
> -		kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL);
> +		kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL,
> +				 lpid);
>  	}
>  	if (level == 2) {
>  		if (!pud_none(*pud)) {
> @@ -481,7 +509,7 @@ static int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
>  			 * install a large page, so remove and free the page
>  			 * table page.
>  			 */
> -			kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa);
> +			kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
>  		}
>  		kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
>  		ret = 0;
> @@ -507,7 +535,7 @@ static int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
>  			WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) &
>  							PTE_BITS_MUST_MATCH);
>  			kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
> -					      0, pte_val(pte), lgpa, PMD_SHIFT);
> +					0, pte_val(pte), lgpa, PMD_SHIFT);
>  			ret = 0;
>  			goto out_unlock;
>  		}
> @@ -521,7 +549,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
>  			goto out_unlock;
>  		}
>  		/* Valid 2MB page here already, remove it */
> -		kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL);
> +		kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL,
> +				 lpid);
>  	}
>  	if (level == 1) {
>  		if (!pmd_none(*pmd)) {
> @@ -530,7 +559,7 @@ static int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
>  			 * install a large page, so remove and free the page
>  			 * table page.
>  			 */
> -			kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa);
> +			kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
>  		}
>  		kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
>  		ret = 0;
> @@ -570,8 +599,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
>  	return ret;
>  }
>  
> -static bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable,
> -				    bool writing, unsigned long gpa)
> +bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, bool writing,
> +			     unsigned long gpa, unsigned int lpid)
>  {
>  	unsigned long pgflags;
>  	unsigned int shift;
> @@ -598,11 +627,11 @@ static bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable,
>  	return false;
>  }
>  
> -static int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
> -				unsigned long gpa,
> -				struct kvm_memory_slot *memslot,
> -				bool writing, bool kvm_ro,
> -				pte_t *inserted_pte, unsigned int *levelp)
> +int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
> +				   unsigned long gpa,
> +				   struct kvm_memory_slot *memslot,
> +				   bool writing, bool kvm_ro,
> +				   pte_t *inserted_pte, unsigned int *levelp)
>  {
>  	struct kvm *kvm = vcpu->kvm;
>  	struct page *page = NULL;
> @@ -684,7 +713,7 @@ static int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
>  
>  	/* Allocate space in the tree and write the PTE */
>  	ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
> -				mmu_seq);
> +				mmu_seq, kvm->arch.lpid);
>  	if (inserted_pte)
>  		*inserted_pte = pte;
>  	if (levelp)
> @@ -759,7 +788,7 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
>  	if (dsisr & DSISR_SET_RC) {
>  		spin_lock(&kvm->mmu_lock);
>  		if (kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable,
> -					    writing, gpa))
> +					    writing, gpa, kvm->arch.lpid))
>  			dsisr &= ~DSISR_SET_RC;
>  		spin_unlock(&kvm->mmu_lock);
>  
> @@ -787,7 +816,8 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
>  
>  	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
>  	if (ptep && pte_present(*ptep))
> -		kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot);
> +		kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
> +				 kvm->arch.lpid);
>  	return 0;				
>  }
>  
> @@ -842,7 +872,7 @@ static int kvm_radix_test_clear_dirty(struct kvm *kvm,
>  			ret = 1 << (shift - PAGE_SHIFT);
>  		kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
>  					gpa, shift);
> -		kvmppc_radix_tlbie_page(kvm, gpa, shift);
> +		kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid);
>  	}
>  	return ret;
>  }
> diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
> index 93ecf3b..af8066b 100644
> --- a/arch/powerpc/kvm/book3s_hv_nested.c
> +++ b/arch/powerpc/kvm/book3s_hv_nested.c
> @@ -12,10 +12,13 @@
>  #include <linux/kvm_host.h>
>  
>  #include <asm/kvm_ppc.h>
> +#include <asm/kvm_book3s.h>
>  #include <asm/mmu.h>
>  #include <asm/pgtable.h>
>  #include <asm/pgalloc.h>
> +#include <asm/pte-walk.h>
>  #include <asm/disassemble.h>
> +#include <asm/reg.h>
>  
>  static struct patb_entry *pseries_partition_tb;
>  
> @@ -387,10 +390,20 @@ struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid)
>   */
>  static void kvmhv_release_nested(struct kvm_nested_guest *gp)
>  {
> +	struct kvm *kvm = gp->parent;
> +
>  	kvmhv_set_ptbl_entry(gp->shadow_lpid, 0, 0);
> -	kvmppc_free_lpid(gp->shadow_lpid);
> -	if (gp->shadow_pgtable)
> +	if (gp->shadow_pgtable) {
> +		/*
> +		 * No vcpu is using this struct and no call to
> +		 * kvmhv_remove_nest_rmap can find this struct,
> +		 * so we don't need to hold kvm->mmu_lock.
> +		 */
> +		kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable,
> +					  gp->shadow_lpid);
>  		pgd_free(gp->parent->mm, gp->shadow_pgtable);
> +	}
> +	kvmppc_free_lpid(gp->shadow_lpid);
>  	kfree(gp);
>  }
>  
> @@ -445,6 +458,12 @@ void kvmhv_release_all_nested(struct kvm *kvm)
>  /* caller must hold gp->tlb_lock */
>  void kvmhv_flush_nested(struct kvm_nested_guest *gp)
>  {
> +	struct kvm *kvm = gp->parent;
> +
> +	spin_lock(&kvm->mmu_lock);
> +	kvmppc_free_pgtable_radix(gp->parent, gp->shadow_pgtable,
> +				  gp->shadow_lpid);
> +	spin_unlock(&kvm->mmu_lock);
>  	kvmhv_update_ptbl_cache(gp);
>  	if (gp->l1_gr_to_hr == 0)
>  		kvmhv_remove_nested(gp);
> @@ -511,9 +530,28 @@ struct kvm_nested_guest *kvmhv_find_nested(struct kvm *kvm, int lpid)
>  	return kvm->arch.nested_guests[lpid];
>  }
>  
> -long kvmhv_nested_page_fault(struct kvm_vcpu *vcpu)
> +static bool kvmhv_invalidate_shadow_pte(struct kvm_vcpu *vcpu,
> +					struct kvm_nested_guest *gp,
> +					long gpa, int *shift_ret)
>  {
> -	return RESUME_HOST;
> +	struct kvm *kvm = vcpu->kvm;
> +	bool ret = false;
> +	pte_t *ptep;
> +	int shift;
> +
> +	spin_lock(&kvm->mmu_lock);
> +	ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift);
> +	if (!shift)
> +		shift = PAGE_SHIFT;
> +	if (ptep && pte_present(*ptep)) {
> +		kvmppc_unmap_pte(kvm, ptep, gpa, shift, NULL, gp->shadow_lpid);
> +		ret = true;
> +	}
> +	spin_unlock(&kvm->mmu_lock);
> +
> +	if (shift_ret)
> +		*shift_ret = shift;
> +	return ret;
>  }
>  
>  static int kvmhv_emulate_priv_mtspr(struct kvm_run *run, struct kvm_vcpu *vcpu,
> @@ -569,3 +607,285 @@ int kvmhv_emulate_priv(struct kvm_run *run, struct kvm_vcpu *vcpu,
>  
>  	return rc;
>  }
> +
> +/* Used to convert a nested guest real address to a L1 guest real address */
> +static int kvmhv_translate_addr_nested(struct kvm_vcpu *vcpu,
> +				       struct kvm_nested_guest *gp,
> +				       unsigned long n_gpa, unsigned long dsisr,
> +				       struct kvmppc_pte *gpte_p)
> +{
> +	u64 fault_addr, flags = dsisr & DSISR_ISSTORE;
> +	int ret;
> +
> +	ret = kvmppc_mmu_walk_radix_tree(vcpu, n_gpa, gpte_p, gp->l1_gr_to_hr,
> +					 &fault_addr);
> +
> +	if (ret) {
> +		/* We didn't find a pte */
> +		if (ret == -EINVAL) {
> +			/* Unsupported mmu config */
> +			flags |= DSISR_UNSUPP_MMU;
> +		} else if (ret == -ENOENT) {
> +			/* No translation found */
> +			flags |= DSISR_NOHPTE;
> +		} else if (ret == -EFAULT) {
> +			/* Couldn't access L1 real address */
> +			flags |= DSISR_PRTABLE_FAULT;
> +			vcpu->arch.fault_gpa = fault_addr;
> +		} else {
> +			/* Unknown error */
> +			return ret;
> +		}
> +		goto resume_host;
> +	} else {
> +		/* We found a pte -> check permissions */
> +		if (dsisr & DSISR_ISSTORE) {
> +			/* Can we write? */
> +			if (!gpte_p->may_write) {
> +				flags |= DSISR_PROTFAULT;
> +				goto resume_host;
> +			}
> +		} else if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) {
> +			/* Can we execute? */
> +			if (!gpte_p->may_execute) {
> +				flags |= SRR1_ISI_N_OR_G;
> +				goto resume_host;
> +			}
> +		} else {
> +			/* Can we read? */
> +			if (!gpte_p->may_read && !gpte_p->may_write) {
> +				flags |= DSISR_PROTFAULT;
> +				goto resume_host;
> +			}
> +		}
> +	}
> +
> +	return 0;
> +
> +resume_host:
> +	vcpu->arch.fault_dsisr = flags;
> +	if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) {
> +		vcpu->arch.shregs.msr &= ~0x783f0000ul;
> +		vcpu->arch.shregs.msr |= flags;
> +	}
> +	return RESUME_HOST;
> +}
> +
> +static long kvmhv_handle_nested_set_rc(struct kvm_vcpu *vcpu,
> +				       struct kvm_nested_guest *gp,
> +				       unsigned long n_gpa,
> +				       struct kvmppc_pte gpte,
> +				       unsigned long dsisr)
> +{
> +	struct kvm *kvm = vcpu->kvm;
> +	bool writing = !!(dsisr & DSISR_ISSTORE);
> +	u64 pgflags;
> +	bool ret;
> +
> +	/* Are the rc bits set in the L1 partition scoped pte? */
> +	pgflags = _PAGE_ACCESSED;
> +	if (writing)
> +		pgflags |= _PAGE_DIRTY;
> +	if (pgflags & ~gpte.rc)
> +		return RESUME_HOST;
> +
> +	spin_lock(&kvm->mmu_lock);
> +	/* Set the rc bit in the pte of our (L0) pgtable for the L1 guest */
> +	ret = kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable, writing,
> +				     gpte.raddr, kvm->arch.lpid);
> +	spin_unlock(&kvm->mmu_lock);
> +	if (!ret)
> +		return -EINVAL;
> +
> +	/* Set the rc bit in the pte of the shadow_pgtable for the nest guest */
> +	ret = kvmppc_hv_handle_set_rc(kvm, gp->shadow_pgtable, writing, n_gpa,
> +				      gp->shadow_lpid);
> +	if (!ret)
> +		return -EINVAL;
> +	return 0;
> +}
> +
> +static inline int kvmppc_radix_level_to_shift(int level)
> +{
> +	switch (level) {
> +	case 2:
> +		return PUD_SHIFT;
> +	case 1:
> +		return PMD_SHIFT;
> +	default:
> +		return PAGE_SHIFT;
> +	}
> +}
> +
> +static inline int kvmppc_radix_shift_to_level(int shift)
> +{
> +	if (shift == PUD_SHIFT)
> +		return 2;
> +	else if (shift == PMD_SHIFT)
> +		return 1;
> +	else if (shift == PAGE_SHIFT)
> +		return 0;
> +	else
> +		WARN_ON_ONCE(1);
> +	return 0;
> +}
> +
> +/* called with gp->tlb_lock held */
> +static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu,
> +					  struct kvm_nested_guest *gp)
> +{
> +	struct kvm *kvm = vcpu->kvm;
> +	struct kvm_memory_slot *memslot;
> +	struct kvmppc_pte gpte;
> +	pte_t pte, *pte_p;
> +	unsigned long mmu_seq;
> +	unsigned long dsisr = vcpu->arch.fault_dsisr;
> +	unsigned long ea = vcpu->arch.fault_dar;
> +	unsigned long n_gpa, gpa, gfn, perm = 0UL;
> +	unsigned int shift, l1_shift, level;
> +	bool writing = !!(dsisr & DSISR_ISSTORE);
> +	bool kvm_ro = false;
> +	long int ret;
> +
> +	if (!gp->l1_gr_to_hr) {
> +		kvmhv_update_ptbl_cache(gp);
> +		if (!gp->l1_gr_to_hr)
> +			return RESUME_HOST;
> +	}
> +
> +	/* Convert the nested guest real address into a L1 guest real address */
> +
> +	n_gpa = vcpu->arch.fault_gpa & ~0xF000000000000FFFULL;
> +	if (!(dsisr & DSISR_PRTABLE_FAULT))
> +		n_gpa |= ea & 0xFFF;
> +	ret = kvmhv_translate_addr_nested(vcpu, gp, n_gpa, dsisr, &gpte);
> +	if (ret == RESUME_HOST && !(dsisr & DSISR_NOHPTE))
> +		/* no pte or prot fault -> remove entry from shadow_pgtable */
> +		goto inval;
> +	if (ret)
> +		return ret;
> +
> +	/* Failed to set the reference/change bits */
> +	if (dsisr & DSISR_SET_RC) {
> +		ret = kvmhv_handle_nested_set_rc(vcpu, gp, n_gpa, gpte, dsisr);
> +		if (ret == RESUME_HOST)
> +			return ret;
> +		if (ret)
> +			goto inval;
> +		dsisr &= ~DSISR_SET_RC;
> +		if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
> +			       DSISR_PROTFAULT)))
> +			return RESUME_GUEST;
> +	}
> +
> +	/*
> +	 * We took an HISI or HDSI while we were running a nested guest which
> +	 * means we have no partition scoped translation for that. This means
> +	 * we need to insert a pte for the mapping into our shadow_pgtable.
> +	 */
> +
> +	l1_shift = gpte.page_shift;
> +	if (l1_shift < PAGE_SHIFT) {
> +		/* We don't support l1 using a page size smaller than our own */
> +		pr_err("KVM: L1 guest page shift (%d) less than our own (%d)\n",
> +			l1_shift, PAGE_SHIFT);

That's a reasonable limitation, but you should probably add it to the
list of known limitations.

> +		return -EINVAL;
> +	}
> +	gpa = gpte.raddr;
> +	gfn = gpa >> PAGE_SHIFT;
> +
> +	/* 1. Get the corresponding host memslot */
> +
> +	memslot = gfn_to_memslot(kvm, gfn);
> +	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
> +		if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS)) {
> +			/* unusual error -> reflect to the guest as a DSI */
> +			kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
> +			return RESUME_GUEST;
> +		}
> +		/* passthrough of emulated MMIO case... */
> +		pr_err("emulated MMIO passthrough?\n");
> +		return -EINVAL;
> +	}
> +	if (memslot->flags & KVM_MEM_READONLY) {
> +		if (writing) {
> +			/* Give the guest a DSI */
> +			kvmppc_core_queue_data_storage(vcpu, ea,
> +					DSISR_ISSTORE | DSISR_PROTFAULT);
> +			return RESUME_GUEST;
> +		}
> +		kvm_ro = true;
> +	}
> +
> +	/* 2. Find the host pte for this L1 guest real address */
> +
> +	/* Used to check for invalidations in progress */
> +	mmu_seq = kvm->mmu_notifier_seq;
> +	smp_rmb();
> +
> +	/* See if can find translation in our partition scoped tables for L1 */
> +	pte = __pte(0);
> +	spin_lock(&kvm->mmu_lock);
> +	pte_p = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
> +	if (!shift)
> +		shift = PAGE_SHIFT;
> +	if (pte_p)
> +		pte = *pte_p;
> +	spin_unlock(&kvm->mmu_lock);
> +
> +	if (!pte_present(pte) || (writing && !(pte_val(pte) & _PAGE_WRITE))) {
> +		/* No suitable pte found -> try to insert a mapping */
> +		ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot,
> +					writing, kvm_ro, &pte, &level);
> +		if (ret == -EAGAIN)
> +			return RESUME_GUEST;
> +		else if (ret)
> +			return ret;
> +		shift = kvmppc_radix_level_to_shift(level);
> +	}
> +
> +	/* 3. Compute the pte we need to insert for nest_gpa -> host r_addr */
> +
> +	/* The permissions are the combination of the host and l1 guest ptes */
> +	perm |= gpte.may_read ? 0UL : _PAGE_READ;
> +	perm |= gpte.may_write ? 0UL : _PAGE_WRITE;
> +	perm |= gpte.may_execute ? 0UL : _PAGE_EXEC;
> +	pte = __pte(pte_val(pte) & ~perm);
> +
> +	/* What size pte can we insert? */
> +	if (shift > l1_shift) {
> +		u64 mask;
> +		unsigned int actual_shift = PAGE_SHIFT;
> +		if (PMD_SHIFT < l1_shift)
> +			actual_shift = PMD_SHIFT;
> +		mask = (1UL << shift) - (1UL << actual_shift);
> +		pte = __pte(pte_val(pte) | (gpa & mask));
> +		shift = actual_shift;
> +	}
> +	level = kvmppc_radix_shift_to_level(shift);
> +	n_gpa &= ~((1UL << shift) - 1);
> +
> +	/* 4. Insert the pte into our shadow_pgtable */
> +
> +	ret = kvmppc_create_pte(kvm, gp->shadow_pgtable, pte, n_gpa, level,
> +				mmu_seq, gp->shadow_lpid);
> +	if (ret == -EAGAIN)
> +		ret = RESUME_GUEST;	/* Let the guest try again */
> +
> +	return ret;
> +
> + inval:
> +	kvmhv_invalidate_shadow_pte(vcpu, gp, n_gpa, NULL);
> +	return RESUME_GUEST;
> +}
> +
> +long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_nested_guest *gp = vcpu->arch.nested;
> +	long int ret;
> +
> +	mutex_lock(&gp->tlb_lock);
> +	ret = __kvmhv_nested_page_fault(vcpu, gp);
> +	mutex_unlock(&gp->tlb_lock);
> +	return ret;
> +}
> diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
> index fef3e1e..4c4dfc4 100644
> --- a/arch/powerpc/mm/tlb-radix.c
> +++ b/arch/powerpc/mm/tlb-radix.c
> @@ -833,6 +833,15 @@ EXPORT_SYMBOL_GPL(radix__flush_pwc_lpid);
>  /*
>   * Flush partition scoped translations from LPID (=LPIDR)
>   */
> +void radix__flush_tlb_lpid(unsigned int lpid)
> +{
> +	_tlbie_lpid(lpid, RIC_FLUSH_ALL);
> +}
> +EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid);
> +
> +/*
> + * Flush partition scoped translations from LPID (=LPIDR)
> + */
>  void radix__local_flush_tlb_lpid(unsigned int lpid)
>  {
>  	_tlbiel_lpid(lpid, RIC_FLUSH_ALL);

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

Attachment: signature.asc
Description: PGP signature


[Index of Archives]     [KVM Development]     [KVM ARM]     [KVM ia64]     [Linux Virtualization]     [Linux USB Devel]     [Linux Video]     [Linux Audio Users]     [Linux Kernel]     [Linux SCSI]     [Big List of Linux Books]

  Powered by Linux