Re: [PATCH v4 22/32] KVM: PPC: Book3S HV: Introduce rmap to track nested guest mappings

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Thu, Oct 04, 2018 at 09:55:59PM +1000, Paul Mackerras wrote:
> From: Suraj Jitindar Singh <sjitindarsingh@xxxxxxxxx>
> 
> When a host (L0) page which is mapped into a (L1) guest is in turn
> mapped through to a nested (L2) guest we keep a reverse mapping (rmap)
> so that these mappings can be retrieved later.
> 
> Whenever we create an entry in a shadow_pgtable for a nested guest we
> create a corresponding rmap entry and add it to the list for the
> L1 guest memslot at the index of the L1 guest page it maps. This means
> at the L1 guest memslot we end up with lists of rmaps.
> 
> When we are notified of a host page being invalidated which has been
> mapped through to a (L1) guest, we can then walk the rmap list for that
> guest page, and find and invalidate all of the corresponding
> shadow_pgtable entries.
> 
> In order to reduce memory consumption, we compress the information for
> each rmap entry down to 52 bits -- 12 bits for the LPID and 40 bits
> for the guest real page frame number -- which will fit in a single
> unsigned long.  To avoid a scenario where a guest can trigger
> unbounded memory allocations, we scan the list when adding an entry to
> see if there is already an entry with the contents we need.  This can
> occur, because we don't ever remove entries from the middle of a list.
> 
> A struct nested guest rmap consists of a list pointer and an rmap entry:
> ----------------
> | next pointer |
> ----------------
> | rmap entry   |
> ----------------
> 
> Thus the rmap pointer for each guest frame number in the memslot can be
> either NULL, a single entry, or a pointer to a list of nested rmap entries.
> 
> gfn	 memslot rmap array
>  	-------------------------
>  0	| NULL			|	(no rmap entry)
>  	-------------------------
>  1	| single rmap entry	|	(rmap entry with low bit set)
>  	-------------------------
>  2	| list head pointer	|	(list of rmap entries)
>  	-------------------------
> 
> The final entry always has the lowest bit set and is stored in the next
> pointer of the last list entry, or as a single rmap entry.
> A list of rmap entries therefore looks like this:
> 
> -----------------	-----------------	-------------------------
> | list head ptr	| ----> | next pointer	| ---->	| single rmap entry	|
> -----------------	-----------------	-------------------------
> 			| rmap entry	|	| rmap entry		|
> 			-----------------	-------------------------
> 
> Signed-off-by: Suraj Jitindar Singh <sjitindarsingh@xxxxxxxxx>
> Signed-off-by: Paul Mackerras <paulus@xxxxxxxxxx>

Reviewed-by: David Gibson <david@xxxxxxxxxxxxxxxxxxxxx>

> ---
>  arch/powerpc/include/asm/kvm_book3s.h    |   3 +
>  arch/powerpc/include/asm/kvm_book3s_64.h |  70 ++++++++++++++++-
>  arch/powerpc/kvm/book3s_64_mmu_radix.c   |  44 +++++++----
>  arch/powerpc/kvm/book3s_hv.c             |   1 +
>  arch/powerpc/kvm/book3s_hv_nested.c      | 130 ++++++++++++++++++++++++++++++-
>  5 files changed, 233 insertions(+), 15 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
> index 63f7ccf..d7aeb6f 100644
> --- a/arch/powerpc/include/asm/kvm_book3s.h
> +++ b/arch/powerpc/include/asm/kvm_book3s.h
> @@ -196,6 +196,9 @@ extern int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
>  			int table_index, u64 *pte_ret_p);
>  extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
>  			struct kvmppc_pte *gpte, bool data, bool iswrite);
> +extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
> +			unsigned int shift, struct kvm_memory_slot *memslot,
> +			unsigned int lpid);
>  extern bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable,
>  				    bool writing, unsigned long gpa,
>  				    unsigned int lpid);
> diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
> index 5496152..a02f0b3 100644
> --- a/arch/powerpc/include/asm/kvm_book3s_64.h
> +++ b/arch/powerpc/include/asm/kvm_book3s_64.h
> @@ -53,6 +53,66 @@ struct kvm_nested_guest {
>  	struct kvm_nested_guest *next;
>  };
>  
> +/*
> + * We define a nested rmap entry as a single 64-bit quantity
> + * 0xFFF0000000000000	12-bit lpid field
> + * 0x000FFFFFFFFFF000	40-bit guest 4k page frame number
> + * 0x0000000000000001	1-bit  single entry flag
> + */
> +#define RMAP_NESTED_LPID_MASK		0xFFF0000000000000UL
> +#define RMAP_NESTED_LPID_SHIFT		(52)
> +#define RMAP_NESTED_GPA_MASK		0x000FFFFFFFFFF000UL
> +#define RMAP_NESTED_IS_SINGLE_ENTRY	0x0000000000000001UL
> +
> +/* Structure for a nested guest rmap entry */
> +struct rmap_nested {
> +	struct llist_node list;
> +	u64 rmap;
> +};
> +
> +/*
> + * for_each_nest_rmap_safe - iterate over the list of nested rmap entries
> + *			     safe against removal of the list entry or NULL list
> + * @pos:	a (struct rmap_nested *) to use as a loop cursor
> + * @node:	pointer to the first entry
> + *		NOTE: this can be NULL
> + * @rmapp:	an (unsigned long *) in which to return the rmap entries on each
> + *		iteration
> + *		NOTE: this must point to already allocated memory
> + *
> + * The nested_rmap is a llist of (struct rmap_nested) entries pointed to by the
> + * rmap entry in the memslot. The list is always terminated by a "single entry"
> + * stored in the list element of the final entry of the llist. If there is ONLY
> + * a single entry then this is itself in the rmap entry of the memslot, not a
> + * llist head pointer.
> + *
> + * Note that the iterator below assumes that a nested rmap entry is always
> + * non-zero.  This is true for our usage because the LPID field is always
> + * non-zero (zero is reserved for the host).
> + *
> + * This should be used to iterate over the list of rmap_nested entries with
> + * processing done on the u64 rmap value given by each iteration. This is safe
> + * against removal of list entries and it is always safe to call free on (pos).
> + *
> + * e.g.
> + * struct rmap_nested *cursor;
> + * struct llist_node *first;
> + * unsigned long rmap;
> + * for_each_nest_rmap_safe(cursor, first, &rmap) {
> + *	do_something(rmap);
> + *	free(cursor);
> + * }
> + */
> +#define for_each_nest_rmap_safe(pos, node, rmapp)			       \
> +	for ((pos) = llist_entry((node), typeof(*(pos)), list);		       \
> +	     (node) &&							       \
> +	     (*(rmapp) = ((RMAP_NESTED_IS_SINGLE_ENTRY & ((u64) (node))) ?     \
> +			  ((u64) (node)) : ((pos)->rmap))) &&		       \
> +	     (((node) = ((RMAP_NESTED_IS_SINGLE_ENTRY & ((u64) (node))) ?      \
> +			 ((struct llist_node *) ((pos) = NULL)) :	       \
> +			 (pos)->list.next)), true);			       \
> +	     (pos) = llist_entry((node), typeof(*(pos)), list))
> +
>  struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid,
>  					  bool create);
>  void kvmhv_put_nested(struct kvm_nested_guest *gp);
> @@ -551,7 +611,15 @@ static inline void copy_to_checkpoint(struct kvm_vcpu *vcpu)
>  
>  extern int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
>  			     unsigned long gpa, unsigned int level,
> -			     unsigned long mmu_seq, unsigned int lpid);
> +			     unsigned long mmu_seq, unsigned int lpid,
> +			     unsigned long *rmapp, struct rmap_nested **n_rmap);
> +extern void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp,
> +				   struct rmap_nested **n_rmap);
> +extern void kvmhv_remove_nest_rmap_range(struct kvm *kvm,
> +				struct kvm_memory_slot *memslot,
> +				unsigned long gpa, unsigned long hpa,
> +				unsigned long nbytes);
> +extern void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free);
>  
>  #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
>  
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> index c4b1a9e..4c1eccb 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> @@ -256,27 +256,38 @@ static void kvmppc_pmd_free(pmd_t *pmdp)
>  	kmem_cache_free(kvm_pmd_cache, pmdp);
>  }
>  
> -void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
> -		      unsigned long gpa, unsigned int shift,
> -		      struct kvm_memory_slot *memslot,
> +/* Called with kvm->mmu_lock held */
> +void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
> +		      unsigned int shift, struct kvm_memory_slot *memslot,
>  		      unsigned int lpid)
>  
>  {
>  	unsigned long old;
> +	unsigned long gfn = gpa >> PAGE_SHIFT;
> +	unsigned long page_size = PAGE_SIZE;
> +	unsigned long hpa;
>  
>  	old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
>  	kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);
> -	if ((old & _PAGE_DIRTY) && (lpid == kvm->arch.lpid)) {
> -		unsigned long gfn = gpa >> PAGE_SHIFT;
> -		unsigned long page_size = PAGE_SIZE;
>  
> -		if (shift)
> -			page_size = 1ul << shift;
> +	/* The following only applies to L1 entries */
> +	if (lpid != kvm->arch.lpid)
> +		return;
> +
> +	if (!memslot) {
> +		memslot = gfn_to_memslot(kvm, gfn);
>  		if (!memslot)
> -			memslot = gfn_to_memslot(kvm, gfn);
> -		if (memslot && memslot->dirty_bitmap)
> -			kvmppc_update_dirty_map(memslot, gfn, page_size);
> +			return;
>  	}
> +	if (shift)
> +		page_size = 1ul << shift;
> +
> +	gpa &= ~(page_size - 1);
> +	hpa = old & PTE_RPN_MASK;
> +	kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size);
> +
> +	if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
> +		kvmppc_update_dirty_map(memslot, gfn, page_size);
>  }
>  
>  /*
> @@ -430,7 +441,8 @@ static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
>  
>  int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
>  		      unsigned long gpa, unsigned int level,
> -		      unsigned long mmu_seq, unsigned int lpid)
> +		      unsigned long mmu_seq, unsigned int lpid,
> +		      unsigned long *rmapp, struct rmap_nested **n_rmap)
>  {
>  	pgd_t *pgd;
>  	pud_t *pud, *new_pud = NULL;
> @@ -509,6 +521,8 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
>  			kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
>  		}
>  		kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
> +		if (rmapp && n_rmap)
> +			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
>  		ret = 0;
>  		goto out_unlock;
>  	}
> @@ -559,6 +573,8 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
>  			kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
>  		}
>  		kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
> +		if (rmapp && n_rmap)
> +			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
>  		ret = 0;
>  		goto out_unlock;
>  	}
> @@ -583,6 +599,8 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
>  		goto out_unlock;
>  	}
>  	kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
> +	if (rmapp && n_rmap)
> +		kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
>  	ret = 0;
>  
>   out_unlock:
> @@ -710,7 +728,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
>  
>  	/* Allocate space in the tree and write the PTE */
>  	ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
> -				mmu_seq, kvm->arch.lpid);
> +				mmu_seq, kvm->arch.lpid, NULL, NULL);
>  	if (inserted_pte)
>  		*inserted_pte = pte;
>  	if (levelp)
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 134d7c7..2d8209a 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -4269,6 +4269,7 @@ static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free,
>  					struct kvm_memory_slot *dont)
>  {
>  	if (!dont || free->arch.rmap != dont->arch.rmap) {
> +		kvmhv_free_memslot_nest_rmap(free);
>  		vfree(free->arch.rmap);
>  		free->arch.rmap = NULL;
>  	}
> diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
> index 9c04242..3947aa5 100644
> --- a/arch/powerpc/kvm/book3s_hv_nested.c
> +++ b/arch/powerpc/kvm/book3s_hv_nested.c
> @@ -10,6 +10,7 @@
>  
>  #include <linux/kernel.h>
>  #include <linux/kvm_host.h>
> +#include <linux/llist.h>
>  
>  #include <asm/kvm_ppc.h>
>  #include <asm/kvm_book3s.h>
> @@ -541,6 +542,123 @@ void kvmhv_put_nested(struct kvm_nested_guest *gp)
>  		kvmhv_release_nested(gp);
>  }
>  
> +static struct kvm_nested_guest *kvmhv_find_nested(struct kvm *kvm, int lpid)
> +{
> +	if (lpid > kvm->arch.max_nested_lpid)
> +		return NULL;
> +	return kvm->arch.nested_guests[lpid];
> +}
> +
> +static inline bool kvmhv_n_rmap_is_equal(u64 rmap_1, u64 rmap_2)
> +{
> +	return !((rmap_1 ^ rmap_2) & (RMAP_NESTED_LPID_MASK |
> +				       RMAP_NESTED_GPA_MASK));
> +}
> +
> +void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp,
> +			    struct rmap_nested **n_rmap)
> +{
> +	struct llist_node *entry = ((struct llist_head *) rmapp)->first;
> +	struct rmap_nested *cursor;
> +	u64 rmap, new_rmap = (*n_rmap)->rmap;
> +
> +	/* Are there any existing entries? */
> +	if (!(*rmapp)) {
> +		/* No -> use the rmap as a single entry */
> +		*rmapp = new_rmap | RMAP_NESTED_IS_SINGLE_ENTRY;
> +		return;
> +	}
> +
> +	/* Do any entries match what we're trying to insert? */
> +	for_each_nest_rmap_safe(cursor, entry, &rmap) {
> +		if (kvmhv_n_rmap_is_equal(rmap, new_rmap))
> +			return;
> +	}
> +
> +	/* Do we need to create a list or just add the new entry? */
> +	rmap = *rmapp;
> +	if (rmap & RMAP_NESTED_IS_SINGLE_ENTRY) /* Not previously a list */
> +		*rmapp = 0UL;
> +	llist_add(&((*n_rmap)->list), (struct llist_head *) rmapp);
> +	if (rmap & RMAP_NESTED_IS_SINGLE_ENTRY) /* Not previously a list */
> +		(*n_rmap)->list.next = (struct llist_node *) rmap;
> +
> +	/* Set NULL so not freed by caller */
> +	*n_rmap = NULL;
> +}
> +
> +static void kvmhv_remove_nest_rmap(struct kvm *kvm, u64 n_rmap,
> +				   unsigned long hpa, unsigned long mask)
> +{
> +	struct kvm_nested_guest *gp;
> +	unsigned long gpa;
> +	unsigned int shift, lpid;
> +	pte_t *ptep;
> +
> +	gpa = n_rmap & RMAP_NESTED_GPA_MASK;
> +	lpid = (n_rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT;
> +	gp = kvmhv_find_nested(kvm, lpid);
> +	if (!gp)
> +		return;
> +
> +	/* Find and invalidate the pte */
> +	ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift);
> +	/* Don't spuriously invalidate ptes if the pfn has changed */
> +	if (ptep && pte_present(*ptep) && ((pte_val(*ptep) & mask) == hpa))
> +		kvmppc_unmap_pte(kvm, ptep, gpa, shift, NULL, gp->shadow_lpid);
> +}
> +
> +static void kvmhv_remove_nest_rmap_list(struct kvm *kvm, unsigned long *rmapp,
> +					unsigned long hpa, unsigned long mask)
> +{
> +	struct llist_node *entry = llist_del_all((struct llist_head *) rmapp);
> +	struct rmap_nested *cursor;
> +	unsigned long rmap;
> +
> +	for_each_nest_rmap_safe(cursor, entry, &rmap) {
> +		kvmhv_remove_nest_rmap(kvm, rmap, hpa, mask);
> +		kfree(cursor);
> +	}
> +}
> +
> +/* called with kvm->mmu_lock held */
> +void kvmhv_remove_nest_rmap_range(struct kvm *kvm,
> +				  struct kvm_memory_slot *memslot,
> +				  unsigned long gpa, unsigned long hpa,
> +				  unsigned long nbytes)
> +{
> +	unsigned long gfn, end_gfn;
> +	unsigned long addr_mask;
> +
> +	if (!memslot)
> +		return;
> +	gfn = (gpa >> PAGE_SHIFT) - memslot->base_gfn;
> +	end_gfn = gfn + (nbytes >> PAGE_SHIFT);
> +
> +	addr_mask = PTE_RPN_MASK & ~(nbytes - 1);
> +	hpa &= addr_mask;
> +
> +	for (; gfn < end_gfn; gfn++) {
> +		unsigned long *rmap = &memslot->arch.rmap[gfn];
> +		kvmhv_remove_nest_rmap_list(kvm, rmap, hpa, addr_mask);
> +	}
> +}
> +
> +void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free)
> +{
> +	unsigned long page;
> +
> +	for (page = 0; page < free->npages; page++) {
> +		unsigned long rmap, *rmapp = &free->arch.rmap[page];
> +		struct rmap_nested *cursor;
> +		struct llist_node *entry;
> +
> +		entry = llist_del_all((struct llist_head *) rmapp);
> +		for_each_nest_rmap_safe(cursor, entry, &rmap)
> +			kfree(cursor);
> +	}
> +}
> +
>  static bool kvmhv_invalidate_shadow_pte(struct kvm_vcpu *vcpu,
>  					struct kvm_nested_guest *gp,
>  					long gpa, int *shift_ret)
> @@ -692,11 +810,13 @@ static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu,
>  {
>  	struct kvm *kvm = vcpu->kvm;
>  	struct kvm_memory_slot *memslot;
> +	struct rmap_nested *n_rmap;
>  	struct kvmppc_pte gpte;
>  	pte_t pte, *pte_p;
>  	unsigned long mmu_seq;
>  	unsigned long dsisr = vcpu->arch.fault_dsisr;
>  	unsigned long ea = vcpu->arch.fault_dar;
> +	unsigned long *rmapp;
>  	unsigned long n_gpa, gpa, gfn, perm = 0UL;
>  	unsigned int shift, l1_shift, level;
>  	bool writing = !!(dsisr & DSISR_ISSTORE);
> @@ -830,8 +950,16 @@ static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu,
>  
>  	/* 4. Insert the pte into our shadow_pgtable */
>  
> +	n_rmap = kzalloc(sizeof(*n_rmap), GFP_KERNEL);
> +	if (!n_rmap)
> +		return RESUME_GUEST; /* Let the guest try again */
> +	n_rmap->rmap = (n_gpa & RMAP_NESTED_GPA_MASK) |
> +		(((unsigned long) gp->l1_lpid) << RMAP_NESTED_LPID_SHIFT);
> +	rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
>  	ret = kvmppc_create_pte(kvm, gp->shadow_pgtable, pte, n_gpa, level,
> -				mmu_seq, gp->shadow_lpid);
> +				mmu_seq, gp->shadow_lpid, rmapp, &n_rmap);
> +	if (n_rmap)
> +		kfree(n_rmap);
>  	if (ret == -EAGAIN)
>  		ret = RESUME_GUEST;	/* Let the guest try again */
>  

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

Attachment: signature.asc
Description: PGP signature


[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux