On Mon, Oct 08, 2018 at 04:31:08PM +1100, Paul Mackerras wrote:
> From: Suraj Jitindar Singh <sjitindarsingh@xxxxxxxxx>
> 
> When a host (L0) page which is mapped into a (L1) guest is in turn
> mapped through to a nested (L2) guest we keep a reverse mapping (rmap)
> so that these mappings can be retrieved later.
> 
> Whenever we create an entry in a shadow_pgtable for a nested guest we
> create a corresponding rmap entry and add it to the list for the
> L1 guest memslot at the index of the L1 guest page it maps. This means
> at the L1 guest memslot we end up with lists of rmaps.
> 
> When we are notified of a host page being invalidated which has been
> mapped through to a (L1) guest, we can then walk the rmap list for that
> guest page, and find and invalidate all of the corresponding
> shadow_pgtable entries.
> 
> In order to reduce memory consumption, we compress the information for
> each rmap entry down to 52 bits -- 12 bits for the LPID and 40 bits
> for the guest real page frame number -- which will fit in a single
> unsigned long. To avoid a scenario where a guest can trigger
> unbounded memory allocations, we scan the list when adding an entry to
> see if there is already an entry with the contents we need. This can
> occur, because we don't ever remove entries from the middle of a list.
> 
> A struct nested guest rmap is a list pointer and an rmap entry;
> ----------------
> | next pointer |
> ----------------
> | rmap entry   |
> ----------------
> 
> Thus the rmap pointer for each guest frame number in the memslot can be
> either NULL, a single entry, or a pointer to a list of nested rmap entries.
> 
> gfn      memslot rmap array
>         -------------------------
>  0      | NULL                  |    (no rmap entry)
>         -------------------------
>  1      | single rmap entry     |    (rmap entry with low bit set)
>         -------------------------
>  2      | list head pointer     |    (list of rmap entries)
>         -------------------------
> 
> The final entry always has the lowest bit set and is stored in the next
> pointer of the last list entry, or as a single rmap entry.
> With a list of rmap entries looking like;
> 
> -----------------        -----------------        -------------------------
> | list head ptr  | ----> | next pointer   | ----> | single rmap entry      |
> -----------------        -----------------        -------------------------
>                          | rmap entry     |       | rmap entry             |
>                          -----------------        -------------------------
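
For anyone following along, the 52-bit packing described above is easy to
see in a small standalone sketch.  This is purely an illustration reusing
the mask values the patch defines; the encode/decode helpers below are
made up for the example and are not part of the patch:

#include <stdint.h>
#include <stdio.h>

#define RMAP_NESTED_LPID_MASK   0xFFF0000000000000UL
#define RMAP_NESTED_LPID_SHIFT  52
#define RMAP_NESTED_GPA_MASK    0x000FFFFFFFFFF000UL

/* hypothetical helper: pack an L1 lpid and a nested guest physical address */
static uint64_t encode_nest_rmap(unsigned int l1_lpid, uint64_t n_gpa)
{
        return (n_gpa & RMAP_NESTED_GPA_MASK) |
               ((uint64_t)l1_lpid << RMAP_NESTED_LPID_SHIFT);
}

/* hypothetical helper: recover the lpid and 4k-aligned gpa from an entry */
static void decode_nest_rmap(uint64_t rmap, unsigned int *lpid, uint64_t *gpa)
{
        *lpid = (rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT;
        *gpa = rmap & RMAP_NESTED_GPA_MASK;
}

int main(void)
{
        unsigned int lpid;
        uint64_t gpa;
        uint64_t rmap = encode_nest_rmap(5, 0x123456789000UL);

        decode_nest_rmap(rmap, &lpid, &gpa);
        printf("rmap=0x%016llx lpid=%u gpa=0x%llx\n",
               (unsigned long long)rmap, lpid, (unsigned long long)gpa);
        return 0;
}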
> 
> Signed-off-by: Suraj Jitindar Singh <sjitindarsingh@xxxxxxxxx>
> Signed-off-by: Paul Mackerras <paulus@xxxxxxxxxx>

Reviewed-by: David Gibson <david@xxxxxxxxxxxxxxxxxxxxx>

> ---
>  arch/powerpc/include/asm/kvm_book3s.h    |   3 +
>  arch/powerpc/include/asm/kvm_book3s_64.h |  69 +++++++++++++++-
>  arch/powerpc/kvm/book3s_64_mmu_radix.c   |  44 +++++++---
>  arch/powerpc/kvm/book3s_hv.c             |   1 +
>  arch/powerpc/kvm/book3s_hv_nested.c      | 138 ++++++++++++++++++++++++++++++-
>  5 files changed, 240 insertions(+), 15 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
> index 63f7ccf..d7aeb6f 100644
> --- a/arch/powerpc/include/asm/kvm_book3s.h
> +++ b/arch/powerpc/include/asm/kvm_book3s.h
> @@ -196,6 +196,9 @@ extern int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
>                                              int table_index, u64 *pte_ret_p);
>  extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
>                          struct kvmppc_pte *gpte, bool data, bool iswrite);
> +extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
> +                        unsigned int shift, struct kvm_memory_slot *memslot,
> +                        unsigned int lpid);
>  extern bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable,
>                                      bool writing, unsigned long gpa,
>                                      unsigned int lpid);
> diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
> index 5496152..c2a9146 100644
> --- a/arch/powerpc/include/asm/kvm_book3s_64.h
> +++ b/arch/powerpc/include/asm/kvm_book3s_64.h
> @@ -53,6 +53,66 @@ struct kvm_nested_guest {
>          struct kvm_nested_guest *next;
>  };
>  
> +/*
> + * We define a nested rmap entry as a single 64-bit quantity
> + * 0xFFF0000000000000   12-bit lpid field
> + * 0x000FFFFFFFFFF000   40-bit guest 4k page frame number
> + * 0x0000000000000001   1-bit  single entry flag
> + */
> +#define RMAP_NESTED_LPID_MASK           0xFFF0000000000000UL
> +#define RMAP_NESTED_LPID_SHIFT          (52)
> +#define RMAP_NESTED_GPA_MASK            0x000FFFFFFFFFF000UL
> +#define RMAP_NESTED_IS_SINGLE_ENTRY     0x0000000000000001UL
> +
> +/* Structure for a nested guest rmap entry */
> +struct rmap_nested {
> +        struct llist_node list;
> +        u64 rmap;
> +};
> +
> +/*
> + * for_each_nest_rmap_safe - iterate over the list of nested rmap entries
> + *                           safe against removal of the list entry or NULL list
> + * @pos:        a (struct rmap_nested *) to use as a loop cursor
> + * @node:       pointer to the first entry
> + *              NOTE: this can be NULL
> + * @rmapp:      an (unsigned long *) in which to return the rmap entries on each
> + *              iteration
> + *              NOTE: this must point to already allocated memory
> + *
> + * The nested_rmap is a llist of (struct rmap_nested) entries pointed to by the
> + * rmap entry in the memslot. The list is always terminated by a "single entry"
> + * stored in the list element of the final entry of the llist. If there is ONLY
> + * a single entry then this is itself in the rmap entry of the memslot, not a
> + * llist head pointer.
> + *
> + * Note that the iterator below assumes that a nested rmap entry is always
> + * non-zero.  This is true for our usage because the LPID field is always
> + * non-zero (zero is reserved for the host).
> + *
> + * This should be used to iterate over the list of rmap_nested entries with
> + * processing done on the u64 rmap value given by each iteration. This is safe
> + * against removal of list entries and it is always safe to call free on (pos).
> + *
> + * e.g.
> + * struct rmap_nested *cursor;
> + * struct llist_node *first;
> + * unsigned long rmap;
> + * for_each_nest_rmap_safe(cursor, first, &rmap) {
> + *         do_something(rmap);
> + *         free(cursor);
> + * }
> + */
> +#define for_each_nest_rmap_safe(pos, node, rmapp)                             \
> +        for ((pos) = llist_entry((node), typeof(*(pos)), list);               \
> +             (node) &&                                                         \
> +             (*(rmapp) = ((RMAP_NESTED_IS_SINGLE_ENTRY & ((u64) (node))) ?    \
> +                          ((u64) (node)) : ((pos)->rmap))) &&                 \
> +             (((node) = ((RMAP_NESTED_IS_SINGLE_ENTRY & ((u64) (node))) ?     \
> +                         ((struct llist_node *) ((pos) = NULL)) :             \
> +                         (pos)->list.next)), true);                           \
> +             (pos) = llist_entry((node), typeof(*(pos)), list))
> +
>  struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid,
>                                            bool create);
>  void kvmhv_put_nested(struct kvm_nested_guest *gp);
> @@ -551,7 +611,14 @@ static inline void copy_to_checkpoint(struct kvm_vcpu *vcpu)
>  
>  extern int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
>                               unsigned long gpa, unsigned int level,
> -                             unsigned long mmu_seq, unsigned int lpid);
> +                             unsigned long mmu_seq, unsigned int lpid,
> +                             unsigned long *rmapp, struct rmap_nested **n_rmap);
> +extern void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp,
> +                                   struct rmap_nested **n_rmap);
> +extern void kvmhv_remove_nest_rmap_range(struct kvm *kvm,
> +                                struct kvm_memory_slot *memslot,
> +                                unsigned long gpa, unsigned long hpa,
> +                                unsigned long nbytes);
>  
>  #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
>  
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> index c4b1a9e..4c1eccb 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> @@ -256,27 +256,38 @@ static void kvmppc_pmd_free(pmd_t *pmdp)
>          kmem_cache_free(kvm_pmd_cache, pmdp);
>  }
>  
> -void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
> -                      unsigned long gpa, unsigned int shift,
> -                      struct kvm_memory_slot *memslot,
> +/* Called with kvm->mmu_lock held */
> +void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
> +                      unsigned int shift, struct kvm_memory_slot *memslot,
>                        unsigned int lpid)
>  
>  {
>          unsigned long old;
> +        unsigned long gfn = gpa >> PAGE_SHIFT;
> +        unsigned long page_size = PAGE_SIZE;
> +        unsigned long hpa;
>  
>          old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
>          kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);
> -        if ((old & _PAGE_DIRTY) && (lpid == kvm->arch.lpid)) {
> -                unsigned long gfn = gpa >> PAGE_SHIFT;
> -                unsigned long page_size = PAGE_SIZE;
>  
> -                if (shift)
> -                        page_size = 1ul << shift;
> +        /* The following only applies to L1 entries */
> +        if (lpid != kvm->arch.lpid)
> +                return;
> +
> +        if (!memslot) {
> +                memslot = gfn_to_memslot(kvm, gfn);
>                  if (!memslot)
> -                        memslot = gfn_to_memslot(kvm, gfn);
> -                if (memslot && memslot->dirty_bitmap)
> -                        kvmppc_update_dirty_map(memslot, gfn, page_size);
> +                        return;
>          }
> +        if (shift)
> +                page_size = 1ul << shift;
> +
> +        gpa &= ~(page_size - 1);
> +        hpa = old & PTE_RPN_MASK;
> +        kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size);
> +
> +        if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
> +                kvmppc_update_dirty_map(memslot, gfn, page_size);
>  }
>  
>  /*
> @@ -430,7 +441,8 @@ static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
>  
>  int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
>                        unsigned long gpa, unsigned int level,
> -                      unsigned long mmu_seq, unsigned int lpid)
> +                      unsigned long mmu_seq, unsigned int lpid,
> +                      unsigned long *rmapp, struct rmap_nested **n_rmap)
>  {
>          pgd_t *pgd;
>          pud_t *pud, *new_pud = NULL;
> @@ -509,6 +521,8 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
>                          kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
>                  }
>                  kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
> +                if (rmapp && n_rmap)
> +                        kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
>                  ret = 0;
>                  goto out_unlock;
>          }
> @@ -559,6 +573,8 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
>                          kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
>                  }
>                  kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
> +                if (rmapp && n_rmap)
> +                        kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
>                  ret = 0;
>                  goto out_unlock;
>          }
> @@ -583,6 +599,8 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
>                  goto out_unlock;
>          }
>          kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
> +        if (rmapp && n_rmap)
> +                kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
>          ret = 0;
>  
>  out_unlock:
> @@ -710,7 +728,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
>  
>          /* Allocate space in the tree and write the PTE */
>          ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
> -                                mmu_seq, kvm->arch.lpid);
> +                                mmu_seq, kvm->arch.lpid, NULL, NULL);
>          if (inserted_pte)
>                  *inserted_pte = pte;
>          if (levelp)
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index dc25461..cb9e738 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -4482,6 +4482,7 @@ int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
>          kvmppc_free_hpt(&kvm->arch.hpt);
>          kvmppc_update_lpcr(kvm, LPCR_UPRT | LPCR_GTSE | LPCR_HR,
>                             LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
> +        kvmppc_rmap_reset(kvm);
>          kvm->arch.radix = 1;
>          return 0;
>  }
> diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
> index 21a210c..3fa676b 100644
> --- a/arch/powerpc/kvm/book3s_hv_nested.c
> +++ b/arch/powerpc/kvm/book3s_hv_nested.c
> @@ -10,6 +10,7 @@
>  
>  #include <linux/kernel.h>
>  #include <linux/kvm_host.h>
> +#include <linux/llist.h>
>  
>  #include <asm/kvm_ppc.h>
>  #include <asm/kvm_book3s.h>
> @@ -22,6 +23,7 @@
>  static struct patb_entry *pseries_partition_tb;
>  
>  static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp);
> +static void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free);
>  
>  void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
>  {
> @@ -456,6 +458,8 @@ void kvmhv_release_all_nested(struct kvm *kvm)
>          int i;
>          struct kvm_nested_guest *gp;
>          struct kvm_nested_guest *freelist = NULL;
> +        struct kvm_memory_slot *memslot;
> +        int srcu_idx;
>  
>          spin_lock(&kvm->mmu_lock);
>          for (i = 0; i <= kvm->arch.max_nested_lpid; i++) {
> @@ -474,6 +478,11 @@ void kvmhv_release_all_nested(struct kvm *kvm)
>                  freelist = gp->next;
>                  kvmhv_release_nested(gp);
>          }
> +
> +        srcu_idx = srcu_read_lock(&kvm->srcu);
> +        kvm_for_each_memslot(memslot, kvm_memslots(kvm))
> +                kvmhv_free_memslot_nest_rmap(memslot);
> +        srcu_read_unlock(&kvm->srcu, srcu_idx);
>  }
>  
>  /* caller must hold gp->tlb_lock */
> @@ -544,6 +553,123 @@ void kvmhv_put_nested(struct kvm_nested_guest *gp)
>                  kvmhv_release_nested(gp);
>  }
>  
> +static struct kvm_nested_guest *kvmhv_find_nested(struct kvm *kvm, int lpid)
> +{
> +        if (lpid > kvm->arch.max_nested_lpid)
> +                return NULL;
> +        return kvm->arch.nested_guests[lpid];
> +}
> +
> +static inline bool kvmhv_n_rmap_is_equal(u64 rmap_1, u64 rmap_2)
> +{
> +        return !((rmap_1 ^ rmap_2) & (RMAP_NESTED_LPID_MASK |
> +                                      RMAP_NESTED_GPA_MASK));
> +}
> +
> +void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp,
> +                            struct rmap_nested **n_rmap)
> +{
> +        struct llist_node *entry = ((struct llist_head *) rmapp)->first;
> +        struct rmap_nested *cursor;
> +        u64 rmap, new_rmap = (*n_rmap)->rmap;
> +
> +        /* Are there any existing entries? */
> +        if (!(*rmapp)) {
> +                /* No -> use the rmap as a single entry */
> +                *rmapp = new_rmap | RMAP_NESTED_IS_SINGLE_ENTRY;
> +                return;
> +        }
> +
> +        /* Do any entries match what we're trying to insert? */
> +        for_each_nest_rmap_safe(cursor, entry, &rmap) {
> +                if (kvmhv_n_rmap_is_equal(rmap, new_rmap))
> +                        return;
> +        }
> +
> +        /* Do we need to create a list or just add the new entry? */
> +        rmap = *rmapp;
> +        if (rmap & RMAP_NESTED_IS_SINGLE_ENTRY) /* Not previously a list */
> +                *rmapp = 0UL;
> +        llist_add(&((*n_rmap)->list), (struct llist_head *) rmapp);
> +        if (rmap & RMAP_NESTED_IS_SINGLE_ENTRY) /* Not previously a list */
> +                (*n_rmap)->list.next = (struct llist_node *) rmap;
> +
> +        /* Set NULL so not freed by caller */
> +        *n_rmap = NULL;
> +}
> +
> +static void kvmhv_remove_nest_rmap(struct kvm *kvm, u64 n_rmap,
> +                                   unsigned long hpa, unsigned long mask)
> +{
> +        struct kvm_nested_guest *gp;
> +        unsigned long gpa;
> +        unsigned int shift, lpid;
> +        pte_t *ptep;
> +
> +        gpa = n_rmap & RMAP_NESTED_GPA_MASK;
> +        lpid = (n_rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT;
> +        gp = kvmhv_find_nested(kvm, lpid);
> +        if (!gp)
> +                return;
> +
> +        /* Find and invalidate the pte */
> +        ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift);
> +        /* Don't spuriously invalidate ptes if the pfn has changed */
> +        if (ptep && pte_present(*ptep) && ((pte_val(*ptep) & mask) == hpa))
> +                kvmppc_unmap_pte(kvm, ptep, gpa, shift, NULL, gp->shadow_lpid);
> +}
> +
> +static void kvmhv_remove_nest_rmap_list(struct kvm *kvm, unsigned long *rmapp,
> +                                        unsigned long hpa, unsigned long mask)
> +{
> +        struct llist_node *entry = llist_del_all((struct llist_head *) rmapp);
> +        struct rmap_nested *cursor;
> +        unsigned long rmap;
> +
> +        for_each_nest_rmap_safe(cursor, entry, &rmap) {
> +                kvmhv_remove_nest_rmap(kvm, rmap, hpa, mask);
> +                kfree(cursor);
> +        }
> +}
> +
> +/* called with kvm->mmu_lock held */
> +void kvmhv_remove_nest_rmap_range(struct kvm *kvm,
> +                                  struct kvm_memory_slot *memslot,
> +                                  unsigned long gpa, unsigned long hpa,
> +                                  unsigned long nbytes)
> +{
> +        unsigned long gfn, end_gfn;
> +        unsigned long addr_mask;
> +
> +        if (!memslot)
> +                return;
> +        gfn = (gpa >> PAGE_SHIFT) - memslot->base_gfn;
> +        end_gfn = gfn + (nbytes >> PAGE_SHIFT);
> +
> +        addr_mask = PTE_RPN_MASK & ~(nbytes - 1);
> +        hpa &= addr_mask;
> +
> +        for (; gfn < end_gfn; gfn++) {
> +                unsigned long *rmap = &memslot->arch.rmap[gfn];
> +                kvmhv_remove_nest_rmap_list(kvm, rmap, hpa, addr_mask);
> +        }
> +}
> +
> +static void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free)
> +{
> +        unsigned long page;
> +
> +        for (page = 0; page < free->npages; page++) {
> +                unsigned long rmap, *rmapp = &free->arch.rmap[page];
> +                struct rmap_nested *cursor;
> +                struct llist_node *entry;
> +
> +                entry = llist_del_all((struct llist_head *) rmapp);
> +                for_each_nest_rmap_safe(cursor, entry, &rmap)
> +                        kfree(cursor);
> +        }
> +}
> +
>  static bool kvmhv_invalidate_shadow_pte(struct kvm_vcpu *vcpu,
>                                          struct kvm_nested_guest *gp,
>                                          long gpa, int *shift_ret)
> @@ -695,11 +821,13 @@ static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu,
>  {
>          struct kvm *kvm = vcpu->kvm;
>          struct kvm_memory_slot *memslot;
> +        struct rmap_nested *n_rmap;
>          struct kvmppc_pte gpte;
>          pte_t pte, *pte_p;
>          unsigned long mmu_seq;
>          unsigned long dsisr = vcpu->arch.fault_dsisr;
>          unsigned long ea = vcpu->arch.fault_dar;
> +        unsigned long *rmapp;
>          unsigned long n_gpa, gpa, gfn, perm = 0UL;
>          unsigned int shift, l1_shift, level;
>          bool writing = !!(dsisr & DSISR_ISSTORE);
> @@ -833,8 +961,16 @@ static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu,
>  
>          /* 4. Insert the pte into our shadow_pgtable */
>  
> +        n_rmap = kzalloc(sizeof(*n_rmap), GFP_KERNEL);
> +        if (!n_rmap)
> +                return RESUME_GUEST; /* Let the guest try again */
> +        n_rmap->rmap = (n_gpa & RMAP_NESTED_GPA_MASK) |
> +                (((unsigned long) gp->l1_lpid) << RMAP_NESTED_LPID_SHIFT);
> +        rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
>          ret = kvmppc_create_pte(kvm, gp->shadow_pgtable, pte, n_gpa, level,
> -                                mmu_seq, gp->shadow_lpid);
> +                                mmu_seq, gp->shadow_lpid, rmapp, &n_rmap);
> +        if (n_rmap)
> +                kfree(n_rmap);
>          if (ret == -EAGAIN)
>                  ret = RESUME_GUEST; /* Let the guest try again */
> 
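The one subtle part of the scheme is the way the per-gfn rmap word doubles
as either a tagged single entry or an llist head.  A rough userspace model
of that behaviour, using plain pointers instead of llist and leaving out
the duplicate scan and locking (my simplification, not the patch's code),
looks like this:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define RMAP_NESTED_IS_SINGLE_ENTRY 0x1UL

/* stand-in for struct rmap_nested; "next" plays the role of llist_node */
struct nrmap {
        struct nrmap *next;
        uint64_t rmap;
};

/* *rmapp models one word of the memslot rmap array */
static void insert_nest_rmap(unsigned long *rmapp, struct nrmap *n)
{
        unsigned long old = *rmapp;

        if (!old) {
                /* empty slot: store the value itself, tagged with the low bit */
                *rmapp = n->rmap | RMAP_NESTED_IS_SINGLE_ENTRY;
                free(n);        /* the patch leaves this free to the caller */
                return;
        }
        /*
         * Either promote a single entry (old is the tagged value, which
         * becomes the list terminator) or push onto an existing list
         * (old is the head pointer); both cases chain old behind n.
         */
        n->next = (struct nrmap *)old;
        *rmapp = (unsigned long)n;
}

/* walk the chain: a tagged word is the final encoded entry, else a node */
static void walk_nest_rmap(unsigned long word)
{
        while (word) {
                if (word & RMAP_NESTED_IS_SINGLE_ENTRY) {
                        printf("entry 0x%lx\n",
                               word & ~RMAP_NESTED_IS_SINGLE_ENTRY);
                        break;
                }
                struct nrmap *n = (struct nrmap *)word;
                printf("entry 0x%llx\n", (unsigned long long)n->rmap);
                word = (unsigned long)n->next;
        }
}

int main(void)
{
        unsigned long slot = 0;
        struct nrmap *a = calloc(1, sizeof(*a));
        struct nrmap *b = calloc(1, sizeof(*b));

        a->rmap = 0x0050000000001000UL;   /* lpid 5, gpa 0x1000 */
        b->rmap = 0x0060000000002000UL;   /* lpid 6, gpa 0x2000 */
        insert_nest_rmap(&slot, a);       /* slot becomes a tagged single entry */
        insert_nest_rmap(&slot, b);       /* slot is promoted to a two-entry list */
        walk_nest_rmap(slot);
        return 0;
}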
-- 
David Gibson                    | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
                                | _way_ _around_!
http://www.ozlabs.org/~dgibson