Iteration using rmap_next(), whose actual body is pte_list_next(), is
inefficient: every time we call it we start again from scratch, checking
whether the rmap holds a single spte or points to a descriptor which links
more sptes.

In the case of shadow paging, this quadratic total iteration cost is a
problem.  Even for two dimensional paging, with EPT/NPT on, in which we
almost always have a single spte, the extra checks at the end of the
iteration should be eliminated.

This patch fixes this by introducing rmap_iterator, which keeps the
iteration context for the next search.  Furthermore, the implementation of
rmap_next() is split into two functions, rmap_get_first() and
rmap_get_next(), to avoid repeatedly checking whether the rmap being
iterated on has only one spte.

Note: we can simply remove pte_list_next(), because parent_ptes can be
treated as just another reverse mapping and walked with the new iterator
as well.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@xxxxxxxxxxxxx>
---
 arch/x86/kvm/mmu.c       |  198 ++++++++++++++++++++++++++++------------------
 arch/x86/kvm/mmu_audit.c |    8 +-
 2 files changed, 124 insertions(+), 82 deletions(-)
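[Not part of the patch -- an illustrative aside below the tear line, which
git-am will ignore.  The following stand-alone sketch mirrors the low-bit
tagging of the rmap word and the rmap_get_first()/rmap_get_next() logic
added by this patch, but as user-space C: the PTE_LIST_EXT value, the u64
typedef, and everything in main() are made-up stand-ins for the kernel
types, not kernel code.

/*
 * Minimal user-space model of the rmap iterator introduced by this patch.
 * Bit 0 of the rmap word distinguishes "single spte pointer" (bit clear)
 * from "pointer to a pte_list_desc" (bit set), as in mmu.c.
 */
#include <stdbool.h>
#include <stdio.h>

#define PTE_LIST_EXT 4			/* illustrative; see mmu.c for the real value */

typedef unsigned long long u64;

struct pte_list_desc {
	u64 *sptes[PTE_LIST_EXT];
	struct pte_list_desc *more;
};

struct rmap_iterator {
	u64 *sptep;			/* points to the current spte */
	struct pte_list_desc *desc;	/* holds the sptep if not NULL */
	int pos;			/* index of the sptep */
};

static bool rmap_get_first(unsigned long rmap, struct rmap_iterator *iter)
{
	if (!rmap) {
		iter->sptep = NULL;
		return false;
	}

	if (!(rmap & 1)) {		/* single spte: rmap is the pointer itself */
		iter->sptep = (u64 *)rmap;
		iter->desc = NULL;
	} else {			/* bit 0 set: rmap points to a descriptor */
		iter->desc = (struct pte_list_desc *)(rmap & ~1ul);
		iter->pos = 0;
		iter->sptep = iter->desc->sptes[0];
	}

	return true;
}

static bool rmap_get_next(struct rmap_iterator *iter)
{
	if (iter->desc) {
		if (iter->pos < PTE_LIST_EXT - 1) {
			++iter->pos;
			iter->sptep = iter->desc->sptes[iter->pos];
			if (iter->sptep)
				return true;
		}

		iter->desc = iter->desc->more;

		if (iter->desc) {
			iter->pos = 0;
			iter->sptep = iter->desc->sptes[0];	/* never NULL */
			return true;
		}
	}

	iter->sptep = NULL;
	return false;
}

int main(void)
{
	u64 sptes[3] = { 0x111, 0x222, 0x333 };
	struct pte_list_desc desc = {
		.sptes = { &sptes[0], &sptes[1], &sptes[2], NULL },
		.more  = NULL,
	};
	struct rmap_iterator iter;
	unsigned long rmap;

	/* Single-spte case: the rmap word is the spte pointer, bit 0 clear. */
	rmap = (unsigned long)&sptes[0];
	for (rmap_get_first(rmap, &iter); iter.sptep; rmap_get_next(&iter))
		printf("single: spte %p = %llx\n", (void *)iter.sptep, *iter.sptep);

	/* Multi-spte case: the rmap word carries the desc pointer, bit 0 set. */
	rmap = (unsigned long)&desc | 1ul;
	for (rmap_get_first(rmap, &iter); iter.sptep; rmap_get_next(&iter))
		printf("desc:   spte %p = %llx\n", (void *)iter.sptep, *iter.sptep);

	return 0;
}

Built with a plain "gcc sketch.c" (sketch.c being whatever file name you
save it under), it prints the single spte and then the three sptes linked
through the descriptor, without re-examining the rmap encoding on each
step.]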
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 384d3c0..d042087 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -842,32 +842,6 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
 	return count;
 }
 
-static u64 *pte_list_next(unsigned long *pte_list, u64 *spte)
-{
-	struct pte_list_desc *desc;
-	u64 *prev_spte;
-	int i;
-
-	if (!*pte_list)
-		return NULL;
-	else if (!(*pte_list & 1)) {
-		if (!spte)
-			return (u64 *)*pte_list;
-		return NULL;
-	}
-	desc = (struct pte_list_desc *)(*pte_list & ~1ul);
-	prev_spte = NULL;
-	while (desc) {
-		for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
-			if (prev_spte == spte)
-				return desc->sptes[i];
-			prev_spte = desc->sptes[i];
-		}
-		desc = desc->more;
-	}
-	return NULL;
-}
-
 static void pte_list_desc_remove_entry(unsigned long *pte_list,
 				       struct pte_list_desc *desc, int i,
 				       struct pte_list_desc *prev_desc)
@@ -988,11 +962,6 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 	return pte_list_add(vcpu, spte, rmapp);
 }
 
-static u64 *rmap_next(unsigned long *rmapp, u64 *spte)
-{
-	return pte_list_next(rmapp, spte);
-}
-
 static void rmap_remove(struct kvm *kvm, u64 *spte)
 {
 	struct kvm_mmu_page *sp;
@@ -1005,6 +974,72 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 	pte_list_remove(spte, rmapp);
 }
 
+/*
+ * Used by the following functions to iterate over the sptes linked by a rmap.
+ * Only sptep can be used outside of these functions.
+ */
+struct rmap_iterator {
+	u64 *sptep;			/* points to the current spte */
+	/* private fields */
+	struct pte_list_desc *desc;	/* holds the sptep if not NULL */
+	int pos;			/* index of the sptep */
+};
+
+/*
+ * Iteration must be started by this function.  This should also be used after
+ * removing/dropping sptes from the rmap because in such cases the information
+ * in the iterator may not be valid.
+ *
+ * Returns true if spte is found, false otherwise.
+ */
+static bool rmap_get_first(unsigned long rmap, struct rmap_iterator *iter)
+{
+	if (!rmap) {
+		iter->sptep = NULL;
+		return false;
+	}
+
+	if (!(rmap & 1)) {
+		iter->sptep = (u64 *)rmap;
+		iter->desc = NULL;
+	} else {
+		iter->desc = (struct pte_list_desc *)(rmap & ~1ul);
+		iter->pos = 0;
+		iter->sptep = iter->desc->sptes[iter->pos];
+	}
+
+	return true;
+}
+
+/*
+ * Must be used with a valid iterator: e.g. after rmap_get_first().
+ *
+ * Returns true if spte is found, false otherwise.
+ */
+static bool rmap_get_next(struct rmap_iterator *iter)
+{
+	if (iter->desc) {
+		if (iter->pos < PTE_LIST_EXT - 1) {
+			++iter->pos;
+			iter->sptep = iter->desc->sptes[iter->pos];
+			if (iter->sptep)
+				return true;
+		}
+
+		iter->desc = iter->desc->more;
+
+		if (iter->desc) {
+			iter->pos = 0;
+			iter->sptep = iter->desc->sptes[iter->pos];
+			/* desc->sptes[0] cannot be NULL */
+			return true;
+		}
+	}
+
+	iter->sptep = NULL;
+	return false;
+}
+
 static void drop_spte(struct kvm *kvm, u64 *sptep)
 {
 	if (mmu_spte_clear_track_bits(sptep))
@@ -1013,23 +1048,28 @@
 static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
 				int level)
 {
-	u64 *spte = NULL;
+	struct rmap_iterator iter;
 	int write_protected = 0;
 
-	while ((spte = rmap_next(rmapp, spte))) {
-		BUG_ON(!(*spte & PT_PRESENT_MASK));
-		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
+	for (rmap_get_first(*rmapp, &iter); iter.sptep;) {
+		u64 spte = *iter.sptep;
 
-		if (!is_writable_pte(*spte))
+		BUG_ON(!(spte & PT_PRESENT_MASK));
+		rmap_printk("rmap_write_protect: spte %p %llx\n", iter.sptep, spte);
+
+		if (!is_writable_pte(spte)) {
+			rmap_get_next(&iter);
 			continue;
+		}
 
 		if (level == PT_PAGE_TABLE_LEVEL) {
-			mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK);
+			mmu_spte_update(iter.sptep, spte & ~PT_WRITABLE_MASK);
+			rmap_get_next(&iter);
 		} else {
-			BUG_ON(!is_large_pte(*spte));
-			drop_spte(kvm, spte);
+			BUG_ON(!is_large_pte(spte));
+			drop_spte(kvm, iter.sptep);
 			--kvm->stat.lpages;
-			spte = NULL;
+			rmap_get_first(*rmapp, &iter);
 		}
 
 		write_protected = 1;
@@ -1084,48 +1124,58 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			   unsigned long data)
 {
-	u64 *spte;
+	struct rmap_iterator iter;
 	int need_tlb_flush = 0;
 
-	while ((spte = rmap_next(rmapp, NULL))) {
-		BUG_ON(!(*spte & PT_PRESENT_MASK));
-		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
-		drop_spte(kvm, spte);
+	while (rmap_get_first(*rmapp, &iter)) {
+		BUG_ON(!(*iter.sptep & PT_PRESENT_MASK));
+		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n",
+			    iter.sptep, *iter.sptep);
+
+		drop_spte(kvm, iter.sptep);
 		need_tlb_flush = 1;
 	}
+
 	return need_tlb_flush;
 }
 
 static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			     unsigned long data)
 {
+	struct rmap_iterator iter;
 	int need_flush = 0;
-	u64 *spte, new_spte;
+	u64 spte, new_spte;
 	pte_t *ptep = (pte_t *)data;
 	pfn_t new_pfn;
 
 	WARN_ON(pte_huge(*ptep));
 	new_pfn = pte_pfn(*ptep);
-	spte = rmap_next(rmapp, NULL);
-	while (spte) {
-		BUG_ON(!is_shadow_present_pte(*spte));
-		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
+
+	for (rmap_get_first(*rmapp, &iter); iter.sptep;) {
+		spte = *iter.sptep;
+
+		BUG_ON(!is_shadow_present_pte(spte));
+		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", iter.sptep, spte);
+
 		need_flush = 1;
+
 		if (pte_write(*ptep)) {
-			drop_spte(kvm, spte);
-			spte = rmap_next(rmapp, NULL);
+			drop_spte(kvm, iter.sptep);
+			rmap_get_first(*rmapp, &iter);
 		} else {
-			new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
+			new_spte = spte & ~PT64_BASE_ADDR_MASK;
 			new_spte |= (u64)new_pfn << PAGE_SHIFT;
 
 			new_spte &= ~PT_WRITABLE_MASK;
 			new_spte &= ~SPTE_HOST_WRITEABLE;
 			new_spte &= ~shadow_accessed_mask;
-			mmu_spte_clear_track_bits(spte);
-			mmu_spte_set(spte, new_spte);
-			spte = rmap_next(rmapp, spte);
+
+			mmu_spte_clear_track_bits(iter.sptep);
+			mmu_spte_set(iter.sptep, new_spte);
+			rmap_get_next(&iter);
 		}
 	}
+
 	if (need_flush)
 		kvm_flush_remote_tlbs(kvm);
@@ -1184,7 +1234,7 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
 
 static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			 unsigned long data)
 {
-	u64 *spte;
+	struct rmap_iterator iter;
 	int young = 0;
 
 	/*
@@ -1197,25 +1247,22 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 	if (!shadow_accessed_mask)
 		return kvm_unmap_rmapp(kvm, rmapp, data);
 
-	spte = rmap_next(rmapp, NULL);
-	while (spte) {
-		int _young;
-		u64 _spte = *spte;
-		BUG_ON(!(_spte & PT_PRESENT_MASK));
-		_young = _spte & PT_ACCESSED_MASK;
-		if (_young) {
+	for (rmap_get_first(*rmapp, &iter); iter.sptep; rmap_get_next(&iter)) {
+		BUG_ON(!(*iter.sptep & PT_PRESENT_MASK));
+
+		if (*iter.sptep & PT_ACCESSED_MASK) {
 			young = 1;
-			clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
+			clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)iter.sptep);
 		}
-		spte = rmap_next(rmapp, spte);
 	}
+
 	return young;
 }
 
 static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			      unsigned long data)
 {
-	u64 *spte;
+	struct rmap_iterator iter;
 	int young = 0;
 
 	/*
@@ -1226,16 +1273,13 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 	if (!shadow_accessed_mask)
 		goto out;
 
-	spte = rmap_next(rmapp, NULL);
-	while (spte) {
-		u64 _spte = *spte;
-		BUG_ON(!(_spte & PT_PRESENT_MASK));
-		young = _spte & PT_ACCESSED_MASK;
-		if (young) {
+	for (rmap_get_first(*rmapp, &iter); iter.sptep; rmap_get_next(&iter)) {
+		BUG_ON(!(*iter.sptep & PT_PRESENT_MASK));
+
+		if (*iter.sptep & PT_ACCESSED_MASK) {
 			young = 1;
 			break;
 		}
-		spte = rmap_next(rmapp, spte);
 	}
 out:
 	return young;
@@ -1887,10 +1931,10 @@ static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
 
 static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
-	u64 *parent_pte;
+	struct rmap_iterator iter;
 
-	while ((parent_pte = pte_list_next(&sp->parent_ptes, NULL)))
-		drop_parent_pte(sp, parent_pte);
+	while (rmap_get_first(sp->parent_ptes, &iter))
+		drop_parent_pte(sp, iter.sptep);
 }
 
 static int mmu_zap_unsync_children(struct kvm *kvm,
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 6eabae3..568941d 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -192,7 +192,7 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	struct kvm_memory_slot *slot;
 	unsigned long *rmapp;
-	u64 *spte;
+	struct rmap_iterator iter;
 
 	if (sp->role.direct || sp->unsync || sp->role.invalid)
 		return;
@@ -200,13 +200,11 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
 	slot = gfn_to_memslot(kvm, sp->gfn);
 	rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
 
-	spte = rmap_next(rmapp, NULL);
-	while (spte) {
-		if (is_writable_pte(*spte))
+	for (rmap_get_first(*rmapp, &iter); iter.sptep; rmap_get_next(&iter)) {
+		if (is_writable_pte(*iter.sptep))
 			audit_printk(kvm, "shadow page has writable "
 				     "mappings: gfn %llx role %x\n",
 				     sp->gfn, sp->role.word);
-		spte = rmap_next(rmapp, spte);
 	}
 }
-- 
1.7.5.4
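[Also not part of the patch: the comment above rmap_get_first() encodes a
subtle contract -- dropping the current spte invalidates the iterator,
which is why __rmap_write_protect() and kvm_set_pte_rmapp() restart the
walk with rmap_get_first() after drop_spte() rather than advancing with
rmap_get_next().  A compressed user-space sketch of that rule; it models
only the single-spte rmap encoding, and drop_spte() here is a made-up
stand-in that unlinks the spte by clearing the rmap word.

#include <stdbool.h>
#include <stdio.h>

typedef unsigned long long u64;

/* Iterator over a single-spte rmap word (the descriptor case is omitted). */
struct rmap_iterator { u64 *sptep; };

static bool rmap_get_first(unsigned long rmap, struct rmap_iterator *iter)
{
	/* single-spte case only: a non-zero rmap word is the spte pointer */
	iter->sptep = (u64 *)rmap;
	return iter->sptep != NULL;
}

/* Stand-in for the kernel's drop_spte(): clear the spte and unlink it. */
static void drop_spte(unsigned long *rmapp, u64 *sptep)
{
	*sptep = 0;
	*rmapp = 0;
}

int main(void)
{
	u64 spte = 0x777;
	unsigned long rmap = (unsigned long)&spte;
	struct rmap_iterator iter;

	for (rmap_get_first(rmap, &iter); iter.sptep;) {
		printf("dropping spte %llx\n", *iter.sptep);
		drop_spte(&rmap, iter.sptep);
		/* the iterator is stale now: restart instead of advancing */
		rmap_get_first(rmap, &iter);
	}

	return 0;
}

After the drop, the restart observes the emptied rmap and terminates the
walk, which is exactly why the large-spte branch of __rmap_write_protect()
above can safely keep iterating.]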