On Wed, Feb 2, 2022 at 5:02 PM David Matlack <dmatlack@xxxxxxxxxx> wrote:
>
> In order to split a huge page we need to know what access bits to assign
> to the role of the new child page table. This can't be easily derived
> from the huge page SPTE itself since KVM applies its own access policies
> on top, such as for HugePage NX.
>
> We could walk the guest page tables to determine the correct access
> bits, but that is difficult to plumb outside of a vCPU fault context.
> Instead, we can store the original access bits for each leaf SPTE
> alongside the GFN in the gfns array. The access bits only take up 3
> bits, which leaves 61 bits left over for gfns, which is more than
> enough. So this change does not require any additional memory.
>
> In order to keep the access bit cache in sync with the guest, we have to
> extend FNAME(sync_page) to also update the access bits.
>
> Now that the gfns array caches more information than just GFNs, rename
> it to shadowed_translation.
>
> No functional change intended.

Updating the access bits in FNAME(sync_page) sounds like a functional
change, despite the "No functional change intended" note above, but
otherwise this seems reasonable.
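
As a sanity check on the "does not require any additional memory" claim:
the packed entry does keep the old one-u64-per-SPTE footprint. A minimal
standalone sketch (userspace C, not kernel code, with the struct copied
from the patch below) that I used to convince myself:

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Mirrors struct shadowed_translation_entry from the patch below. */
	struct shadowed_translation_entry {
		uint64_t access:3;
		uint64_t gfn:56;
	};

	int main(void)
	{
		/* Same footprint as one gfn_t (u64) entry in the old gfns array. */
		static_assert(sizeof(struct shadowed_translation_entry) ==
			      sizeof(uint64_t), "entry must stay a single u64");

		/* Both fields round-trip independently through the packing. */
		struct shadowed_translation_entry e = { .access = 0x7, .gfn = 0xabcdef };
		printf("access=%#x gfn=%#llx\n",
		       (unsigned)e.access, (unsigned long long)e.gfn);
		return 0;
	}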
>
> Signed-off-by: David Matlack <dmatlack@xxxxxxxxxx>
> ---
>  arch/x86/include/asm/kvm_host.h |  2 +-
>  arch/x86/kvm/mmu/mmu.c          | 32 +++++++++++++++++++-------------
>  arch/x86/kvm/mmu/mmu_internal.h | 15 +++++++++++++--
>  arch/x86/kvm/mmu/paging_tmpl.h  |  7 +++++--
>  4 files changed, 38 insertions(+), 18 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index c371ee7e45f7..f00004c13ccf 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -686,7 +686,7 @@ struct kvm_vcpu_arch {
>
>  	struct kvm_mmu_memory_cache mmu_pte_list_desc_cache;
>  	struct kvm_mmu_memory_cache mmu_shadow_page_cache;
> -	struct kvm_mmu_memory_cache mmu_gfn_array_cache;
> +	struct kvm_mmu_memory_cache mmu_shadowed_translation_cache;
>  	struct kvm_mmu_memory_cache mmu_page_header_cache;
>
>  	/*
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index ae1564e67e49..e2306a39526a 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -719,7 +719,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
>  	if (r)
>  		return r;
>  	if (maybe_indirect) {
> -		r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_gfn_array_cache,
> +		r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadowed_translation_cache,
>  					       PT64_ROOT_MAX_LEVEL);
>  		if (r)
>  			return r;
> @@ -732,7 +732,7 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
>  {
>  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
>  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
> -	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache);
> +	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_translation_cache);
>  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
>  }
>
> @@ -749,15 +749,17 @@ static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
>  static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
>  {
>  	if (!sp->role.direct)
> -		return sp->gfns[index];
> +		return sp->shadowed_translation[index].gfn;
>
>  	return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
>  }
>
> -static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
> +static void kvm_mmu_page_set_gfn_access(struct kvm_mmu_page *sp, int index,
> +					gfn_t gfn, u32 access)
>  {
>  	if (!sp->role.direct) {
> -		sp->gfns[index] = gfn;
> +		sp->shadowed_translation[index].gfn = gfn;
> +		sp->shadowed_translation[index].access = access;
>  		return;
>  	}
>
> @@ -1610,14 +1612,14 @@ static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
>  static void __rmap_add(struct kvm *kvm,
>  		       struct kvm_mmu_memory_cache *cache,
>  		       const struct kvm_memory_slot *slot,
> -		       u64 *spte, gfn_t gfn)
> +		       u64 *spte, gfn_t gfn, u32 access)
>  {
>  	struct kvm_mmu_page *sp;
>  	struct kvm_rmap_head *rmap_head;
>  	int rmap_count;
>
>  	sp = sptep_to_sp(spte);
> -	kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
> +	kvm_mmu_page_set_gfn_access(sp, spte - sp->spt, gfn, access);
>  	rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
>  	rmap_count = pte_list_add(cache, spte, rmap_head);
>
> @@ -1631,9 +1633,9 @@ static void __rmap_add(struct kvm *kvm,
>  }
>
>  static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot,
> -		     u64 *spte, gfn_t gfn)
> +		     u64 *spte, gfn_t gfn, u32 access)
>  {
> -	__rmap_add(vcpu->kvm, &vcpu->arch.mmu_pte_list_desc_cache, slot, spte, gfn);
> +	__rmap_add(vcpu->kvm, &vcpu->arch.mmu_pte_list_desc_cache, slot, spte, gfn, access);
>  }
>
>  bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
> @@ -1694,7 +1696,7 @@ void kvm_mmu_free_sp(struct kvm_mmu_page *sp)
>  {
>  	free_page((unsigned long)sp->spt);
>  	if (!sp->role.direct)
> -		free_page((unsigned long)sp->gfns);
> +		free_page((unsigned long)sp->shadowed_translation);
>  	kmem_cache_free(mmu_page_header_cache, sp);
>  }
>
> @@ -1731,8 +1733,12 @@ struct kvm_mmu_page *kvm_mmu_alloc_sp(struct kvm_vcpu *vcpu, bool direct)
>
>  	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
>  	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
> +
> +	BUILD_BUG_ON(sizeof(sp->shadowed_translation[0]) != sizeof(u64));
> +
>  	if (!direct)
> -		sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache);
> +		sp->shadowed_translation =
> +			kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadowed_translation_cache);
>
>  	return sp;
>  }
> @@ -1742,7 +1748,7 @@ struct kvm_mmu_page *kvm_mmu_alloc_sp(struct kvm_vcpu *vcpu, bool direct)
>   *
>   * Huge page splitting always uses direct shadow pages since the huge page is
>   * being mapped directly with a lower level page table. Thus there's no need to
> - * allocate the gfns array.
> + * allocate the shadowed_translation array.
>   */
>  struct kvm_mmu_page *kvm_mmu_alloc_direct_sp_for_split(gfp_t gfp)
>  {
> @@ -2833,7 +2839,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
>
>  	if (!was_rmapped) {
>  		WARN_ON_ONCE(ret == RET_PF_SPURIOUS);
> -		rmap_add(vcpu, slot, sptep, gfn);
> +		rmap_add(vcpu, slot, sptep, gfn, pte_access);
>  	}
>
>  	return ret;
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index e6bcea5a0aa9..9ee175adcc12 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -30,6 +30,11 @@ extern bool dbg;
>  #define INVALID_PAE_ROOT 0
>  #define IS_VALID_PAE_ROOT(x) (!!(x))
>
> +struct shadowed_translation_entry {
> +	u64 access:3;
> +	u64 gfn:56;
> +};
> +
>  struct kvm_mmu_page {
>  	/*
>  	 * Note, "link" through "spt" fit in a single 64 byte cache line on
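
The changelog says the 3 access bits leave 61 bits for the gfn, but the
struct above only gives the gfn 56 bits (3 + 56 = 59, leaving 5 bits
unused). 56 bits is still far more than enough for a GFN, but it would
be good for the changelog and the struct to agree. Also, since "access"
arrives as a u32 and the field is only 3 bits wide, anything outside
ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK would be silently
truncated. Might be worth a guard at the top of
kvm_mmu_page_set_gfn_access(), e.g. something like (untested):

	WARN_ON_ONCE(access & ~ACC_ALL);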
> @@ -51,8 +56,14 @@ struct kvm_mmu_page {
>  	gfn_t gfn;
>
>  	u64 *spt;
> -	/* hold the gfn of each spte inside spt */
> -	gfn_t *gfns;
> +	/*
> +	 * For indirect shadow pages, caches the result of the intermediate
> +	 * guest translation being shadowed by each SPTE.
> +	 *
> +	 * NULL for direct shadow pages.
> +	 */
> +	struct shadowed_translation_entry *shadowed_translation;
> +
>  	/* Currently serving as active root */
>  	union {
>  		int root_count;
> diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
> index c533c191925e..703dfb062bf0 100644
> --- a/arch/x86/kvm/mmu/paging_tmpl.h
> +++ b/arch/x86/kvm/mmu/paging_tmpl.h
> @@ -1016,7 +1016,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
>  }
>
>  /*
> - * Using the cached information from sp->gfns is safe because:
> + * Using the information in sp->shadowed_translation is safe because:
>   * - The spte has a reference to the struct page, so the pfn for a given gfn
>   *   can't change unless all sptes pointing to it are nuked first.
>   *
> @@ -1090,12 +1090,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
>  		if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access))
>  			continue;
>
> -		if (gfn != sp->gfns[i]) {
> +		if (gfn != sp->shadowed_translation[i].gfn) {
>  			drop_spte(vcpu->kvm, &sp->spt[i]);
>  			flush = true;
>  			continue;
>  		}
>
> +		if (pte_access != sp->shadowed_translation[i].access)
> +			sp->shadowed_translation[i].access = pte_access;
> +
>  		sptep = &sp->spt[i];
>  		spte = *sptep;
>  		host_writable = spte & shadow_host_writable_mask;
> --
> 2.35.0.rc2.247.g8bbb082509-goog
>
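
One more nit, on the FNAME(sync_page) hunk above: the compare before the
store looks unnecessary; the unconditional

		sp->shadowed_translation[i].access = pte_access;

would read more simply, and the store is cheap. If the compare is
deliberate, to avoid dirtying the cache line when nothing changed, a
comment saying so would help.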