Re: [RFC PATCH 06/37] KVM: MMU: Move struct kvm_mmu_page to common code

Ben Gardon <bgardon@xxxxxxxxxx> · Mon, 12 Dec 2022 10:07:36 -0800

On Thu, Dec 8, 2022 at 11:39 AM David Matlack <dmatlack@xxxxxxxxxx> wrote:
>
> Move struct kvm_mmu_page to common code and all x86-specific fields into
> kvm_mmu_page_arch.
>
> This commit increases the size of struct kvm_mmu_page by 64 bytes on
> x86_64 (184 bytes -> 248 bytes). The size of this struct can be reduced
> in future commits by moving TDP MMU root fields into a separate struct
> and by dynamically allocating fields only used by the Shadow MMU.
>
> No functional change intended.
>
> Signed-off-by: David Matlack <dmatlack@xxxxxxxxxx>

I haven't reviewed every line of the mechanical refactor of adding
"arch." all over the place, but at a high level, this looks like a
good split on struct kvm_mmu_page.

> ---
>  arch/x86/include/asm/kvm/mmu_types.h |  62 ++++++++++++++
>  arch/x86/include/asm/kvm_host.h      |   4 -
>  arch/x86/kvm/mmu/mmu.c               | 122 ++++++++++++++-------------
>  arch/x86/kvm/mmu/mmu_internal.h      |  83 ------------------
>  arch/x86/kvm/mmu/mmutrace.h          |   4 +-
>  arch/x86/kvm/mmu/paging_tmpl.h       |  10 +--
>  arch/x86/kvm/mmu/tdp_mmu.c           |   8 +-
>  arch/x86/kvm/mmu/tdp_mmu.h           |   2 +-
>  include/kvm/mmu_types.h              |  32 ++++++-
>  9 files changed, 167 insertions(+), 160 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm/mmu_types.h b/arch/x86/include/asm/kvm/mmu_types.h
> index 35f893ebab5a..affcb520b482 100644
> --- a/arch/x86/include/asm/kvm/mmu_types.h
> +++ b/arch/x86/include/asm/kvm/mmu_types.h
> @@ -2,6 +2,8 @@
>  #ifndef __ASM_KVM_MMU_TYPES_H
>  #define __ASM_KVM_MMU_TYPES_H
>
> +#include <linux/bitmap.h>
> +#include <linux/list.h>
>  #include <linux/types.h>
>
>  /*
> @@ -53,4 +55,64 @@ struct kvm_mmu_page_role_arch {
>         u16 passthrough:1;
>  };
>
> +struct kvm_rmap_head {
> +       unsigned long val;
> +};
> +
> +struct kvm_mmu_page_arch {
> +       struct hlist_node hash_link;
> +
> +       bool shadow_mmu_page;
> +       bool unsync;
> +       u8 mmu_valid_gen;
> +
> +        /*
> +         * The shadow page can't be replaced by an equivalent huge page
> +         * because it is being used to map an executable page in the guest
> +         * and the NX huge page mitigation is enabled.
> +         */
> +       bool nx_huge_page_disallowed;
> +
> +       /*
> +        * Stores the result of the guest translation being shadowed by each
> +        * SPTE.  KVM shadows two types of guest translations: nGPA -> GPA
> +        * (shadow EPT/NPT) and GVA -> GPA (traditional shadow paging). In both
> +        * cases the result of the translation is a GPA and a set of access
> +        * constraints.
> +        *
> +        * The GFN is stored in the upper bits (PAGE_SHIFT) and the shadowed
> +        * access permissions are stored in the lower bits. Note, for
> +        * convenience and uniformity across guests, the access permissions are
> +        * stored in KVM format (e.g.  ACC_EXEC_MASK) not the raw guest format.
> +        */
> +       u64 *shadowed_translation;
> +
> +       unsigned int unsync_children;
> +
> +       /* Rmap pointers to all parent sptes. */
> +       struct kvm_rmap_head parent_ptes;
> +
> +       DECLARE_BITMAP(unsync_child_bitmap, 512);
> +
> +       /*
> +        * Tracks shadow pages that, if zapped, would allow KVM to create an NX
> +        * huge page.  A shadow page will have nx_huge_page_disallowed set but
> +        * not be on the list if a huge page is disallowed for other reasons,
> +        * e.g. because KVM is shadowing a PTE at the same gfn, the memslot
> +        * isn't properly aligned, etc...
> +        */
> +       struct list_head possible_nx_huge_page_link;
> +
> +#ifdef CONFIG_X86_32
> +       /*
> +        * Used out of the mmu-lock to avoid reading spte values while an
> +        * update is in progress; see the comments in __get_spte_lockless().
> +        */
> +       int clear_spte_count;
> +#endif
> +
> +       /* Number of writes since the last time traversal visited this page.  */
> +       atomic_t write_flooding_count;
> +};
> +
>  #endif /* !__ASM_KVM_MMU_TYPES_H */
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index ebcd7a0dabef..f5743a652e10 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -329,10 +329,6 @@ union kvm_cpu_role {
>         };
>  };
>
> -struct kvm_rmap_head {
> -       unsigned long val;
> -};
> -
>  struct kvm_pio_request {
>         unsigned long linear_rip;
>         unsigned long count;
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 11cef930d5ed..e47f35878ab5 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -350,7 +350,7 @@ static void count_spte_clear(u64 *sptep, u64 spte)
>
>         /* Ensure the spte is completely set before we increase the count */
>         smp_wmb();
> -       sp->clear_spte_count++;
> +       sp->arch.clear_spte_count++;
>  }
>
>  static void __set_spte(u64 *sptep, u64 spte)
> @@ -432,7 +432,7 @@ static u64 __get_spte_lockless(u64 *sptep)
>         int count;
>
>  retry:
> -       count = sp->clear_spte_count;
> +       count = sp->arch.clear_spte_count;
>         smp_rmb();
>
>         spte.spte_low = orig->spte_low;
> @@ -442,7 +442,7 @@ static u64 __get_spte_lockless(u64 *sptep)
>         smp_rmb();
>
>         if (unlikely(spte.spte_low != orig->spte_low ||
> -             count != sp->clear_spte_count))
> +             count != sp->arch.clear_spte_count))
>                 goto retry;
>
>         return spte.spte;
> @@ -699,7 +699,7 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
>                 return sp->gfn;
>
>         if (!sp->role.arch.direct)
> -               return sp->shadowed_translation[index] >> PAGE_SHIFT;
> +               return sp->arch.shadowed_translation[index] >> PAGE_SHIFT;
>
>         return sp->gfn + (index << ((sp->role.level - 1) * SPTE_LEVEL_BITS));
>  }
> @@ -713,7 +713,7 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
>  static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index)
>  {
>         if (sp_has_gptes(sp))
> -               return sp->shadowed_translation[index] & ACC_ALL;
> +               return sp->arch.shadowed_translation[index] & ACC_ALL;
>
>         /*
>          * For direct MMUs (e.g. TDP or non-paging guests) or passthrough SPs,
> @@ -734,7 +734,7 @@ static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index,
>                                          gfn_t gfn, unsigned int access)
>  {
>         if (sp_has_gptes(sp)) {
> -               sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access;
> +               sp->arch.shadowed_translation[index] = (gfn << PAGE_SHIFT) | access;
>                 return;
>         }
>
> @@ -825,18 +825,18 @@ void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
>          * on the list if KVM is reusing an existing shadow page, i.e. if KVM
>          * links a shadow page at multiple points.
>          */
> -       if (!list_empty(&sp->possible_nx_huge_page_link))
> +       if (!list_empty(&sp->arch.possible_nx_huge_page_link))
>                 return;
>
>         ++kvm->stat.nx_lpage_splits;
> -       list_add_tail(&sp->possible_nx_huge_page_link,
> +       list_add_tail(&sp->arch.possible_nx_huge_page_link,
>                       &kvm->arch.possible_nx_huge_pages);
>  }
>
>  static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
>                                  bool nx_huge_page_possible)
>  {
> -       sp->nx_huge_page_disallowed = true;
> +       sp->arch.nx_huge_page_disallowed = true;
>
>         if (nx_huge_page_possible)
>                 track_possible_nx_huge_page(kvm, sp);
> @@ -861,16 +861,16 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
>
>  void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
>  {
> -       if (list_empty(&sp->possible_nx_huge_page_link))
> +       if (list_empty(&sp->arch.possible_nx_huge_page_link))
>                 return;
>
>         --kvm->stat.nx_lpage_splits;
> -       list_del_init(&sp->possible_nx_huge_page_link);
> +       list_del_init(&sp->arch.possible_nx_huge_page_link);
>  }
>
>  static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
>  {
> -       sp->nx_huge_page_disallowed = false;
> +       sp->arch.nx_huge_page_disallowed = false;
>
>         untrack_possible_nx_huge_page(kvm, sp);
>  }
> @@ -1720,11 +1720,11 @@ static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
>  static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp)
>  {
>         MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
> -       hlist_del(&sp->hash_link);
> +       hlist_del(&sp->arch.hash_link);
>         list_del(&sp->link);
>         free_page((unsigned long)sp->spt);
>         if (!sp->role.arch.direct)
> -               free_page((unsigned long)sp->shadowed_translation);
> +               free_page((unsigned long)sp->arch.shadowed_translation);
>         kmem_cache_free(mmu_page_header_cache, sp);
>  }
>
> @@ -1739,13 +1739,13 @@ static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache,
>         if (!parent_pte)
>                 return;
>
> -       pte_list_add(cache, parent_pte, &sp->parent_ptes);
> +       pte_list_add(cache, parent_pte, &sp->arch.parent_ptes);
>  }
>
>  static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
>                                        u64 *parent_pte)
>  {
> -       pte_list_remove(parent_pte, &sp->parent_ptes);
> +       pte_list_remove(parent_pte, &sp->arch.parent_ptes);
>  }
>
>  static void drop_parent_pte(struct kvm_mmu_page *sp,
> @@ -1761,7 +1761,7 @@ static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
>         u64 *sptep;
>         struct rmap_iterator iter;
>
> -       for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
> +       for_each_rmap_spte(&sp->arch.parent_ptes, &iter, sptep) {
>                 mark_unsync(sptep);
>         }
>  }
> @@ -1771,9 +1771,9 @@ static void mark_unsync(u64 *spte)
>         struct kvm_mmu_page *sp;
>
>         sp = sptep_to_sp(spte);
> -       if (__test_and_set_bit(spte_index(spte), sp->unsync_child_bitmap))
> +       if (__test_and_set_bit(spte_index(spte), sp->arch.unsync_child_bitmap))
>                 return;
> -       if (sp->unsync_children++)
> +       if (sp->arch.unsync_children++)
>                 return;
>         kvm_mmu_mark_parents_unsync(sp);
>  }
> @@ -1799,7 +1799,7 @@ static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
>  {
>         int i;
>
> -       if (sp->unsync)
> +       if (sp->arch.unsync)
>                 for (i=0; i < pvec->nr; i++)
>                         if (pvec->page[i].sp == sp)
>                                 return 0;
> @@ -1812,9 +1812,9 @@ static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
>
>  static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
>  {
> -       --sp->unsync_children;
> -       WARN_ON((int)sp->unsync_children < 0);
> -       __clear_bit(idx, sp->unsync_child_bitmap);
> +       --sp->arch.unsync_children;
> +       WARN_ON((int)sp->arch.unsync_children < 0);
> +       __clear_bit(idx, sp->arch.unsync_child_bitmap);
>  }
>
>  static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
> @@ -1822,7 +1822,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
>  {
>         int i, ret, nr_unsync_leaf = 0;
>
> -       for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
> +       for_each_set_bit(i, sp->arch.unsync_child_bitmap, 512) {
>                 struct kvm_mmu_page *child;
>                 u64 ent = sp->spt[i];
>
> @@ -1833,7 +1833,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
>
>                 child = spte_to_child_sp(ent);
>
> -               if (child->unsync_children) {
> +               if (child->arch.unsync_children) {
>                         if (mmu_pages_add(pvec, child, i))
>                                 return -ENOSPC;
>
> @@ -1845,7 +1845,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
>                                 nr_unsync_leaf += ret;
>                         } else
>                                 return ret;
> -               } else if (child->unsync) {
> +               } else if (child->arch.unsync) {
>                         nr_unsync_leaf++;
>                         if (mmu_pages_add(pvec, child, i))
>                                 return -ENOSPC;
> @@ -1862,7 +1862,7 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,
>                            struct kvm_mmu_pages *pvec)
>  {
>         pvec->nr = 0;
> -       if (!sp->unsync_children)
> +       if (!sp->arch.unsync_children)
>                 return 0;
>
>         mmu_pages_add(pvec, sp, INVALID_INDEX);
> @@ -1871,9 +1871,9 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,
>
>  static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
>  {
> -       WARN_ON(!sp->unsync);
> +       WARN_ON(!sp->arch.unsync);
>         trace_kvm_mmu_sync_page(sp);
> -       sp->unsync = 0;
> +       sp->arch.unsync = 0;
>         --kvm->stat.mmu_unsync;
>  }
>
> @@ -1894,7 +1894,7 @@ static bool sp_has_gptes(struct kvm_mmu_page *sp)
>  }
>
>  #define for_each_valid_sp(_kvm, _sp, _list)                            \
> -       hlist_for_each_entry(_sp, _list, hash_link)                     \
> +       hlist_for_each_entry(_sp, _list, arch.hash_link)                        \
>                 if (is_obsolete_sp((_kvm), (_sp))) {                    \
>                 } else
>
> @@ -1934,7 +1934,7 @@ static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
>
>         /* TDP MMU pages do not use the MMU generation. */
>         return !is_tdp_mmu_page(sp) &&
> -              unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
> +              unlikely(sp->arch.mmu_valid_gen != kvm->arch.mmu_valid_gen);
>  }
>
>  struct mmu_page_path {
> @@ -2006,7 +2006,7 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents)
>                 WARN_ON(idx == INVALID_INDEX);
>                 clear_unsync_child_bit(sp, idx);
>                 level++;
> -       } while (!sp->unsync_children);
> +       } while (!sp->arch.unsync_children);
>  }
>
>  static int mmu_sync_children(struct kvm_vcpu *vcpu,
> @@ -2053,7 +2053,7 @@ static int mmu_sync_children(struct kvm_vcpu *vcpu,
>
>  static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
>  {
> -       atomic_set(&sp->write_flooding_count,  0);
> +       atomic_set(&sp->arch.write_flooding_count,  0);
>  }
>
>  static void clear_sp_write_flooding_count(u64 *spte)
> @@ -2094,7 +2094,7 @@ static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm,
>                          * Unsync pages must not be left as is, because the new
>                          * upper-level page will be write-protected.
>                          */
> -                       if (role.level > PG_LEVEL_4K && sp->unsync)
> +                       if (role.level > PG_LEVEL_4K && sp->arch.unsync)
>                                 kvm_mmu_prepare_zap_page(kvm, sp,
>                                                          &invalid_list);
>                         continue;
> @@ -2104,7 +2104,7 @@ static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm,
>                 if (sp->role.arch.direct)
>                         goto out;
>
> -               if (sp->unsync) {
> +               if (sp->arch.unsync) {
>                         if (KVM_BUG_ON(!vcpu, kvm))
>                                 break;
>
> @@ -2163,25 +2163,26 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
>         sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache);
>         sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache);
>         if (!role.arch.direct)
> -               sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache);
> +               sp->arch.shadowed_translation =
> +                       kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache);
>
>         set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
>
> -       INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
> +       INIT_LIST_HEAD(&sp->arch.possible_nx_huge_page_link);
>
>         /*
>          * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
>          * depends on valid pages being added to the head of the list.  See
>          * comments in kvm_zap_obsolete_pages().
>          */
> -       sp->mmu_valid_gen = kvm->arch.mmu_valid_gen;
> +       sp->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen;
>         list_add(&sp->link, &kvm->arch.active_mmu_pages);
>         kvm_account_mmu_page(kvm, sp);
>
>         sp->gfn = gfn;
>         sp->role = role;
> -       sp->shadow_mmu_page = true;
> -       hlist_add_head(&sp->hash_link, sp_list);
> +       sp->arch.shadow_mmu_page = true;
> +       hlist_add_head(&sp->arch.hash_link, sp_list);
>         if (sp_has_gptes(sp))
>                 account_shadowed(kvm, sp);
>
> @@ -2368,7 +2369,7 @@ static void __link_shadow_page(struct kvm *kvm,
>
>         mmu_page_add_parent_pte(cache, sp, sptep);
>
> -       if (sp->unsync_children || sp->unsync)
> +       if (sp->arch.unsync_children || sp->arch.unsync)
>                 mark_unsync(sptep);
>  }
>
> @@ -2421,7 +2422,8 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
>                          * avoids retaining a large number of stale nested SPs.
>                          */
>                         if (tdp_enabled && invalid_list &&
> -                           child->role.arch.guest_mode && !child->parent_ptes.val)
> +                           child->role.arch.guest_mode &&
> +                           !child->arch.parent_ptes.val)
>                                 return kvm_mmu_prepare_zap_page(kvm, child,
>                                                                 invalid_list);
>                 }
> @@ -2449,7 +2451,7 @@ static void kvm_mmu_unlink_parents(struct kvm_mmu_page *sp)
>         u64 *sptep;
>         struct rmap_iterator iter;
>
> -       while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
> +       while ((sptep = rmap_get_first(&sp->arch.parent_ptes, &iter)))
>                 drop_parent_pte(sp, sptep);
>  }
>
> @@ -2496,7 +2498,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
>         if (!sp->role.invalid && sp_has_gptes(sp))
>                 unaccount_shadowed(kvm, sp);
>
> -       if (sp->unsync)
> +       if (sp->arch.unsync)
>                 kvm_unlink_unsync_page(kvm, sp);
>         if (!refcount_read(&sp->root_refcount)) {
>                 /* Count self */
> @@ -2527,7 +2529,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
>                 zapped_root = !is_obsolete_sp(kvm, sp);
>         }
>
> -       if (sp->nx_huge_page_disallowed)
> +       if (sp->arch.nx_huge_page_disallowed)
>                 unaccount_nx_huge_page(kvm, sp);
>
>         sp->role.invalid = 1;
> @@ -2704,7 +2706,7 @@ static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
>  {
>         trace_kvm_mmu_unsync_page(sp);
>         ++kvm->stat.mmu_unsync;
> -       sp->unsync = 1;
> +       sp->arch.unsync = 1;
>
>         kvm_mmu_mark_parents_unsync(sp);
>  }
> @@ -2739,7 +2741,7 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
>                 if (!can_unsync)
>                         return -EPERM;
>
> -               if (sp->unsync)
> +               if (sp->arch.unsync)
>                         continue;
>
>                 if (prefetch)
> @@ -2764,7 +2766,7 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
>                          * for write, i.e. unsync cannot transition from 0->1
>                          * while this CPU holds mmu_lock for read (or write).
>                          */
> -                       if (READ_ONCE(sp->unsync))
> +                       if (READ_ONCE(sp->arch.unsync))
>                                 continue;
>                 }
>
> @@ -2796,8 +2798,8 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
>          *                      2.2 Guest issues TLB flush.
>          *                          That causes a VM Exit.
>          *
> -        *                      2.3 Walking of unsync pages sees sp->unsync is
> -        *                          false and skips the page.
> +        *                      2.3 Walking of unsync pages sees sp->arch.unsync
> +        *                          is false and skips the page.
>          *
>          *                      2.4 Guest accesses GVA X.
>          *                          Since the mapping in the SP was not updated,
> @@ -2805,7 +2807,7 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
>          *                          gets used.
>          * 1.1 Host marks SP
>          *     as unsync
> -        *     (sp->unsync = true)
> +        *     (sp->arch.unsync = true)
>          *
>          * The write barrier below ensures that 1.1 happens before 1.2 and thus
>          * the situation in 2.4 does not arise.  It pairs with the read barrier
> @@ -3126,7 +3128,7 @@ void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_
>             cur_level == fault->goal_level &&
>             is_shadow_present_pte(spte) &&
>             !is_large_pte(spte) &&
> -           spte_to_child_sp(spte)->nx_huge_page_disallowed) {
> +           spte_to_child_sp(spte)->arch.nx_huge_page_disallowed) {
>                 /*
>                  * A small SPTE exists for this pfn, but FNAME(fetch),
>                  * direct_map(), or kvm_tdp_mmu_map() would like to create a
> @@ -3902,7 +3904,7 @@ static bool is_unsync_root(hpa_t root)
>
>         /*
>          * The read barrier orders the CPU's read of SPTE.W during the page table
> -        * walk before the reads of sp->unsync/sp->unsync_children here.
> +        * walk before the reads of sp->arch.{unsync,unsync_children} here.
>          *
>          * Even if another CPU was marking the SP as unsync-ed simultaneously,
>          * any guest page table changes are not guaranteed to be visible anyway
> @@ -3922,7 +3924,7 @@ static bool is_unsync_root(hpa_t root)
>         if (WARN_ON_ONCE(!sp))
>                 return false;
>
> -       if (sp->unsync || sp->unsync_children)
> +       if (sp->arch.unsync || sp->arch.unsync_children)
>                 return true;
>
>         return false;
> @@ -5510,8 +5512,8 @@ static bool detect_write_flooding(struct kvm_mmu_page *sp)
>         if (sp->role.level == PG_LEVEL_4K)
>                 return false;
>
> -       atomic_inc(&sp->write_flooding_count);
> -       return atomic_read(&sp->write_flooding_count) >= 3;
> +       atomic_inc(&sp->arch.write_flooding_count);
> +       return atomic_read(&sp->arch.write_flooding_count) >= 3;
>  }
>
>  /*
> @@ -6389,7 +6391,7 @@ static bool shadow_mmu_try_split_huge_pages(struct kvm *kvm,
>                         continue;
>
>                 /* SPs with level >PG_LEVEL_4K should never by unsync. */
> -               if (WARN_ON_ONCE(sp->unsync))
> +               if (WARN_ON_ONCE(sp->arch.unsync))
>                         continue;
>
>                 /* Don't bother splitting huge pages on invalid SPs. */
> @@ -6941,8 +6943,8 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm)
>                  */
>                 sp = list_first_entry(&kvm->arch.possible_nx_huge_pages,
>                                       struct kvm_mmu_page,
> -                                     possible_nx_huge_page_link);
> -               WARN_ON_ONCE(!sp->nx_huge_page_disallowed);
> +                                     arch.possible_nx_huge_page_link);
> +               WARN_ON_ONCE(!sp->arch.nx_huge_page_disallowed);
>                 WARN_ON_ONCE(!sp->role.arch.direct);
>
>                 /*
> @@ -6977,7 +6979,7 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm)
>                         flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
>                 else
>                         kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
> -               WARN_ON_ONCE(sp->nx_huge_page_disallowed);
> +               WARN_ON_ONCE(sp->arch.nx_huge_page_disallowed);
>
>                 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
>                         kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index fd4990c8b0e9..af2ae4887e35 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -44,89 +44,6 @@ extern bool dbg;
>  #define INVALID_PAE_ROOT       0
>  #define IS_VALID_PAE_ROOT(x)   (!!(x))
>
> -struct kvm_mmu_page {
> -       /*
> -        * Note, "link" through "spt" fit in a single 64 byte cache line on
> -        * 64-bit kernels, keep it that way unless there's a reason not to.
> -        */
> -       struct list_head link;
> -       struct hlist_node hash_link;
> -
> -       bool shadow_mmu_page;
> -       bool unsync;
> -       u8 mmu_valid_gen;
> -
> -        /*
> -         * The shadow page can't be replaced by an equivalent huge page
> -         * because it is being used to map an executable page in the guest
> -         * and the NX huge page mitigation is enabled.
> -         */
> -       bool nx_huge_page_disallowed;
> -
> -       /*
> -        * The following two entries are used to key the shadow page in the
> -        * hash table.
> -        */
> -       union kvm_mmu_page_role role;
> -       gfn_t gfn;
> -
> -       u64 *spt;
> -
> -       /*
> -        * Stores the result of the guest translation being shadowed by each
> -        * SPTE.  KVM shadows two types of guest translations: nGPA -> GPA
> -        * (shadow EPT/NPT) and GVA -> GPA (traditional shadow paging). In both
> -        * cases the result of the translation is a GPA and a set of access
> -        * constraints.
> -        *
> -        * The GFN is stored in the upper bits (PAGE_SHIFT) and the shadowed
> -        * access permissions are stored in the lower bits. Note, for
> -        * convenience and uniformity across guests, the access permissions are
> -        * stored in KVM format (e.g.  ACC_EXEC_MASK) not the raw guest format.
> -        */
> -       u64 *shadowed_translation;
> -
> -       /* Currently serving as active root */
> -       refcount_t root_refcount;
> -
> -       unsigned int unsync_children;
> -       union {
> -               struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
> -               tdp_ptep_t ptep;
> -       };
> -       union {
> -               DECLARE_BITMAP(unsync_child_bitmap, 512);
> -               struct {
> -                       struct work_struct tdp_mmu_async_work;
> -                       void *tdp_mmu_async_data;
> -               };
> -       };
> -
> -       /*
> -        * Tracks shadow pages that, if zapped, would allow KVM to create an NX
> -        * huge page.  A shadow page will have nx_huge_page_disallowed set but
> -        * not be on the list if a huge page is disallowed for other reasons,
> -        * e.g. because KVM is shadowing a PTE at the same gfn, the memslot
> -        * isn't properly aligned, etc...
> -        */
> -       struct list_head possible_nx_huge_page_link;
> -#ifdef CONFIG_X86_32
> -       /*
> -        * Used out of the mmu-lock to avoid reading spte values while an
> -        * update is in progress; see the comments in __get_spte_lockless().
> -        */
> -       int clear_spte_count;
> -#endif
> -
> -       /* Number of writes since the last time traversal visited this page.  */
> -       atomic_t write_flooding_count;
> -
> -#ifdef CONFIG_X86_64
> -       /* Used for freeing the page asynchronously if it is a TDP MMU page. */
> -       struct rcu_head rcu_head;
> -#endif
> -};
> -
>  extern struct kmem_cache *mmu_page_header_cache;
>
>  static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp)
> diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
> index ffd10ce3eae3..335f26dabdf3 100644
> --- a/arch/x86/kvm/mmu/mmutrace.h
> +++ b/arch/x86/kvm/mmu/mmutrace.h
> @@ -16,11 +16,11 @@
>         __field(bool, unsync)
>
>  #define KVM_MMU_PAGE_ASSIGN(sp)                                \
> -       __entry->mmu_valid_gen = sp->mmu_valid_gen;     \
> +       __entry->mmu_valid_gen = sp->arch.mmu_valid_gen;        \
>         __entry->gfn = sp->gfn;                         \
>         __entry->role = sp->role.word;                  \
>         __entry->root_count = refcount_read(&sp->root_refcount);        \
> -       __entry->unsync = sp->unsync;
> +       __entry->unsync = sp->arch.unsync;
>
>  #define KVM_MMU_PAGE_PRINTK() ({                                       \
>         const char *saved_ptr = trace_seq_buffer_ptr(p);                \
> diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
> index e15ec1c473da..18bb92b70a01 100644
> --- a/arch/x86/kvm/mmu/paging_tmpl.h
> +++ b/arch/x86/kvm/mmu/paging_tmpl.h
> @@ -671,7 +671,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
>                          * KVM_REQ_MMU_SYNC is not necessary but it
>                          * expedites the process.
>                          */
> -                       if (sp->unsync_children &&
> +                       if (sp->arch.unsync_children &&
>                             mmu_sync_children(vcpu, sp, false))
>                                 return RET_PF_RETRY;
>                 }
> @@ -921,7 +921,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
>                         pt_element_t gpte;
>                         gpa_t pte_gpa;
>
> -                       if (!sp->unsync)
> +                       if (!sp->arch.unsync)
>                                 break;
>
>                         pte_gpa = FNAME(get_level1_sp_gpa)(sp);
> @@ -942,7 +942,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
>                         FNAME(prefetch_gpte)(vcpu, sp, sptep, gpte, false);
>                 }
>
> -               if (!sp->unsync_children)
> +               if (!sp->arch.unsync_children)
>                         break;
>         }
>         write_unlock(&vcpu->kvm->mmu_lock);
> @@ -974,8 +974,8 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
>  }
>
>  /*
> - * Using the information in sp->shadowed_translation (kvm_mmu_page_get_gfn()) is
> - * safe because:
> + * Using the information in sp->arch.shadowed_translation
> + * (kvm_mmu_page_get_gfn()) is safe because:
>   * - The spte has a reference to the struct page, so the pfn for a given gfn
>   *   can't change unless all sptes pointing to it are nuked first.
>   *
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 34d674080170..66231c7ed31e 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -270,7 +270,7 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
>  static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
>                             gfn_t gfn, union kvm_mmu_page_role role)
>  {
> -       INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
> +       INIT_LIST_HEAD(&sp->arch.possible_nx_huge_page_link);
>
>         set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
>
> @@ -385,7 +385,7 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
>  {
>         tdp_unaccount_mmu_page(kvm, sp);
>
> -       if (!sp->nx_huge_page_disallowed)
> +       if (!sp->arch.nx_huge_page_disallowed)
>                 return;
>
>         if (shared)
> @@ -393,7 +393,7 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
>         else
>                 lockdep_assert_held_write(&kvm->mmu_lock);
>
> -       sp->nx_huge_page_disallowed = false;
> +       sp->arch.nx_huge_page_disallowed = false;
>         untrack_possible_nx_huge_page(kvm, sp);
>
>         if (shared)
> @@ -1181,7 +1181,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
>                 sp = tdp_mmu_alloc_sp(vcpu);
>                 tdp_mmu_init_child_sp(sp, &iter);
>
> -               sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
> +               sp->arch.nx_huge_page_disallowed = fault->huge_page_disallowed;
>
>                 if (is_shadow_present_pte(iter.old_spte))
>                         r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
> index 19d3153051a3..e6a929089715 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.h
> +++ b/arch/x86/kvm/mmu/tdp_mmu.h
> @@ -73,7 +73,7 @@ u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
>  #ifdef CONFIG_X86_64
>  static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp)
>  {
> -       return !sp->shadow_mmu_page;
> +       return !sp->arch.shadow_mmu_page;
>  }
>  #else
>  static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; }
> diff --git a/include/kvm/mmu_types.h b/include/kvm/mmu_types.h
> index 14099956fdac..a9da33d4baa8 100644
> --- a/include/kvm/mmu_types.h
> +++ b/include/kvm/mmu_types.h
> @@ -3,8 +3,11 @@
>  #define __KVM_MMU_TYPES_H
>
>  #include <linux/bug.h>
> -#include <linux/types.h>
> +#include <linux/kvm_types.h>
> +#include <linux/refcount.h>
>  #include <linux/stddef.h>
> +#include <linux/types.h>
> +#include <linux/workqueue.h>
>
>  #include <asm/kvm/mmu_types.h>
>
> @@ -36,4 +39,31 @@ static_assert(sizeof(union kvm_mmu_page_role) == sizeof_field(union kvm_mmu_page
>
>  typedef u64 __rcu *tdp_ptep_t;
>
> +struct kvm_mmu_page {
> +       struct list_head link;
> +
> +       union kvm_mmu_page_role role;
> +
> +       /* The start of the GFN region mapped by this shadow page. */
> +       gfn_t gfn;

I'd like to put in a vote for getting rid of the "shadow page" /
"shadow page table entry" terminology through this refactor.
I can't count the number of times folks trying to understand the MMU
have gotten confused by the overloaded terminology.

> +
> +       /* The page table page. */
> +       u64 *spt;
> +
> +       /* The PTE that points to this shadow page. */
> +       tdp_ptep_t ptep;

Totally fine to not change this, but something like parent_ptep would
be clearer. The Shadow MMU uses parent_pteps, which I think is much
clearer.
Could also change the comment to just:
/* The PTE that points to this kvm_mmu_page. */

> +
> +       /* Used for freeing TDP MMU pages asynchronously. */
> +       struct rcu_head rcu_head;
> +
> +       /* The number of references to this shadow page as a root. */
> +       refcount_t root_refcount;
> +
> +       /* Used for tearing down an entire page table tree. */
> +       struct work_struct tdp_mmu_async_work;
> +       void *tdp_mmu_async_data;
> +
> +       struct kvm_mmu_page_arch arch;
> +};
> +
>  #endif /* !__KVM_MMU_TYPES_H */
> --
> 2.39.0.rc1.256.g54fd8350bd-goog
>
_______________________________________________
kvmarm mailing list
kvmarm@xxxxxxxxxxxxxxxxxxxxx
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm