On Thu, Nov 18, 2021, Sean Christopherson wrote:
> Another idea. The only difference between 5-level and 4-level is that 5-level
> fills in index [4], and I'm pretty sure 4-level doesn't touch that index. For
> PAE NPT (32-bit SVM), the shadow root level will never change, so that's not
> an issue.
>
> Nested NPT is the only case where anything for an EPT/NPT MMU can change,
> because that follows EFER.NX.
>
> In other words, the non-nested TDP reserved bits don't need to be recalculated
> regardless of level, they can just fill in 5-level and leave it be.
>
> E.g. something like the below. The sp->role.direct check could be removed if
> we forced EFER.NX for nested NPT.
>
> It's a bit ugly in that we'd pass both @kvm and @vcpu, so that needs some more
> thought, but at minimum it means there's no need to recalc the reserved bits.

Ok, I think my final vote is to have the reserved bits passed in, but with the
non-nested TDP reserved bits being computed at MMU init.

I would also prefer to keep the existing make_spte() name so that there's no
churn at those call sites, and to make the relationship between the wrapper,
make_spte(), and the "real" helper, __make_spte(), more obvious and aligned
with the usual kernel style.

So with the kvm_vcpu_ad_need_write_protect() change and my proposed hack-a-fix
for kvm_x86_get_mt_mask(), the end result would look like:

bool __make_spte(struct kvm *kvm, struct kvm_mmu_page *sp,
                 struct kvm_memory_slot *slot, unsigned int pte_access,
                 gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool prefetch,
                 bool can_unsync, bool host_writable, u64 *new_spte,
                 struct rsvd_bits_validate *shadow_rsvd_bits)
{
        int level = sp->role.level;
        u64 spte = SPTE_MMU_PRESENT_MASK;
        bool wrprot = false;

        if (sp->role.ad_disabled)
                spte |= SPTE_TDP_AD_DISABLED_MASK;
        else if (kvm_mmu_page_ad_need_write_protect(sp))
                spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK;

        /*
         * For the EPT case, shadow_present_mask is 0 if hardware
         * supports exec-only page table entries.  In that case,
         * ACC_USER_MASK and shadow_user_mask are used to represent
         * read access.  See FNAME(gpte_access) in paging_tmpl.h.
         */
        spte |= shadow_present_mask;
        if (!prefetch)
                spte |= spte_shadow_accessed_mask(spte);

        if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) &&
            is_nx_huge_page_enabled()) {
                pte_access &= ~ACC_EXEC_MASK;
        }

        if (pte_access & ACC_EXEC_MASK)
                spte |= shadow_x_mask;
        else
                spte |= shadow_nx_mask;

        if (pte_access & ACC_USER_MASK)
                spte |= shadow_user_mask;

        if (level > PG_LEVEL_4K)
                spte |= PT_PAGE_SIZE_MASK;
        if (tdp_enabled)
                spte |= static_call(kvm_x86_get_mt_mask)(kvm, gfn,
                        kvm_is_mmio_pfn(pfn));

        if (host_writable)
                spte |= shadow_host_writable_mask;
        else
                pte_access &= ~ACC_WRITE_MASK;

        if (!kvm_is_mmio_pfn(pfn))
                spte |= shadow_me_mask;

        spte |= (u64)pfn << PAGE_SHIFT;

        if (pte_access & ACC_WRITE_MASK) {
                spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask;

                /*
                 * Optimization: for pte sync, if spte was writable the hash
                 * lookup is unnecessary (and expensive). Write protection
                 * is responsibility of kvm_mmu_get_page / kvm_mmu_sync_roots.
                 * Same reasoning can be applied to dirty page accounting.
                 */
                if (is_writable_pte(old_spte))
                        goto out;

                /*
                 * Unsync shadow pages that are reachable by the new, writable
                 * SPTE.  Write-protect the SPTE if the page can't be unsync'd,
                 * e.g. it's write-tracked (upper-level SPs) or has one or more
                 * shadow pages and unsync'ing pages is not allowed.
                 */
                if (mmu_try_to_unsync_pages(kvm, slot, gfn, can_unsync, prefetch)) {
                        pgprintk("%s: found shadow page for %llx, marking ro\n",
                                 __func__, gfn);
                        wrprot = true;
                        pte_access &= ~ACC_WRITE_MASK;
                        spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
                }
        }

        if (pte_access & ACC_WRITE_MASK)
                spte |= spte_shadow_dirty_mask(spte);

out:
        if (prefetch)
                spte = mark_spte_for_access_track(spte);

        WARN_ONCE(is_rsvd_spte(shadow_rsvd_bits, spte, level),
                  "spte = 0x%llx, level = %d, rsvd bits = 0x%llx", spte, level,
                  get_rsvd_bits(shadow_rsvd_bits, spte, level));

        if ((spte & PT_WRITABLE_MASK) && kvm_slot_dirty_track_enabled(slot)) {
                /* Enforced by kvm_mmu_hugepage_adjust. */
                WARN_ON(level > PG_LEVEL_4K);
                mark_page_dirty_in_slot(kvm, slot, gfn);
        }

        *new_spte = spte;
        return wrprot;
}

bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
               struct kvm_memory_slot *slot, unsigned int pte_access,
               gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool prefetch,
               bool can_unsync, bool host_writable, u64 *new_spte)
{
        return __make_spte(vcpu->kvm, sp, slot, pte_access, gfn, pfn, old_spte,
                           prefetch, can_unsync, host_writable, new_spte,
                           &vcpu->arch.mmu->shadow_zero_check);
}
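
And just to illustrate the payoff, a TDP MMU call site could then build SPTEs
without a vCPU in hand by handing __make_spte() the per-VM reserved bits that
were filled in once at MMU init.  This is only a sketch of the idea, not code
that exists today: the kvm->arch.tdp_shadow_rsvd_bits field and the
tdp_mmu_make_spte() wrapper below are placeholder names.

/*
 * Sketch only: assumes the non-nested TDP reserved bits were computed once at
 * MMU init and stashed in a per-VM field (tdp_shadow_rsvd_bits is a made-up
 * name for illustration).
 */
static bool tdp_mmu_make_spte(struct kvm *kvm, struct kvm_mmu_page *sp,
                              struct kvm_memory_slot *slot,
                              unsigned int pte_access, gfn_t gfn,
                              kvm_pfn_t pfn, u64 old_spte, bool prefetch,
                              bool can_unsync, bool host_writable,
                              u64 *new_spte)
{
        return __make_spte(kvm, sp, slot, pte_access, gfn, pfn, old_spte,
                           prefetch, can_unsync, host_writable, new_spte,
                           &kvm->arch.tdp_shadow_rsvd_bits);
}

That would keep the vCPU-scoped make_spte() wrapper for the shadow MMU paths
while letting anything that only has a struct kvm stay kvm-scoped, since the
reserved bits no longer have to be pulled out of vcpu->arch.mmu.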