On Mon, Jun 27, 2022 at 02:53:48PM -0700, isaku.yamahata@xxxxxxxxx wrote:
> From: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>

Subject: s/git/bit

> 
> With TDX, all GFNs are private at guest boot time. At run time guest TD
> can explicitly change it to shared from private or vice-versa by MapGPA
> hypercall. If it's specified, the given GFN can't be used as otherwise.
> That's is, if a guest tells KVM that the GFN is shared, it can't be used
> as private. or vice-versa.
> 
> Steal software usable bit, SPTE_SHARED_MASK, for it from MMIO counter to
> record it. Use the bit SPTE_SHARED_MASK in shared or private EPT to
> determine which mapping, shared or private, is allowed. If requested
> mapping isn't allowed, return RET_PF_RETRY to wait for other vcpu to change
> it. The bit is recorded in both shared and private shadow page to avoid
> traverse one more shadow page when resolving KVM page fault.
> 
> The bit needs to be kept over zapping the EPT entry. Currently the EPT
> entry is initialized SHADOW_NONPRESENT_VALUE unconditionally to clear
> SPTE_SHARED_MASK bit. To carry SPTE_SHARED_MASK bit, introduce a helper
> function to get initial value for zapped entry with SPTE_SHARED_MASK bit.
> Replace SHADOW_NONPRESENT_VALUE with it.
> 
> Signed-off-by: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
> ---
>  arch/x86/kvm/mmu/spte.h    | 17 +++++++---
>  arch/x86/kvm/mmu/tdp_mmu.c | 65 ++++++++++++++++++++++++++++++++------
>  2 files changed, 68 insertions(+), 14 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
> index 96312ab4fffb..7c1aaf0e963e 100644
> --- a/arch/x86/kvm/mmu/spte.h
> +++ b/arch/x86/kvm/mmu/spte.h
> @@ -14,6 +14,9 @@
>   */
>  #define SPTE_MMU_PRESENT_MASK		BIT_ULL(11)
>  
> +/* Masks that used to track for shared GPA **/
> +#define SPTE_SHARED_MASK		BIT_ULL(62)
> +
>  /*
>   * TDP SPTES (more specifically, EPT SPTEs) may not have A/D bits, and may also
>   * be restricted to using write-protection (for L2 when CPU dirty logging, i.e.
> @@ -104,7 +107,7 @@ static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
>   * the memslots generation and is derived as follows:
>   *
>   * Bits 0-7 of the MMIO generation are propagated to spte bits 3-10
> - * Bits 8-18 of the MMIO generation are propagated to spte bits 52-62
> + * Bits 8-18 of the MMIO generation are propagated to spte bits 52-61

Should be 8-17.
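If it helps to double-check the arithmetic, here is a throwaway userspace
snippet (not kernel code; the low_*/high_* names are local to this example,
not the kernel macros). Bits 52-61 are 10 bits, so together with the 8 low
bits the SPTE can carry generation bits 0-17:

	#include <assert.h>

	int main(void)
	{
		/* SPTE bit ranges from the quoted comment, after this patch. */
		int low_start = 3,   low_end = 10;	/* carries generation bits 0-7 */
		int high_start = 52, high_end = 61;	/* carries the remaining bits  */

		int low_bits  = low_end  - low_start  + 1;	/* 8  */
		int high_bits = high_end - high_start + 1;	/* 10 */

		assert(low_bits == 8 && high_bits == 10);
		/* Highest MMIO generation bit that fits: 8 + 10 - 1 = 17. */
		assert(low_bits + high_bits - 1 == 17);
		return 0;
	}
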
>   *
>   * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
>   * the MMIO generation number, as doing so would require stealing a bit from
> @@ -118,7 +121,7 @@ static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
>  #define MMIO_SPTE_GEN_LOW_END		10
>  
>  #define MMIO_SPTE_GEN_HIGH_START	52
> -#define MMIO_SPTE_GEN_HIGH_END		62
> +#define MMIO_SPTE_GEN_HIGH_END		61
>  
>  #define MMIO_SPTE_GEN_LOW_MASK		GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
> 						    MMIO_SPTE_GEN_LOW_START)
> @@ -131,7 +134,7 @@ static_assert(!(SPTE_MMU_PRESENT_MASK &
>  #define MMIO_SPTE_GEN_HIGH_BITS	(MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1)
>  
>  /* remember to adjust the comment above as well if you change these */
> -static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11);
> +static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 10);
>  
>  #define MMIO_SPTE_GEN_LOW_SHIFT		(MMIO_SPTE_GEN_LOW_START - 0)
>  #define MMIO_SPTE_GEN_HIGH_SHIFT	(MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS)
> @@ -208,6 +211,7 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
>  /* Removed SPTEs must not be misconstrued as shadow present PTEs. */
>  static_assert(!(__REMOVED_SPTE & SPTE_MMU_PRESENT_MASK));
>  static_assert(!(__REMOVED_SPTE & SHADOW_NONPRESENT_VALUE));
> +static_assert(!(__REMOVED_SPTE & SPTE_SHARED_MASK));
>  
>  /*
>   * See above comment around __REMOVED_SPTE. REMOVED_SPTE is the actual
> @@ -217,7 +221,12 @@ static_assert(!(__REMOVED_SPTE & SHADOW_NONPRESENT_VALUE));
>  
>  static inline bool is_removed_spte(u64 spte)
>  {
> -	return spte == REMOVED_SPTE;
> +	return (spte & ~SPTE_SHARED_MASK) == REMOVED_SPTE;
> +}
> +
> +static inline u64 spte_shared_mask(u64 spte)
> +{
> +	return spte & SPTE_SHARED_MASK;
>  }
>  
>  /*
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index fef6246086a8..4f279700b3cc 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -758,6 +758,11 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
>  	return 0;
>  }
>  
> +static u64 shadow_nonpresent_spte(u64 old_spte)
> +{
> +	return SHADOW_NONPRESENT_VALUE | spte_shared_mask(old_spte);
> +}
> +
>  static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
>  					  struct tdp_iter *iter)
>  {
> @@ -791,7 +796,8 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
>  	 * SHADOW_NONPRESENT_VALUE (which sets "suppress #VE" bit) so it
>  	 * can be set when EPT table entries are zapped.
>  	 */
> -	__kvm_tdp_mmu_write_spte(iter->sptep, SHADOW_NONPRESENT_VALUE);
> +	__kvm_tdp_mmu_write_spte(iter->sptep,
> +				 shadow_nonpresent_spte(iter->old_spte));
>  
>  	return 0;
>  }
> @@ -975,8 +981,11 @@ static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
>  			continue;
>  
>  		if (!shared)
> -			tdp_mmu_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
> -		else if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE))
> +			tdp_mmu_set_spte(kvm, &iter,
> +					 shadow_nonpresent_spte(iter.old_spte));
> +		else if (tdp_mmu_set_spte_atomic(
> +				 kvm, &iter,
> +				 shadow_nonpresent_spte(iter.old_spte)))
>  			goto retry;
>  	}
>  }
> @@ -1033,7 +1042,8 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
>  		return false;
>  
>  	__tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte,
> -			   SHADOW_NONPRESENT_VALUE, sp->gfn, sp->role.level + 1,
> +			   shadow_nonpresent_spte(old_spte),
> +			   sp->gfn, sp->role.level + 1,
>  			   true, true, is_private_sp(sp));
>  
>  	return true;
> @@ -1075,11 +1085,20 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
>  			continue;
>  		}
>  
> +		/*
> +		 * SPTE_SHARED_MASK is stored as 4K granularity. The
> +		 * information is lost if we delete upper level SPTE page.
> +		 * TODO: support large page.
> +		 */
> +		if (kvm_gfn_shared_mask(kvm) && iter.level > PG_LEVEL_4K)
> +			continue;
> +
>  		if (!is_shadow_present_pte(iter.old_spte) ||
>  		    !is_last_spte(iter.old_spte, iter.level))
>  			continue;
>  
> -		tdp_mmu_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
> +		tdp_mmu_set_spte(kvm, &iter,
> +				 shadow_nonpresent_spte(iter.old_spte));
>  		flush = true;
>  	}
>  
> @@ -1195,18 +1214,44 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
>  	gfn_t gfn_unalias = iter->gfn & ~kvm_gfn_shared_mask(vcpu->kvm);
>  
>  	WARN_ON(sp->role.level != fault->goal_level);
> +	WARN_ON(is_private_sptep(iter->sptep) != fault->is_private);
>  
> -	/* TDX shared GPAs are no executable, enforce this for the SDV. */
> -	if (kvm_gfn_shared_mask(vcpu->kvm) && !fault->is_private)
> -		pte_access &= ~ACC_EXEC_MASK;
> +	if (kvm_gfn_shared_mask(vcpu->kvm)) {
> +		if (fault->is_private) {
> +			/*
> +			 * SPTE allows only RWX mapping. PFN can't be mapped it
> +			 * as READONLY in GPA.
> +			 */
> +			if (fault->slot && !fault->map_writable)
> +				return RET_PF_RETRY;
> +			/*
> +			 * This GPA is not allowed to map as private. Let
> +			 * vcpu loop in page fault until other vcpu change it
> +			 * by MapGPA hypercall.
> +			 */
> +			if (fault->slot &&

Please consider merging this if into the above "if (fault->slot) {}" block
(see the sketch after this hunk).

> +			    spte_shared_mask(iter->old_spte))
> +				return RET_PF_RETRY;
> +		} else {
> +			/* This GPA is not allowed to map as shared. */
> +			if (fault->slot &&
> +			    !spte_shared_mask(iter->old_spte))
> +				return RET_PF_RETRY;
> +			/* TDX shared GPAs are no executable, enforce this. */
> +			pte_access &= ~ACC_EXEC_MASK;
> +		}
> +	}
>  
>  	if (unlikely(!fault->slot))
>  		new_spte = make_mmio_spte(vcpu, gfn_unalias, pte_access);
> -	else
> +	else {
>  		wrprot = make_spte(vcpu, sp, fault->slot, pte_access,
>  				   gfn_unalias, fault->pfn, iter->old_spte,
>  				   fault->prefetch, true, fault->map_writable,
>  				   &new_spte);
> +		if (spte_shared_mask(iter->old_spte))
> +			new_spte |= SPTE_SHARED_MASK;
> +	}

The if can be eliminated:

	new_spte |= spte_shared_mask(iter->old_spte);
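Putting the two comments together, this part of the function could end up
roughly like below. Untested sketch against the quoted hunk only, reusing the
patch's helpers and locals; it relies on spte_shared_mask() returning either
0 or SPTE_SHARED_MASK:

	if (kvm_gfn_shared_mask(vcpu->kvm)) {
		if (fault->is_private) {
			if (fault->slot) {
				/*
				 * SPTE allows only RWX mapping.  The PFN can't
				 * be mapped as READONLY in GPA.
				 */
				if (!fault->map_writable)
					return RET_PF_RETRY;
				/*
				 * This GPA is not allowed to map as private.
				 * Let the vcpu loop in page fault until
				 * another vcpu changes it by MapGPA hypercall.
				 */
				if (spte_shared_mask(iter->old_spte))
					return RET_PF_RETRY;
			}
		} else {
			/* This GPA is not allowed to map as shared. */
			if (fault->slot &&
			    !spte_shared_mask(iter->old_spte))
				return RET_PF_RETRY;
			/* TDX shared GPAs are not executable, enforce this. */
			pte_access &= ~ACC_EXEC_MASK;
		}
	}

	if (unlikely(!fault->slot))
		new_spte = make_mmio_spte(vcpu, gfn_unalias, pte_access);
	else {
		wrprot = make_spte(vcpu, sp, fault->slot, pte_access,
				   gfn_unalias, fault->pfn, iter->old_spte,
				   fault->prefetch, true, fault->map_writable,
				   &new_spte);
		/* spte_shared_mask() is 0 or SPTE_SHARED_MASK, OR it in directly. */
		new_spte |= spte_shared_mask(iter->old_spte);
	}
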
> 
>  	if (new_spte == iter->old_spte)
>  		ret = RET_PF_SPURIOUS;
> @@ -1509,7 +1554,7 @@ static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
>  	 * invariant that the PFN of a present leaf SPTE can never change.
>  	 * See __handle_changed_spte().
>  	 */
> -	tdp_mmu_set_spte(kvm, iter, SHADOW_NONPRESENT_VALUE);
> +	tdp_mmu_set_spte(kvm, iter, shadow_nonpresent_spte(iter->old_spte));
>  
>  	if (!pte_write(range->pte)) {
>  		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
> -- 
> 2.25.1
> 