From: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>

The TDP MMU uses REMOVED_SPTE = 0x5a0ULL as a special constant: a
semi-arbitrary, non-present intermediate value which indicates that one
thread is operating on the SPTE.  For TDX (more precisely, whenever
"EPT-violation #VE" is in use), the value must also include the
"suppress #VE" bit, i.e. shadow_init_value.

Define SHADOW_REMOVED_SPTE as shadow_init_value | REMOVED_SPTE, and
replace REMOVED_SPTE with SHADOW_REMOVED_SPTE, so that the "suppress
#VE" bit is handled properly for TDX.

Signed-off-by: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
---
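
For reviewers, a standalone illustration of how the constant composes.
This is userspace-only, not kernel code: the names mirror the kernel
ones, assuming bit 63 is the EPT "suppress #VE" bit per the SDM and
bit 11 is SPTE_MMU_PRESENT_MASK as in current spte.h; is_removed_spte()
here is a simplified copy of the helper this patch changes.

	#include <assert.h>
	#include <stdint.h>

	#define BIT_ULL(n)		(1ULL << (n))
	#define SUPPRESS_VE_BIT		BIT_ULL(63)	/* EPT "suppress #VE" */
	#define REMOVED_SPTE		0x5a0ULL	/* semi-arbitrary, non-present */
	#define SPTE_MMU_PRESENT_MASK	BIT_ULL(11)	/* KVM software-available bit */

	/* Stand-in for the per-build kernel variable. */
	static uint64_t shadow_init_value;

	#define SHADOW_REMOVED_SPTE	(shadow_init_value | REMOVED_SPTE)

	static int is_removed_spte(uint64_t spte)
	{
		return spte == SHADOW_REMOVED_SPTE;
	}

	int main(void)
	{
		/* Legacy VMs: no #VE, the removed value is the bare constant. */
		shadow_init_value = 0;
		assert(is_removed_spte(0x5a0ULL));

		/*
		 * TDX: "EPT-violation #VE" is always enabled, so the
		 * intermediate value must keep the suppress bit set too.
		 */
		shadow_init_value = SUPPRESS_VE_BIT;
		assert(!is_removed_spte(0x5a0ULL));
		assert(is_removed_spte(SUPPRESS_VE_BIT | 0x5a0ULL));

		/* Either way the value must never look shadow-present. */
		assert(!(SHADOW_REMOVED_SPTE & SPTE_MMU_PRESENT_MASK));
		return 0;
	}

Either way the removed value stays non-present and never looks
shadow-present, which is the invariant the static_assert in spte.h
checks.
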
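On the zap paths: a zapped entry must become non-present without
clearing "suppress #VE", otherwise a TD guest access would raise #VE
(and a TDVMCALL from the guest's #VE handler) instead of a real EPT
violation that exits to KVM.  A compact sketch of that invariant,
again userspace-only; zap_entry() is an invented stand-in for the
tdp_mmu_set_spte() calls this patch switches from 0 to
shadow_init_value.

	#include <assert.h>
	#include <stdint.h>

	#define SUPPRESS_VE_BIT	(1ULL << 63)	/* EPT "suppress #VE" */

	/*
	 * Invented stand-in for the zap paths: write the non-present
	 * init value instead of 0 so the suppress bit is not lost.
	 */
	static void zap_entry(uint64_t *sptep, uint64_t init_value)
	{
		*sptep = init_value;
	}

	int main(void)
	{
		uint64_t spte = SUPPRESS_VE_BIT | 0x123ULL;

		/* TDX-style zap: entry is non-present, bit survives. */
		zap_entry(&spte, SUPPRESS_VE_BIT);
		assert(spte & SUPPRESS_VE_BIT);

		/* Legacy zap: shadow_init_value is 0, same as before. */
		zap_entry(&spte, 0);
		assert(spte == 0);
		return 0;
	}
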
 arch/x86/kvm/mmu/spte.h    | 14 ++++++++++++--
 arch/x86/kvm/mmu/tdp_mmu.c | 23 ++++++++++++++++-------
 2 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index bde843bce878..e88f796724b4 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -194,7 +194,9 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
  * If a thread running without exclusive control of the MMU lock must perform a
  * multi-part operation on an SPTE, it can set the SPTE to REMOVED_SPTE as a
  * non-present intermediate value. Other threads which encounter this value
- * should not modify the SPTE.
+ * should not modify the SPTE. When TDX is enabled, shadow_init_value, which
+ * has the "suppress #VE" bit set, is also set in the removed SPTE, because
+ * the TDX module always enables "EPT-violation #VE".
  *
  * Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on
  * both AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF
@@ -207,9 +209,17 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
 /* Removed SPTEs must not be misconstrued as shadow present PTEs. */
 static_assert(!(REMOVED_SPTE & SPTE_MMU_PRESENT_MASK));
 
+/*
+ * See the comment above for REMOVED_SPTE.  SHADOW_REMOVED_SPTE is the actual
+ * intermediate value written to a removed SPTE.  When TDX is enabled, it also
+ * sets the "suppress #VE" bit; otherwise it equals REMOVED_SPTE.
+ */
+extern u64 __read_mostly shadow_init_value;
+#define SHADOW_REMOVED_SPTE	(shadow_init_value | REMOVED_SPTE)
+
 static inline bool is_removed_spte(u64 spte)
 {
-	return spte == REMOVED_SPTE;
+	return spte == SHADOW_REMOVED_SPTE;
 }
 
 /*
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index ebd0a02620e8..b6ec2f112c26 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -338,7 +338,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
			 * value to the removed SPTE value.
			 */
			for (;;) {
-				old_child_spte = xchg(sptep, REMOVED_SPTE);
+				old_child_spte = xchg(sptep, SHADOW_REMOVED_SPTE);
				if (!is_removed_spte(old_child_spte))
					break;
				cpu_relax();
@@ -365,10 +365,10 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
			 * the two branches consistent and simplifies
			 * the function.
			 */
-			WRITE_ONCE(*sptep, REMOVED_SPTE);
+			WRITE_ONCE(*sptep, SHADOW_REMOVED_SPTE);
		}
		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
-				    old_child_spte, REMOVED_SPTE, level,
+				    old_child_spte, SHADOW_REMOVED_SPTE, level,
				    shared);
	}
 
@@ -537,7 +537,7 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
	 * immediately installing a present entry in its place
	 * before the TLBs are flushed.
	 */
-	if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
+	if (!tdp_mmu_set_spte_atomic(kvm, iter, SHADOW_REMOVED_SPTE))
		return false;
 
	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
@@ -550,8 +550,16 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
	 * special removed SPTE value. No bookkeeping is needed
	 * here since the SPTE is going from non-present
	 * to non-present.
+	 *
+	 * Set the non-present value to shadow_init_value, rather than 0.
+	 * This is because when TDX is enabled, the TDX module always
+	 * enables "EPT-violation #VE", so KVM needs to set the
+	 * "suppress #VE" bit in EPT table entries in order to get a
+	 * real EPT violation rather than a TDVMCALL.  KVM sets
+	 * shadow_init_value (which has the "suppress #VE" bit set) so
+	 * that the bit stays set when EPT table entries are zapped.
	 */
-	WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
+	WRITE_ONCE(*rcu_dereference(iter->sptep), shadow_init_value);
 
	return true;
 }
@@ -748,7 +756,8 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			continue;
 
		if (!shared) {
-			tdp_mmu_set_spte(kvm, &iter, 0);
+			/* see comments in tdp_mmu_zap_spte_atomic() */
+			tdp_mmu_set_spte(kvm, &iter, shadow_init_value);
			flush = true;
		} else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
			/*
@@ -1135,7 +1144,7 @@ static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
	 * invariant that the PFN of a present leaf SPTE can never change.
	 * See __handle_changed_spte().
	 */
-	tdp_mmu_set_spte(kvm, iter, 0);
+	tdp_mmu_set_spte(kvm, iter, shadow_init_value);
 
	if (!pte_write(range->pte)) {
		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
-- 
2.25.1