On Tue, 2024-05-14 at 17:59 -0700, Rick Edgecombe wrote:
>  static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
> -				u64 old_spte, u64 new_spte, int level,
> -				bool shared)
> +				u64 old_spte, u64 new_spte,
> +				union kvm_mmu_page_role role, bool shared)
>  {
> +	bool is_private = kvm_mmu_page_role_is_private(role);
> +	int level = role.level;
>  	bool was_present = is_shadow_present_pte(old_spte);
>  	bool is_present = is_shadow_present_pte(new_spte);
>  	bool was_leaf = was_present && is_last_spte(old_spte, level);
>  	bool is_leaf = is_present && is_last_spte(new_spte, level);
> -	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
> +	kvm_pfn_t old_pfn = spte_to_pfn(old_spte);
> +	kvm_pfn_t new_pfn = spte_to_pfn(new_spte);
> +	bool pfn_changed = old_pfn != new_pfn;
> 
>  	WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL);
>  	WARN_ON_ONCE(level < PG_LEVEL_4K);
> @@ -513,7 +636,7 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
> 
>  	if (was_leaf && is_dirty_spte(old_spte) &&
>  	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
> -		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
> +		kvm_set_pfn_dirty(old_pfn);
> 
>  	/*
>  	 * Recursively handle child PTs if the change removed a subtree from
> @@ -522,15 +645,21 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
>  	 * pages are kernel allocations and should never be migrated.
>  	 */
>  	if (was_present && !was_leaf &&
> -	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
> +	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed))) {
> +		KVM_BUG_ON(is_private != is_private_sptep(spte_to_child_pt(old_spte, level)),
> +			   kvm);
>  		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
> +	}
> +
> +	if (is_private && !is_present)
> +		handle_removed_private_spte(kvm, gfn, old_spte, new_spte, role.level);

I'm a little bothered by the asymmetry of where the mirrored hooks get called
between setting and zapping PTEs. Tracing through the code, the relevant
operations needed for TDX are:

1. tdp_mmu_iter_set_spte() from tdp_mmu_zap_leafs() and __tdp_mmu_zap_root()
2. tdp_mmu_set_spte_atomic(), used for mapping and linking

(1) is the simple case because the mmu_lock is held for write. It updates the
mirror root like normal, then has extra logic to call out and update the
S-EPT.

(2), on the other hand, holds only the read lock, so it has to do the whole
operation in a special order: first set REMOVED_SPTE, then update the private
copy, then write to the mirror page tables. It can't get stuffed into
handle_changed_spte() because it has to write REMOVED_SPTE first.

In some ways it makes sense to update the S-EPT from handle_changed_spte().
Despite the claim that "handle_changed_spte() only updates stats.", it does
some updating of other PTEs based on the current PTE change, which is pretty
similar to what the mirrored PTEs are doing. But we can't really do the
setting of present PTEs there because of the REMOVED_SPTE ordering. So we
could only make it more symmetrical by moving the S-EPT ops out of
handle_changed_spte() and manually calling them in the two places relevant
for TDX, like the diff further down.
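First, to spell out the ordering in (2) that rules out handle_changed_spte():
a rough sketch only, not the actual patch code, reusing
__set_private_spte_present() from the hunk quoted further down:

static int tdp_mmu_set_private_spte_atomic_sketch(struct kvm *kvm,
						  struct tdp_iter *iter,
						  u64 new_spte)
{
	int ret;

	/* 1. Freeze the SPTE so concurrent walkers see REMOVED_SPTE and retry. */
	if (!try_cmpxchg64(rcu_dereference(iter->sptep), &iter->old_spte,
			   REMOVED_SPTE))
		return -EBUSY;

	/* 2. Update the private copy (S-EPT) while the SPTE is frozen. */
	ret = __set_private_spte_present(kvm, iter->sptep, iter->gfn,
					 iter->old_spte, new_spte, iter->level);

	/* 3. Only now publish the final value in the mirror page tables. */
	__kvm_tdp_mmu_write_spte(iter->sptep, ret ? iter->old_spte : new_spte);

	return ret;
}

handle_changed_spte() only runs after the SPTE has already been written, so
step 2 can't live there for this path.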
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index e966986bb9f2..c9ddb1c2a550 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -438,6 +438,9 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
 		 */
 		old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
 						  REMOVED_SPTE, level);
+
+		if (is_mirror_sp(sp))
+			reflect_removed_spte(kvm, gfn, old_spte, REMOVED_SPTE, level);
 	}
 	handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn, old_spte,
 			    REMOVED_SPTE, sp->role, shared);
@@ -667,9 +670,6 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
 		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
 	}
 
-	if (is_mirror && !is_present)
-		reflect_removed_spte(kvm, gfn, old_spte, new_spte, role.level);
-
 	if (was_leaf && is_accessed_spte(old_spte) &&
 	    (!is_present || !is_accessed_spte(new_spte) || pfn_changed))
 		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
@@ -839,6 +839,9 @@ static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
 					  new_spte, level), kvm);
 	}
 
+	if (is_mirror_sptep(sptep))
+		reflect_removed_spte(kvm, gfn, old_spte, REMOVED_SPTE, level);
+
 	role = sptep_to_sp(sptep)->role;
 	role.level = level;
 	handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, role, false);

Otherwise, we could move the "set present" mirroring operations into
handle_changed_spte(), and have some earlier conditional logic do the
REMOVED_SPTE parts. It starts to become more scattered (rough sketch of that
at the bottom of this mail).

Anyway, it's just a code clarity thing arising from having a hard time
explaining the design in the log. Any opinions?

A separate but related comment is below.

> 
>  	if (was_leaf && is_accessed_spte(old_spte) &&
>  	    (!is_present || !is_accessed_spte(new_spte) || pfn_changed))
>  		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
>  }
> 
> @@ -648,6 +807,8 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
>  static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
>  			    u64 old_spte, u64 new_spte, gfn_t gfn, int level)
>  {
> +	union kvm_mmu_page_role role;
> +
>  	lockdep_assert_held_write(&kvm->mmu_lock);
> 
>  	/*
> @@ -660,8 +821,16 @@ static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
>  	WARN_ON_ONCE(is_removed_spte(old_spte) || is_removed_spte(new_spte));
> 
>  	old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
> +	if (is_private_sptep(sptep) && !is_removed_spte(new_spte) &&
> +	    is_shadow_present_pte(new_spte)) {
> +		/* Because write spin lock is held, no race. It should success. */
> +		KVM_BUG_ON(__set_private_spte_present(kvm, sptep, gfn, old_spte,
> +						      new_spte, level), kvm);
> +	}

Based on the above enumeration, I don't see how this hunk gets used.

> 
> -	handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
> +	role = sptep_to_sp(sptep)->role;
> +	role.level = level;
> +	handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, role, false);
>  	return old_spte;
>  }
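FWIW, the other direction (doing the "set present" mirroring inside
handle_changed_spte() and the REMOVED_SPTE parts earlier) might look roughly
like the below. Again just an untested sketch, with names borrowed from the
quoted patch and my diff above. Note handle_changed_spte() would also need
the sptep passed in, and the write-lock path would have to freeze first too,
which is part of the scattering:

	/* In tdp_mmu_set_spte_atomic(), before publishing new_spte: */
	if (is_mirror_sptep(iter->sptep) && is_shadow_present_pte(new_spte) &&
	    !try_cmpxchg64(rcu_dereference(iter->sptep), &iter->old_spte,
			   REMOVED_SPTE))
		return -EBUSY;

	/* ...and in handle_changed_spte(), with sptep newly passed in: */
	if (is_mirror && is_present) {
		/* The SPTE is frozen (REMOVED_SPTE) here, so this can't race. */
		KVM_BUG_ON(__set_private_spte_present(kvm, sptep, gfn, old_spte,
						      new_spte, level), kvm);
		/* Unfreeze: publish the final value in the mirror page tables. */
		__kvm_tdp_mmu_write_spte(sptep, new_spte);
	}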