On Wed, Nov 10, 2021 at 02:29:53PM -0800, Ben Gardon wrote:
> When recursively handling a removed TDP page table, the TDP MMU will
> flush the TLBs and queue an RCU callback to free the PT. If the original
> change zapped a non-leaf SPTE at PG_LEVEL_1G or above, that change will
> result in many unnecessary TLB flushes when one would suffice. Queue all
> the PTs which need to be freed on a list and wait to queue RCU callbacks
> to free them until after all the recursive calls are done.
>
> Signed-off-by: Ben Gardon <bgardon@xxxxxxxxxx>
> ---
>  arch/x86/kvm/mmu/tdp_mmu.c | 88 ++++++++++++++++++++++++++++++--------
>  1 file changed, 70 insertions(+), 18 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 866c2b191e1e..5b31d046df78 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -220,7 +220,8 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
>
>  static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
>  				u64 old_spte, u64 new_spte, int level,
> -				bool shared);
> +				bool shared,
> +				struct list_head *disconnected_sps);
>
>  static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
>  {
> @@ -302,6 +303,11 @@ static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
>   * @shared: This operation may not be running under the exclusive use
>   *	    of the MMU lock and the operation must synchronize with other
>   *	    threads that might be modifying SPTEs.
> + * @disconnected_sps: If null, the TLBs will be flushed and the disconnected
> + *		      TDP MMU page will be queued to be freed after an RCU
> + *		      callback. If non-null, the page will be added to the list
> + *		      and flushing the TLBs and queueing an RCU callback to
> + *		      free the page will be the caller's responsibility.
>   *
>   * Given a page table that has been removed from the TDP paging structure,
>   * iterates through the page table to clear SPTEs and free child page tables.
> @@ -312,7 +318,8 @@ static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
>   * early rcu_dereferences in the function.
>   */
>  static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
> -					bool shared)
> +					bool shared,
> +					struct list_head *disconnected_sps)
>  {
>  	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
>  	int level = sp->role.level;
> @@ -371,13 +378,16 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
>  		}
>  		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
>  				    old_child_spte, REMOVED_SPTE, level,
> -				    shared);
> +				    shared, disconnected_sps);
>  	}
>
> -	kvm_flush_remote_tlbs_with_address(kvm, base_gfn,
> -					   KVM_PAGES_PER_HPAGE(level + 1));
> -
> -	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
> +	if (disconnected_sps) {
> +		list_add_tail(&sp->link, disconnected_sps);
> +	} else {
> +		kvm_flush_remote_tlbs_with_address(kvm, base_gfn,
> +						   KVM_PAGES_PER_HPAGE(level + 1));
> +		call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
> +	}
>  }
>
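Just to make sure I'm reading the new contract right: with a non-NULL
disconnected_sps, both the flush and the call_rcu() are deferred, so
whoever kicked off the teardown is expected to do roughly the following
once the recursion is done (an illustrative sketch only, reusing the
parameter names from this patch; not code from the patch itself):

	LIST_HEAD(disconnected_sps);

	/*
	 * Zap the SPTE; every page table disconnected underneath it
	 * ends up on the list instead of being freed immediately.
	 */
	handle_changed_spte(kvm, as_id, gfn, old_spte, REMOVED_SPTE,
			    level, shared, &disconnected_sps);

	/* A single flush now covers the entire zapped range... */
	kvm_flush_remote_tlbs_with_address(kvm, gfn,
					   KVM_PAGES_PER_HPAGE(level + 1));

	/* ...and only after that may the frees be queued. */
	handle_disconnected_sps(kvm, &disconnected_sps);

which matches what tdp_mmu_zap_spte_atomic() does below.
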
>  /**
> @@ -391,13 +401,21 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
>   * @shared: This operation may not be running under the exclusive use of
>   *	    the MMU lock and the operation must synchronize with other
>   *	    threads that might be modifying SPTEs.
> + * @disconnected_sps: Only used if a page of page table memory has been
> + *		      removed from the paging structure by this change.
> + *		      If null, the TLBs will be flushed and the disconnected
> + *		      TDP MMU page will be queued to be freed after an RCU
> + *		      callback. If non-null, the page will be added to the list
> + *		      and flushing the TLBs and queueing an RCU callback to
> + *		      free the page will be the caller's responsibility.
>   *
>   * Handle bookkeeping that might result from the modification of a SPTE.
>   * This function must be called for all TDP SPTE modifications.
>   */
>  static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
>  				  u64 old_spte, u64 new_spte, int level,
> -				  bool shared)
> +				  bool shared,
> +				  struct list_head *disconnected_sps)
>  {
>  	bool was_present = is_shadow_present_pte(old_spte);
>  	bool is_present = is_shadow_present_pte(new_spte);
> @@ -475,22 +493,39 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
>  	 */
>  	if (was_present && !was_leaf && (pfn_changed || !is_present))
>  		handle_removed_tdp_mmu_page(kvm,
> -				spte_to_child_pt(old_spte, level), shared);
> +				spte_to_child_pt(old_spte, level), shared,
> +				disconnected_sps);
>  }
>
>  static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
>  				u64 old_spte, u64 new_spte, int level,
> -				bool shared)
> +				bool shared, struct list_head *disconnected_sps)
>  {
>  	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
> -			      shared);
> +			      shared, disconnected_sps);
>  	handle_changed_spte_acc_track(old_spte, new_spte, level);
>  	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
>  				      new_spte, level);
>  }
>
>  /*
> - * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
> + * The TLBs must be flushed between the pages linked from disconnected_sps
> + * being removed from the paging structure and this function being called.
> + */
> +static void handle_disconnected_sps(struct kvm *kvm,
> +				    struct list_head *disconnected_sps)

handle_disconnected_sps() does a very specific task, so I think we could
go with a more specific function name to make the code more readable.
How about free_sps_rcu() or call_rcu_free_sps()?

> +{
> +	struct kvm_mmu_page *sp;
> +	struct kvm_mmu_page *next;
> +
> +	list_for_each_entry_safe(sp, next, disconnected_sps, link) {
> +		list_del(&sp->link);
> +		call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
> +	}
> +}
> +
> +/*
> + * __tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
>   * and handle the associated bookkeeping. Do not mark the page dirty
>   * in KVM's dirty bitmaps.
>   *
> @@ -500,9 +535,10 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
>   * Returns: true if the SPTE was set, false if it was not. If false is returned,
>   * this function will have no side-effects.
>   */
> -static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
> -					   struct tdp_iter *iter,
> -					   u64 new_spte)
> +static inline bool __tdp_mmu_set_spte_atomic(struct kvm *kvm,
> +					     struct tdp_iter *iter,
> +					     u64 new_spte,
> +					     struct list_head *disconnected_sps)
>  {
>  	lockdep_assert_held_read(&kvm->mmu_lock);
>
> @@ -522,22 +558,32 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
>  		return false;
>
>  	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
> -			      new_spte, iter->level, true);
> +			      new_spte, iter->level, true, disconnected_sps);
>  	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
>
>  	return true;
>  }
>
> +static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
> +					   struct tdp_iter *iter,
> +					   u64 new_spte)
> +{
> +	return __tdp_mmu_set_spte_atomic(kvm, iter, new_spte, NULL);

Why not leverage disconnected_sps here as well? Then you can remove the
NULL case (and comments) from handle_removed_tdp_mmu_page.
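
Something like this, perhaps (completely untested; the flush range just
mirrors what tdp_mmu_zap_spte_atomic() already uses):

	static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
						   struct tdp_iter *iter,
						   u64 new_spte)
	{
		LIST_HEAD(disconnected_sps);

		if (!__tdp_mmu_set_spte_atomic(kvm, iter, new_spte,
					       &disconnected_sps))
			return false;

		/*
		 * Only pay for a flush if page table pages were actually
		 * disconnected, and queue the frees only after the flush.
		 */
		if (!list_empty(&disconnected_sps)) {
			kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
					KVM_PAGES_PER_HPAGE(iter->level + 1));
			handle_disconnected_sps(kvm, &disconnected_sps);
		}

		return true;
	}

Then handle_removed_tdp_mmu_page() can do the list_add_tail()
unconditionally, and the if (disconnected_sps) branch (plus the "If
null, ..." halves of the new comments) goes away.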
> +}
> +
>  static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
>  					   struct tdp_iter *iter)
>  {
> +	LIST_HEAD(disconnected_sps);
> +
>  	/*
>  	 * Freeze the SPTE by setting it to a special,
>  	 * non-present value. This will stop other threads from
>  	 * immediately installing a present entry in its place
>  	 * before the TLBs are flushed.
>  	 */
> -	if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
> +	if (!__tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE,
> +				       &disconnected_sps))
>  		return false;
>
>  	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
> @@ -553,6 +599,8 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
>  	 */
>  	WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
>
> +	handle_disconnected_sps(kvm, &disconnected_sps);
> +
>  	return true;
>  }
>
> @@ -577,6 +625,8 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
>  			  u64 new_spte, bool record_acc_track,
>  			  bool record_dirty_log)
>  {
> +	LIST_HEAD(disconnected_sps);
> +
>  	lockdep_assert_held_write(&kvm->mmu_lock);
>
>  	/*
> @@ -591,7 +641,7 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
>  	WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
>
>  	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
> -			      new_spte, iter->level, false);
> +			      new_spte, iter->level, false, &disconnected_sps);
>  	if (record_acc_track)
>  		handle_changed_spte_acc_track(iter->old_spte, new_spte,
>  					      iter->level);
> @@ -599,6 +649,8 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
>  		handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
>  					      iter->old_spte, new_spte,
>  					      iter->level);
> +
> +	handle_disconnected_sps(kvm, &disconnected_sps);

Where is the TLB flush for these disconnected_sps?

>  }
>
>  static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
> --
> 2.34.0.rc0.344.g81b53c2807-goog
>
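
As far as I can tell this is the only caller of handle_disconnected_sps()
that doesn't flush first, which contradicts the comment above that
function. If the callers of __tdp_mmu_set_spte() can't be relied on to
flush before the pages are freed, I'd expect something like this
(untested) ahead of the handle_disconnected_sps() call:

	/* Honor the "flush before freeing" rule for the deferred pages. */
	if (!list_empty(&disconnected_sps))
		kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
				KVM_PAGES_PER_HPAGE(iter->level + 1));

	handle_disconnected_sps(kvm, &disconnected_sps);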