On Tue, May 03, 2022, Vipin Sharma wrote:
> KVM memory shrinker is only used in the shadow paging. Most of the L1
> guests are backed by TDP (Two Dimensional Paging) which do not use the
> shrinker, only L2 guests are backed by shadow paging.

Nit, the TDP MMU doesn't use the shrinker, but the legacy MMU, which supports
TDP, does.

> KVM memory shrinker can cause guests performance to degrade if any other
> process (VM or non-VM) in the same or different cgroup in kernel causes
> memory shrinker to run.

Ah, but digging into this aspect reveals that per-memcg shrinkers were added
in 2015 by commit cb731d6c62bb ("vmscan: per memory cgroup slab shrinkers").
I haven't dug too deep, but presumably it wouldn't be all that difficult to
support SHRINKER_MEMCG_AWARE in KVM; see the rough sketch at the end of this
mail.

That's quite tempting to support as it would/could guard against an
unintentional DoS of sorts against L1 from L2, e.g. if L1 doesn't cap the
number of TDP SPTEs it creates (*cough* TDP MMU *cough*) and ends up creating
a large number of SPTEs for L2.  IIUC, that sort of scenario was the primary
motivation for commit 2de4085cccea ("KVM: x86/MMU: Recursively zap nested TDP
SPs when zapping last/only parent").

> The KVM memory shrinker was introduced in 2008,
> commit 3ee16c814511 ("KVM: MMU: allow the vm to shrink the kvm mmu
> shadow caches"), each invocation of shrinker only released 1 shadow page
> in 1 VM. This behavior was not effective until the batch zapping commit
> was added in 2020, commit ebdb292dac79 ("KVM: x86/mmu: Batch zap MMU
> pages when shrinking the slab"), which zaps multiple pages but still in
> 1 VM for each shrink invocation. Overall, this feature existed for many
> years without providing meaningful benefit.
>
> Signed-off-by: Vipin Sharma <vipinsh@xxxxxxxxxx>
> ---
>  arch/x86/kvm/mmu/mmu.c | 99 ++----------------------------------------
>  1 file changed, 3 insertions(+), 96 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 4e8d546431eb..80618c847ce2 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -178,7 +178,6 @@ struct kvm_shadow_walk_iterator {
>
>  static struct kmem_cache *pte_list_desc_cache;
>  struct kmem_cache *mmu_page_header_cache;
> -static struct percpu_counter kvm_total_used_mmu_pages;
>
>  static void mmu_spte_set(u64 *sptep, u64 spte);
>
> @@ -1658,16 +1657,9 @@ static int is_empty_shadow_page(u64 *spt)
>  }
>  #endif
>
> -/*
> - * This value is the sum of all of the kvm instances's
> - * kvm->arch.n_used_mmu_pages values. We need a global,
> - * aggregate version in order to make the slab shrinker
> - * faster
> - */
> -static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
> +static inline void kvm_used_mmu_pages(struct kvm *kvm, long nr)

This rename is unnecessary, and I much prefer the existing name;
kvm_used_mmu_pages() sounds like an accessor to get the number of used pages.

>  {
>          kvm->arch.n_used_mmu_pages += nr;
> -        percpu_counter_add(&kvm_total_used_mmu_pages, nr);
>  }
>
>  static void kvm_mmu_free_page(struct kvm_mmu_page *sp)

...

> -static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
> -{
> -        return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
> -}

zapped_obsolete_pages can be dropped, its sole purpose was to expose those
pages to the shrinker.
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c59fea4bdb6e..15b71de6f6fe 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1065,7 +1065,6 @@ struct kvm_arch {
         u8 mmu_valid_gen;
         struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
         struct list_head active_mmu_pages;
-        struct list_head zapped_obsolete_pages;
         struct list_head lpage_disallowed_mmu_pages;
         struct kvm_page_track_notifier_node mmu_sp_tracker;
         struct kvm_page_track_notifier_head track_notifier_head;
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 51443a8e779a..299c9297418e 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5657,6 +5657,7 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
 {
         struct kvm_mmu_page *sp, *node;
         int nr_zapped, batch = 0;
+        LIST_HEAD(zapped_pages);

 restart:
         list_for_each_entry_safe_reverse(sp, node,
@@ -5688,8 +5689,7 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
                         goto restart;
                 }

-                if (__kvm_mmu_prepare_zap_page(kvm, sp,
-                                &kvm->arch.zapped_obsolete_pages, &nr_zapped)) {
+                if (__kvm_mmu_prepare_zap_page(kvm, sp, &zapped_pages, &nr_zapped)) {
                         batch += nr_zapped;
                         goto restart;
                 }
@@ -5704,7 +5704,7 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
          * kvm_mmu_load()), and the reload in the caller ensure no vCPUs are
          * running with an obsolete MMU.
          */
-        kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
+        kvm_mmu_commit_zap_page(kvm, &zapped_pages);
 }

 /*
@@ -5780,7 +5780,6 @@ int kvm_mmu_init_vm(struct kvm *kvm)
         int r;

         INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
-        INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
         INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
         spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
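
And re: SHRINKER_MEMCG_AWARE, a rough, completely untested sketch of what I'm
thinking.  The .flags addition and the sc->memcg check are the only new bits;
the rest is the existing mmu_shrink_scan() trimmed down.  Note that
kvm_vm_charged_to_memcg() is a made-up placeholder for "are this VM's MMU
pages charged to the memcg being reclaimed", figuring out how to track that
association is the part that needs actual design work.

static unsigned long
mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
        struct kvm *kvm;
        unsigned long freed = 0;
        int idx;

        mutex_lock(&kvm_lock);

        list_for_each_entry(kvm, &vm_list, vm_list) {
                /*
                 * sc->memcg is non-NULL for per-memcg reclaim; skip VMs whose
                 * MMU pages aren't charged to the target memcg so that one
                 * cgroup's reclaim doesn't zap another cgroup's VMs.
                 * kvm_vm_charged_to_memcg() doesn't exist, it's a strawman.
                 */
                if (sc->memcg && !kvm_vm_charged_to_memcg(kvm, sc->memcg))
                        continue;

                if (!kvm->arch.n_used_mmu_pages)
                        continue;

                idx = srcu_read_lock(&kvm->srcu);
                write_lock(&kvm->mmu_lock);

                freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);

                write_unlock(&kvm->mmu_lock);
                srcu_read_unlock(&kvm->srcu, idx);
                break;
        }

        mutex_unlock(&kvm_lock);

        return freed;
}

static struct shrinker mmu_shrinker = {
        .count_objects = mmu_shrink_count,
        .scan_objects = mmu_shrink_scan,
        .seeks = DEFAULT_SEEKS * 10,
        .flags = SHRINKER_MEMCG_AWARE,
};

mmu_shrink_count() would need the same sc->memcg filtering, and without
kvm_total_used_mmu_pages that likely means walking vm_list in the count
callback too, so keeping that path cheap is something to keep an eye on.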