On Fri, Mar 11, 2022, Yosry Ahmed wrote:
> Count the pages used by KVM for page tables in pagetable memcg stats in
> memory.stat.

Why?  Is it problematic to count these as kernel memory as opposed to page
tables?  What is gained/lost by tracking these as page table allocations?
E.g. won't this pollute the information about the host page tables for the
userspace process?

When you asked about stats, I thought you meant KVM stats :-)

> Most pages used for KVM page tables come from the mmu_shadow_page_cache,
> in addition to a few allocations in __kvm_mmu_create() and
> mmu_alloc_special_roots().
>
> For allocations from the mmu_shadow_page_cache, the pages are counted as
> pagetables when they are actually used by KVM (when
> mmu_memory_cache_alloc_obj() is called), rather than when they are
> allocated in the cache itself. In other words, pages sitting in the
> cache are not counted as pagetables (they are still accounted as kernel
> memory).
>
> The reason for this is to avoid the complexity and confusion of
> incrementing the stats in the cache layer, while decrementing them
> by the cache users when they are being freed (pages are freed directly
> and not returned to the cache).
> For the sake of simplicity, the stats are incremented and decremented by
> the users of the cache when they get the page and when they free it.
>
> Signed-off-by: Yosry Ahmed <yosryahmed@xxxxxxxxxx>
> ---
>  arch/x86/include/asm/kvm_host.h |  7 +++++++
>  arch/x86/kvm/mmu/mmu.c          | 19 +++++++++++++++++++
>  arch/x86/kvm/mmu/tdp_mmu.c      |  4 ++++
>  virt/kvm/kvm_main.c             | 17 +++++++++++++++++
>  4 files changed, 47 insertions(+)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index f72e80178ffc..4a1dda2f56e1 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -458,6 +458,13 @@ struct kvm_mmu {
>  	 */
>  	u32 pkru_mask;
>
> +	/*
> +	 * After a page is allocated for any of these roots,
> +	 * increment per-memcg pagetable stats by calling:
> +	 * inc_lruvec_page_state(page, NR_PAGETABLE)
> +	 * Before the page is freed, decrement the stats by calling:
> +	 * dec_lruvec_page_state(page, NR_PAGETABLE).
> +	 */
>  	u64 *pae_root;
>  	u64 *pml4_root;
>  	u64 *pml5_root;

Eh, I would much prefer we don't bother counting these.  They're barely
page tables, more like necessary evils.  And hopefully they'll be gone
soon[*].

[*] https://lore.kernel.org/all/20220329153604.507475-1-jiangshanlai@xxxxxxxxx

> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 3b8da8b0745e..5f87e1b0da91 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -1673,7 +1673,10 @@ static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
>  	MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
>  	hlist_del(&sp->hash_link);
>  	list_del(&sp->link);
> +
> +	dec_lruvec_page_state(virt_to_page(sp->spt), NR_PAGETABLE);

I would strongly prefer to add new helpers to combine this accounting with
KVM's existing accounting.
E.g. for the legacy (not tdp_mmu.c) MMU code:

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 1361eb4599b4..c2cb642157cc 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1668,6 +1668,18 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
 	percpu_counter_add(&kvm_total_used_mmu_pages, nr);
 }

+static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	kvm_mod_used_mmu_pages(kvm, 1);
+	inc_lruvec_page_state(..., NR_PAGETABLE);
+}
+
+static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	kvm_mod_used_mmu_pages(kvm, -1);
+	dec_lruvec_page_state(..., NR_PAGETABLE);
+}
+
 static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
 {
 	MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
@@ -1723,7 +1735,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct
 	 */
 	sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
 	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
-	kvm_mod_used_mmu_pages(vcpu->kvm, +1);
+	kvm_account_mmu_page(vcpu->kvm, sp);
 	return sp;
 }
@@ -2339,7 +2351,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
 			list_add(&sp->link, invalid_list);
 		else
 			list_move(&sp->link, invalid_list);
-		kvm_mod_used_mmu_pages(kvm, -1);
+		kvm_unaccount_mmu_page(kvm, sp);
 	} else {
 		/*
 		 * Remove the active root from the active page list, the root

>  	free_page((unsigned long)sp->spt);
> +

There's a lot of spurious whitespace change in this patch.

>  	if (!sp->role.direct)
>  		free_page((unsigned long)sp->gfns);
>  	kmem_cache_free(mmu_page_header_cache, sp);
> @@ -1711,7 +1714,10 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct
>  	struct kvm_mmu_page *sp;
>
>  	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
> +

More whitespace, though it should just naturally go away.
>  	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
> +	inc_lruvec_page_state(virt_to_page(sp->spt), NR_PAGETABLE);
> +
>  	if (!direct)
>  		sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache);
>  	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
> @@ -3602,6 +3608,10 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
>  	mmu->pml4_root = pml4_root;
>  	mmu->pml5_root = pml5_root;
>
> +	/* Update per-memcg pagetable stats */
> +	inc_lruvec_page_state(virt_to_page(pae_root), NR_PAGETABLE);
> +	inc_lruvec_page_state(virt_to_page(pml4_root), NR_PAGETABLE);
> +	inc_lruvec_page_state(virt_to_page(pml5_root), NR_PAGETABLE);
>  	return 0;
>
>  #ifdef CONFIG_X86_64
> @@ -5554,6 +5564,12 @@ static void free_mmu_pages(struct kvm_mmu *mmu)
>  {
>  	if (!tdp_enabled && mmu->pae_root)
>  		set_memory_encrypted((unsigned long)mmu->pae_root, 1);
> +
> +	/* Update per-memcg pagetable stats */
> +	dec_lruvec_page_state(virt_to_page(mmu->pae_root), NR_PAGETABLE);
> +	dec_lruvec_page_state(virt_to_page(mmu->pml4_root), NR_PAGETABLE);
> +	dec_lruvec_page_state(virt_to_page(mmu->pml5_root), NR_PAGETABLE);
> +
>  	free_page((unsigned long)mmu->pae_root);
>  	free_page((unsigned long)mmu->pml4_root);
>  	free_page((unsigned long)mmu->pml5_root);
> @@ -5591,6 +5607,9 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
>  	if (!page)
>  		return -ENOMEM;
>
> +	/* Update per-memcg pagetable stats */
> +	inc_lruvec_page_state(page, NR_PAGETABLE);
> +
>  	mmu->pae_root = page_address(page);
>
>  	/*
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index af60922906ef..ce8930fd0835 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -64,6 +64,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
>
>  static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
>  {
> +	dec_lruvec_page_state(virt_to_page(sp->spt), NR_PAGETABLE);

I'd prefer to do this in tdp_mmu_{,un}link_sp(); it saves having to add
calls in all paths that allocate TDP MMU pages.
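
For illustration only, one way that could look, mirroring the
kvm_{,un}account_mmu_page() helpers sketched above for the legacy MMU.  The
helper names below are hypothetical and the snippet isn't against any
particular tree:

static void tdp_account_mmu_page(struct kvm_mmu_page *sp)
{
	/* Count the shadow page's backing page as a page table for memcg. */
	inc_lruvec_page_state(virt_to_page(sp->spt), NR_PAGETABLE);
}

static void tdp_unaccount_mmu_page(struct kvm_mmu_page *sp)
{
	/* Undo the accounting before the backing page is freed. */
	dec_lruvec_page_state(virt_to_page(sp->spt), NR_PAGETABLE);
}

Calling these from tdp_mmu_link_sp() and tdp_mmu_unlink_sp() would keep the
accounting in one place regardless of which path allocates or frees the page.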