On Sun, Aug 07, 2022 at 03:01:27PM -0700, isaku.yamahata@xxxxxxxxx wrote:
> From: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
>
> For a private GPA, the CPU refers to a private page table whose contents
> are encrypted.  Dedicated APIs are used to operate on it (e.g. updating/
> reading its PTE entries), and they are expensive.
>
> When KVM resolves a KVM page fault, it walks the page tables.  To reuse
> the existing KVM MMU code and mitigate the heavy cost of directly walking
> the protected (encrypted) page table, allocate one more page that mirrors
> the protected page table for the KVM MMU code to walk directly.  Resolve
> the KVM page fault with the existing code, and do the additional
> operations necessary for the protected page table.  To distinguish the
> cases, the existing KVM page table is called a shared page table (i.e.
> not associated with a protected page table), and a page table associated
> with a protected page table is called a private page table.  The
> relationship is depicted below.
>
> Add a private pointer to struct kvm_mmu_page for the protected page table
> and add helper functions to allocate/initialize/free a protected page
> table page.
>
>             KVM page fault                     |
>                    |                           |
>                    V                           |
>       -------------+----------                 |
>       |                    |                   |
>       V                    V                   |
>   shared GPA          private GPA              |
>       |                    |                   |
>       V                    V                   |
> shared PT root      private PT root            |  protected PT root
>       |                    |                   |          |
>       V                    V                   |          V
>   shared PT           private PT ----propagate----> protected PT
>       |                    |                   |          |
>       |                    \-------------------+------\   |
>       |                                        |      |   |
>       V                                        |      V   V
> shared guest page                              |  private guest page
>                                                |
>     non-encrypted memory                       |    encrypted memory
>                                                |
> PT: page table
> - Shared PT is visible to KVM and used by the CPU.
> - Protected PT is used by the CPU but invisible to KVM.
> - Private PT is visible to KVM but not used by the CPU.  It is used to
>   propagate PT changes to the actual protected PT, which is used by the
>   CPU.
>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
> ---
>  arch/x86/include/asm/kvm_host.h |  1 +
>  arch/x86/kvm/mmu/mmu.c          |  9 ++++
>  arch/x86/kvm/mmu/mmu_internal.h | 73 +++++++++++++++++++++++++++++++++
>  arch/x86/kvm/mmu/tdp_mmu.c      |  2 +
>  4 files changed, 85 insertions(+)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 25835b8c4c12..e4ecf6b8ea0b 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -743,6 +743,7 @@ struct kvm_vcpu_arch {
>  	struct kvm_mmu_memory_cache mmu_shadow_page_cache;
>  	struct kvm_mmu_memory_cache mmu_shadowed_info_cache;
>  	struct kvm_mmu_memory_cache mmu_page_header_cache;
> +	struct kvm_mmu_memory_cache mmu_private_sp_cache;
>
>  	/*
>  	 * QEMU userspace and the guest each have their own FPU state.
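Nit: this is the first appearance of the new cache, so a one-line comment
on the field would help readers who haven't followed the TDX series; e.g.
something like (the wording is only a suggestion):

	struct kvm_mmu_memory_cache mmu_page_header_cache;
	/* Cache of pages backing private_sp, mirrored into the protected PT. */
	struct kvm_mmu_memory_cache mmu_private_sp_cache;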
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 0e5a6dcc4966..aa0819905874 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -669,6 +669,13 @@ static int mmu_topup_shadow_page_cache(struct kvm_vcpu *vcpu)
>  	struct kvm_mmu_memory_cache *mc = &vcpu->arch.mmu_shadow_page_cache;
>  	int start, end, i, r;
>
> +	if (kvm_gfn_shared_mask(vcpu->kvm)) {
> +		r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_private_sp_cache,
> +					       PT64_ROOT_MAX_LEVEL);
> +		if (r)
> +			return r;
> +	}
> +
>  	start = kvm_mmu_memory_cache_nr_free_objects(mc);
>  	r = kvm_mmu_topup_memory_cache(mc, PT64_ROOT_MAX_LEVEL);
>
> @@ -718,6 +725,7 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
>  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
>  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
>  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache);
> +	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_private_sp_cache);
>  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
>  }
>
> @@ -2162,6 +2170,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
>  		sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache);
>
>  	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
> +	kvm_mmu_init_private_sp(sp, NULL);
>
>  	/*
>  	 * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index c9446e4e16e3..d43c01e7e37b 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -82,6 +82,10 @@ struct kvm_mmu_page {
>  	 */
>  	u64 *shadowed_translation;
>
> +#ifdef CONFIG_KVM_MMU_PRIVATE
> +	/* associated private shadow page, e.g. SEPT page. */
> +	void *private_sp;
> +#endif
>  	/* Currently serving as active root */
>  	union {
>  		int root_count;
> @@ -153,6 +157,75 @@ static inline bool is_private_sptep(u64 *sptep)
>  	return is_private_sp(sptep_to_sp(sptep));
> }
>
> +#ifdef CONFIG_KVM_MMU_PRIVATE
> +static inline void *kvm_mmu_private_sp(struct kvm_mmu_page *sp)
> +{
> +	return sp->private_sp;
> +}
> +
> +static inline void kvm_mmu_init_private_sp(struct kvm_mmu_page *sp, void *private_sp)
> +{
> +	sp->private_sp = private_sp;
> +}

How about "kvm_mmu_set_private_sp()"?  It just sets a variable; "init"
implies more than simply setting it.

> +
> +static inline void kvm_mmu_alloc_private_sp(
> +	struct kvm_vcpu *vcpu, struct kvm_mmu_memory_cache *private_sp_cache,
> +	struct kvm_mmu_page *sp)
> +{
> +	/*
> +	 * vcpu == NULL means non-root SPT:
> +	 * vcpu == NULL is used to split a large SPT into smaller SPT.  Root SPT
> +	 * is not a large SPT.
> +	 */

Can we rely on sp->tdp_mmu_root_count to know whether it's a root or
not?  Depending on vcpu looks odd, because it's common to split huge
pages at the per-vcpu level, or maybe I missed something here...

> +	bool is_root = vcpu &&
> +		vcpu->arch.root_mmu.root_role.level == sp->role.level;
> +
> +	if (vcpu)
> +		private_sp_cache = &vcpu->arch.mmu_private_sp_cache;

How about relying entirely on the caller for this, e.g. let the caller
decide whether to use the per-vcpu cache or a per-VM cache?

> +	WARN_ON(!kvm_mmu_page_role_is_private(sp->role));

Please move this to the beginning of the function, because checking for
bad cases should be the first step.
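Putting those three points together, the helper could look something
like the following.  Just an untested sketch; the explicit is_root
parameter and the caller-chosen cache are my assumptions, not part of
this patch:

static inline void kvm_mmu_alloc_private_sp(
	struct kvm_mmu_memory_cache *private_sp_cache, bool is_root,
	struct kvm_mmu_page *sp)
{
	/* Check the bad case first. */
	WARN_ON(!kvm_mmu_page_role_is_private(sp->role));

	if (is_root) {
		/*
		 * The TDX module provides the root Secure-EPT page itself
		 * when the TD vcpu is created, so no page is needed for
		 * the root.
		 */
		sp->private_sp = NULL;
		return;
	}

	/* The caller already picked the per-vcpu or per-VM cache. */
	sp->private_sp = kvm_mmu_memory_cache_alloc(private_sp_cache);
	/* The cache is topped up before fault handling; shouldn't fail. */
	WARN_ON_ONCE(!sp->private_sp);
}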
> +	if (is_root)
> +		/*
> +		 * Because TDX module assigns root Secure-EPT page and set it to
> +		 * Secure-EPTP when TD vcpu is created, secure page table for
> +		 * root isn't needed.
> +		 */
> +		sp->private_sp = NULL;
> +	else {
> +		sp->private_sp = kvm_mmu_memory_cache_alloc(private_sp_cache);
> +		/*
> +		 * Because mmu_private_sp_cache is topped up before starting kvm
> +		 * page fault resolving, the allocation above shouldn't fail.
> +		 */
> +		WARN_ON_ONCE(!sp->private_sp);
> +	}
> +}
> +
> +static inline void kvm_mmu_free_private_sp(struct kvm_mmu_page *sp)
> +{
> +	if (sp->private_sp)
> +		free_page((unsigned long)sp->private_sp);
> +}
> +#else
> +static inline void *kvm_mmu_private_sp(struct kvm_mmu_page *sp)
> +{
> +	return NULL;
> +}
> +
> +static inline void kvm_mmu_init_private_sp(struct kvm_mmu_page *sp, void *private_sp)
> +{
> +}
> +
> +static inline void kvm_mmu_alloc_private_sp(
> +	struct kvm_vcpu *vcpu, struct kvm_mmu_memory_cache *private_sp_cache,
> +	struct kvm_mmu_page *sp)
> +{
> +}
> +
> +static inline void kvm_mmu_free_private_sp(struct kvm_mmu_page *sp)
> +{
> +}
> +#endif
> +
>  static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp)
>  {
>  	/*
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 823c1ef807eb..3f5b019f9774 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -71,6 +71,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
>
>  static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
>  {
> +	kvm_mmu_free_private_sp(sp);
>  	free_page((unsigned long)sp->spt);
>  	kmem_cache_free(mmu_page_header_cache, sp);
>  }
> @@ -300,6 +301,7 @@ static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
>  	sp->gfn = gfn;
>  	sp->ptep = sptep;
>  	sp->tdp_mmu_page = true;
> +	kvm_mmu_init_private_sp(sp, NULL);

This sets to NULL the private_sp that kvm_mmu_alloc_private_sp() has
already allocated; I think it's wrong here.

>
>  	trace_kvm_mmu_get_page(sp, true);
>  }
> --
> 2.25.1
>
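For that last point, assuming the sp header comes back zero-initialized
from its cache (IIRC mmu_page_header_cache is topped up with __GFP_ZERO),
the simplest fix may be to drop the reset here entirely, e.g. (untested):

	sp->gfn = gfn;
	sp->ptep = sptep;
	sp->tdp_mmu_page = true;
	/*
	 * Don't touch sp->private_sp here: it would clobber a private page
	 * already attached by kvm_mmu_alloc_private_sp(), and a freshly
	 * allocated sp should already have private_sp == NULL.
	 */

	trace_kvm_mmu_get_page(sp, true);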