Thanks a lot for the patch, Sean. I know this has been queued for quite a
while, but I just realized I have another question about kvm_mmu_pte_write():

> Remove the update_pte() shadow paging logic, which was obsoleted by
> commit 4731d4c7a077 ("KVM: MMU: out of sync shadow core"), but never
> removed.  As pointed out by Yu, KVM never write protects leaf page
> tables for the purposes of shadow paging, and instead marks their
> associated shadow page as unsync so that the guest can write PTEs at
> will.
>
> The update_pte() path, which predates the unsync logic, optimizes COW
> scenarios by refreshing leaf SPTEs when they are written, as opposed to
> zapping the SPTE, restarting the guest, and installing the new SPTE on
> the subsequent fault.  Since KVM no longer write-protects leaf page
> tables, update_pte() is unreachable and can be dropped.
>
> Reported-by: Yu Zhang <yu.c.zhang@xxxxxxxxx>
> Signed-off-by: Sean Christopherson <seanjc@xxxxxxxxxx>
> ---
>  arch/x86/include/asm/kvm_host.h |  3 --
>  arch/x86/kvm/mmu/mmu.c          | 49 ++-------------------------------
>  arch/x86/kvm/x86.c              |  1 -
>  3 files changed, 2 insertions(+), 51 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 3d6616f6f6ef..ed575c5655dd 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -358,8 +358,6 @@ struct kvm_mmu {
>  	int (*sync_page)(struct kvm_vcpu *vcpu,
>  			 struct kvm_mmu_page *sp);
>  	void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa);
> -	void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> -			   u64 *spte, const void *pte);
>  	hpa_t root_hpa;
>  	gpa_t root_pgd;
>  	union kvm_mmu_role mmu_role;
> @@ -1031,7 +1029,6 @@ struct kvm_arch {
>  struct kvm_vm_stat {
>  	ulong mmu_shadow_zapped;
>  	ulong mmu_pte_write;
> -	ulong mmu_pte_updated;
>  	ulong mmu_pde_zapped;
>  	ulong mmu_flooded;
>  	ulong mmu_recycled;
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 6d16481aa29d..3a2c25852b1f 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -1723,13 +1723,6 @@ static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
>  	return 0;
>  }
>
> -static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
> -				 struct kvm_mmu_page *sp, u64 *spte,
> -				 const void *pte)
> -{
> -	WARN_ON(1);
> -}
> -
>  #define KVM_PAGE_ARRAY_NR 16
>
>  struct kvm_mmu_pages {
> @@ -3813,7 +3806,6 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu,
>  	context->gva_to_gpa = nonpaging_gva_to_gpa;
>  	context->sync_page = nonpaging_sync_page;
>  	context->invlpg = NULL;
> -	context->update_pte = nonpaging_update_pte;
>  	context->root_level = 0;
>  	context->shadow_root_level = PT32E_ROOT_LEVEL;
>  	context->direct_map = true;
> @@ -4395,7 +4387,6 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu,
>  	context->gva_to_gpa = paging64_gva_to_gpa;
>  	context->sync_page = paging64_sync_page;
>  	context->invlpg = paging64_invlpg;
> -	context->update_pte = paging64_update_pte;
>  	context->shadow_root_level = level;
>  	context->direct_map = false;
>  }
> @@ -4424,7 +4415,6 @@ static void paging32_init_context(struct kvm_vcpu *vcpu,
>  	context->gva_to_gpa = paging32_gva_to_gpa;
>  	context->sync_page = paging32_sync_page;
>  	context->invlpg = paging32_invlpg;
> -	context->update_pte = paging32_update_pte;
>  	context->shadow_root_level = PT32E_ROOT_LEVEL;
>  	context->direct_map = false;
>  }
> @@ -4506,7 +4496,6 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
>  	context->page_fault = kvm_tdp_page_fault;
>  	context->sync_page = nonpaging_sync_page;
>  	context->invlpg = NULL;
> -	context->update_pte = nonpaging_update_pte;
>  	context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu);
>  	context->direct_map = true;
>  	context->get_guest_pgd = get_cr3;
> @@ -4678,7 +4667,6 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
>  	context->gva_to_gpa = ept_gva_to_gpa;
>  	context->sync_page = ept_sync_page;
>  	context->invlpg = ept_invlpg;
> -	context->update_pte = ept_update_pte;
>  	context->root_level = level;
>  	context->direct_map = false;
>  	context->mmu_role.as_u64 = new_role.as_u64;
> @@ -4826,19 +4814,6 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
>  }
>  EXPORT_SYMBOL_GPL(kvm_mmu_unload);
>
> -static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
> -				  struct kvm_mmu_page *sp, u64 *spte,
> -				  const void *new)
> -{
> -	if (sp->role.level != PG_LEVEL_4K) {
> -		++vcpu->kvm->stat.mmu_pde_zapped;
> -		return;
> -	}
> -
> -	++vcpu->kvm->stat.mmu_pte_updated;
> -	vcpu->arch.mmu->update_pte(vcpu, sp, spte, new);
> -}
> -
>  static bool need_remote_flush(u64 old, u64 new)
>  {
>  	if (!is_shadow_present_pte(old))
> @@ -4954,22 +4929,6 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
>  	return spte;
>  }
>
> -/*
> - * Ignore various flags when determining if a SPTE can be immediately
> - * overwritten for the current MMU.
> - *  - level: explicitly checked in mmu_pte_write_new_pte(), and will never
> - *           match the current MMU role, as MMU's level tracks the root level.
> - *  - access: updated based on the new guest PTE
> - *  - quadrant: handled by get_written_sptes()
> - *  - invalid: always false (loop only walks valid shadow pages)
> - */
> -static const union kvm_mmu_page_role role_ign = {
> -	.level = 0xf,
> -	.access = 0x7,
> -	.quadrant = 0x3,
> -	.invalid = 0x1,
> -};
> -
>  static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
>  			      const u8 *new, int bytes,
>  			      struct kvm_page_track_notifier_node *node)
> @@ -5020,14 +4979,10 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
>
>  		local_flush = true;
>  		while (npte--) {
> -			u32 base_role = vcpu->arch.mmu->mmu_role.base.word;
> -
>  			entry = *spte;
>  			mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL);
> -			if (gentry &&
> -			    !((sp->role.word ^ base_role) & ~role_ign.word) &&
> -			    rmap_can_add(vcpu))
> -				mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
> +			if (gentry && sp->role.level != PG_LEVEL_4K)

I am wondering: is there any chance sp->role.level could be PG_LEVEL_4K in
kvm_mmu_pte_write()? My previous understanding was that, since the gfns of
guest leaf page tables are never page tracked, the sp here should only have a
level greater than PG_LEVEL_4K. Did I miss anything here? Thanks! :)

B.R.
Yu

> +				++vcpu->kvm->stat.mmu_pde_zapped;
>  			if (need_remote_flush(entry, *spte))
>  				remote_flush = true;
>  			++spte;
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index a480804ae27a..d9f5d9acccc1 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -233,7 +233,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
>  	VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
>  	VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
>  	VM_STAT("mmu_pte_write", mmu_pte_write),
> -	VM_STAT("mmu_pte_updated", mmu_pte_updated),
>  	VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
>  	VM_STAT("mmu_flooded", mmu_flooded),
>  	VM_STAT("mmu_recycled", mmu_recycled),
> --
> 2.30.0.284.gd98b1dd5eaa7-goog
>
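
For reference, the understanding above comes from account_shadowed() in
arch/x86/kvm/mmu/mmu.c: only shadow pages above PG_LEVEL_4K have their gfn
registered for write tracking, while leaf guest page tables take the unsync
path instead. Quoting roughly from the tree I believe this patch is based on
(so the excerpt below may not match your branch exactly):

static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *slot;
	gfn_t gfn;

	kvm->arch.indirect_shadow_pages++;
	gfn = sp->gfn;
	slots = kvm_memslots_for_spte_role(kvm, sp->role);
	slot = __gfn_to_memslot(slots, gfn);

	/* the non-leaf shadow pages are keeping readonly. */
	if (sp->role.level > PG_LEVEL_4K)
		return kvm_slot_page_track_add_page(kvm, slot, gfn,
						    KVM_PAGE_TRACK_WRITE);

	kvm_mmu_gfn_disallow_lpage(slot, gfn);
}

So my expectation was that kvm_mmu_pte_write(), being the page-track write
notifier, is only invoked for gfns backing upper-level guest page tables,
and hence every sp it finds for the written gfn should be above PG_LEVEL_4K.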