Re: [PATCH v7 2/5] KVM: x86: Virtualize CR3.LAM_{U48,U57}

> 
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -729,6 +729,12 @@ struct kvm_vcpu_arch {
>  	unsigned long cr0_guest_owned_bits;
>  	unsigned long cr2;
>  	unsigned long cr3;
> +	/*
> +	 * Bits in CR3 used to enable certain features. These bits are allowed
> +	 * to be set in CR3 when vCPU supports the features. When shadow paging
> +	 * is used, these bits should be kept as they are in the shadow CR3.
> +	 */

I don't quite follow the second sentence.  I am not sure what "these bits
should be kept" means.

Those control bits are not the bits that are active in the guest's CR3, but all
the control bits that the guest is allowed to set in CR3.  And those bits depend
on the guest's CPUID, not on whether the guest is using shadow paging.

I think you can just remove the second sentence.
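
I.e., keep only the first sentence, something like (just a sketch, wording up
to you):

	/*
	 * Bits in CR3 used to enable certain features.  These bits are
	 * allowed to be set in CR3 when the vCPU supports the features.
	 */
	u64 cr3_ctrl_bits;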

> +	u64 cr3_ctrl_bits;
>  	unsigned long cr4;
>  	unsigned long cr4_guest_owned_bits;
>  	unsigned long cr4_guest_rsvd_bits;
> diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
> index b1658c0de847..ef8e1b912d7d 100644
> --- a/arch/x86/kvm/cpuid.h
> +++ b/arch/x86/kvm/cpuid.h
> @@ -42,6 +42,11 @@ static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
>  	return vcpu->arch.maxphyaddr;
>  }
>  
> +static inline bool kvm_vcpu_is_legal_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
> +{
> +	return !((cr3 & vcpu->arch.reserved_gpa_bits) & ~vcpu->arch.cr3_ctrl_bits);
> +}
> +
>  static inline bool kvm_vcpu_is_legal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
>  {
>  	return !(gpa & vcpu->arch.reserved_gpa_bits);
> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index 168c46fd8dd1..29985eeb8e12 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -142,6 +142,11 @@ static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu)
>  	return kvm_get_pcid(vcpu, kvm_read_cr3(vcpu));
>  }
>  
> +static inline u64 kvm_get_active_cr3_ctrl_bits(struct kvm_vcpu *vcpu)
> +{
> +	return kvm_read_cr3(vcpu) & vcpu->arch.cr3_ctrl_bits;
> +}
> +
>  static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
>  {
>  	u64 root_hpa = vcpu->arch.mmu->root.hpa;
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index c8ebe542c565..de2c51a0b611 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3732,7 +3732,11 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
>  	hpa_t root;
>  
>  	root_pgd = mmu->get_guest_pgd(vcpu);
> -	root_gfn = root_pgd >> PAGE_SHIFT;
> +	/*
> +	* The guest PGD has already been checked for validity, unconditionally
> +	* strip non-address bits when computing the GFN.
> +	*/

Don't quite follow this comment either.  Can we just say:

	/*
	 * Guest's PGD may contain additional control bits.  Mask them off
	 * to get the GFN.
	 */

That would explain why the PGD has "non-address bits" and why they need to be
masked off.

> +	root_gfn = (root_pgd & __PT_BASE_ADDR_MASK) >> PAGE_SHIFT;

Or, should we explicitly mask off vcpu->arch.cr3_ctrl_bits?  That way, the
mmu_check_root() below may potentially catch other invalid bits, though in
practice there should be no difference, I guess.

>  
>  	if (mmu_check_root(vcpu, root_gfn))
>  		return 1;
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index cc58631e2336..c0479cbc2ca3 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -21,6 +21,7 @@ extern bool dbg;
>  #endif
>  
>  /* Page table builder macros common to shadow (host) PTEs and guest PTEs. */
> +#define __PT_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
>  #define __PT_LEVEL_SHIFT(level, bits_per_level)	\
>  	(PAGE_SHIFT + ((level) - 1) * (bits_per_level))
>  #define __PT_INDEX(address, level, bits_per_level) \
> diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
> index 57f0b75c80f9..88351ba04249 100644
> --- a/arch/x86/kvm/mmu/paging_tmpl.h
> +++ b/arch/x86/kvm/mmu/paging_tmpl.h
> @@ -62,7 +62,7 @@
>  #endif
>  
>  /* Common logic, but per-type values.  These also need to be undefined. */
> -#define PT_BASE_ADDR_MASK	((pt_element_t)(((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)))
> +#define PT_BASE_ADDR_MASK	((pt_element_t)__PT_BASE_ADDR_MASK)
>  #define PT_LVL_ADDR_MASK(lvl)	__PT_LVL_ADDR_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS)
>  #define PT_LVL_OFFSET_MASK(lvl)	__PT_LVL_OFFSET_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS)
>  #define PT_INDEX(addr, lvl)	__PT_INDEX(addr, lvl, PT_LEVEL_BITS)
> @@ -324,6 +324,10 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
>  	trace_kvm_mmu_pagetable_walk(addr, access);
>  retry_walk:
>  	walker->level = mmu->cpu_role.base.level;
> +	/*
> +	 * No need to mask cr3_ctrl_bits, gpte_to_gfn() will strip
> +	 * non-address bits.
> +	 */

I guess it would be helpful to actually call out here that the guest's PGD may
contain control bits.
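
Maybe something like (a sketch only, feel free to reword):

	/*
	 * The guest's PGD may contain additional control bits.  No need
	 * to mask them off here, though, as gpte_to_gfn() will strip the
	 * non-address bits.
	 */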

Also, I am not sure whether it would be better to just explicitly mask the
control bits off here.
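
I.e. (again untested, just a sketch):

	pte           = mmu->get_guest_pgd(vcpu) & ~vcpu->arch.cr3_ctrl_bits;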

>  	pte           = mmu->get_guest_pgd(vcpu);
>  	have_ad       = PT_HAVE_ACCESSED_DIRTY(mmu);
>  
> 

[...]



