On Wed, Nov 23, 2022 at 12:36:59PM +0200, Zhi Wang <zhi.wang.linux@xxxxxxxxx> wrote:
> On Sat, 29 Oct 2022 23:22:17 -0700
> isaku.yamahata@xxxxxxxxx wrote:
> 
> > From: Sean Christopherson <sean.j.christopherson@xxxxxxxxx>
> > 
> > As the first step to create a TDX guest, create/destroy the VM struct.
> > Assign a TDX private Host Key ID (HKID) to the TDX guest for memory
> > encryption and allocate extra pages for the TDX guest. On
> > destruction, free the allocated pages and the HKID.
> > 
> > Before tearing down private page tables, TDX requires some resources
> > of the guest TD to be destroyed (e.g. the keyID must have been
> > reclaimed). Add a flush_shadow_all_private callback that is invoked
> > before tearing down private page tables for this purpose.
> > 
> > Add a second kvm_x86_ops hook in kvm_arch_destroy_vm() to support
> > TDX's destruction path, which needs to first put the VM into a
> > teardown state, then free per-vCPU resources, and finally free per-VM
> > resources.
> > 
> > Co-developed-by: Kai Huang <kai.huang@xxxxxxxxx>
> > Signed-off-by: Kai Huang <kai.huang@xxxxxxxxx>
> > Signed-off-by: Sean Christopherson <sean.j.christopherson@xxxxxxxxx>
> > Signed-off-by: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
> > ---
> >  arch/x86/include/asm/kvm-x86-ops.h |   2 +
> >  arch/x86/include/asm/kvm_host.h    |   2 +
> >  arch/x86/kvm/vmx/main.c            |  34 ++-
> >  arch/x86/kvm/vmx/tdx.c             | 411 +++++++++++++++++++++++++++++
> >  arch/x86/kvm/vmx/tdx.h             |   2 +
> >  arch/x86/kvm/vmx/x86_ops.h         |  11 +
> >  arch/x86/kvm/x86.c                 |   8 +
> >  7 files changed, 467 insertions(+), 3 deletions(-)
> > 
> > diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
> > index 8a5c5ae70bc5..3a29a6b31ee8 100644
> > --- a/arch/x86/include/asm/kvm-x86-ops.h
> > +++ b/arch/x86/include/asm/kvm-x86-ops.h
> > @@ -21,7 +21,9 @@ KVM_X86_OP(has_emulated_msr)
> >  KVM_X86_OP(vcpu_after_set_cpuid)
> >  KVM_X86_OP(is_vm_type_supported)
> >  KVM_X86_OP(vm_init)
> > +KVM_X86_OP_OPTIONAL(flush_shadow_all_private)
> >  KVM_X86_OP_OPTIONAL(vm_destroy)
> > +KVM_X86_OP_OPTIONAL(vm_free)
> >  KVM_X86_OP_OPTIONAL_RET0(vcpu_precreate)
> >  KVM_X86_OP(vcpu_create)
> >  KVM_X86_OP(vcpu_free)
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 2a41a93a80f3..2870155ce6fb 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -1472,7 +1472,9 @@ struct kvm_x86_ops {
> >  	bool (*is_vm_type_supported)(unsigned long vm_type);
> >  	unsigned int vm_size;
> >  	int (*vm_init)(struct kvm *kvm);
> > +	void (*flush_shadow_all_private)(struct kvm *kvm);
> >  	void (*vm_destroy)(struct kvm *kvm);
> > +	void (*vm_free)(struct kvm *kvm);
> >  
> >  	/* Create, but do not attach this VCPU */
> >  	int (*vcpu_precreate)(struct kvm *kvm);
> > diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
> > index 0900ff2f2390..d01a946a18cf 100644
> > --- a/arch/x86/kvm/vmx/main.c
> > +++ b/arch/x86/kvm/vmx/main.c
> > @@ -29,18 +29,44 @@ static __init int vt_hardware_setup(void)
> >  	return 0;
> >  }
> >  
> > +static void vt_hardware_unsetup(void)
> > +{
> > +	tdx_hardware_unsetup();
> > +	vmx_hardware_unsetup();
> > +}
> > +
> >  static int vt_vm_init(struct kvm *kvm)
> >  {
> >  	if (is_td(kvm))
> > -		return -EOPNOTSUPP;	/* Not ready to create guest TD yet. */
> > +		return tdx_vm_init(kvm);
> >  
> >  	return vmx_vm_init(kvm);
> >  }
> >  
> > +static void vt_flush_shadow_all_private(struct kvm *kvm)
> > +{
> > +	if (is_td(kvm))
> > +		return tdx_mmu_release_hkid(kvm);
> > +}
> > +
> > +static void vt_vm_destroy(struct kvm *kvm)
> > +{
> > +	if (is_td(kvm))
> > +		return;
> > +
> > +	vmx_vm_destroy(kvm);
> > +}
> > +
> > +static void vt_vm_free(struct kvm *kvm)
> > +{
> > +	if (is_td(kvm))
> > +		return tdx_vm_free(kvm);
> > +}
> > +
> >  struct kvm_x86_ops vt_x86_ops __initdata = {
> >  	.name = "kvm_intel",
> >  
> > -	.hardware_unsetup = vmx_hardware_unsetup,
> > +	.hardware_unsetup = vt_hardware_unsetup,
> >  	.check_processor_compatibility = vmx_check_processor_compatibility,
> >  
> >  	.hardware_enable = vmx_hardware_enable,
> > @@ -50,7 +76,9 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
> >  	.is_vm_type_supported = vt_is_vm_type_supported,
> >  	.vm_size = sizeof(struct kvm_vmx),
> >  	.vm_init = vt_vm_init,
> > -	.vm_destroy = vmx_vm_destroy,
> > +	.flush_shadow_all_private = vt_flush_shadow_all_private,
> > +	.vm_destroy = vt_vm_destroy,
> > +	.vm_free = vt_vm_free,
> >  
> >  	.vcpu_precreate = vmx_vcpu_precreate,
> >  	.vcpu_create = vmx_vcpu_create,
> > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > index 530e72f85762..ec88dde0d300 100644
> > --- a/arch/x86/kvm/vmx/tdx.c
> > +++ b/arch/x86/kvm/vmx/tdx.c
> > @@ -32,6 +32,401 @@ struct tdx_capabilities {
> >  /* Capabilities of KVM + the TDX module. */
> >  static struct tdx_capabilities tdx_caps;
> >  
> > +/*
> > + * Some TDX SEAMCALLs (TDH.MNG.CREATE, TDH.PHYMEM.CACHE.WB,
> > + * TDH.MNG.KEY.RECLAIMID, TDH.MNG.KEY.FREEID etc) try to acquire a global
> > + * lock internally in the TDX module.  If that fails, TDX_OPERAND_BUSY is
> > + * returned without spinning or waiting due to a constraint on execution
> > + * time.  It's the caller's responsibility to avoid races (or retry on
> > + * TDX_OPERAND_BUSY).  Use this mutex to avoid races in the TDX module
> > + * because the kernel knows better about scheduling.
> > + */
> > +static DEFINE_MUTEX(tdx_lock);
> > +static struct mutex *tdx_mng_key_config_lock;
> > +
> > +static __always_inline hpa_t set_hkid_to_hpa(hpa_t pa, u16 hkid)
> > +{
> > +	return pa | ((hpa_t)hkid << boot_cpu_data.x86_phys_bits);
> > +}
> > +
> > +static inline bool is_td_created(struct kvm_tdx *kvm_tdx)
> > +{
> > +	return kvm_tdx->tdr.added;
> > +}
> > +
> > +static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx)
> > +{
> > +	tdx_keyid_free(kvm_tdx->hkid);
> > +	kvm_tdx->hkid = -1;
> > +}
> > +
> > +static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
> > +{
> > +	return kvm_tdx->hkid > 0;
> > +}
> > +
> > +static void tdx_clear_page(unsigned long page)
> > +{
> > +	const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
> > +	unsigned long i;
> > +
> > +	/*
> > +	 * Zeroing the page is only necessary for systems with MKTME-i:
> > +	 * when re-assigning a page from an old keyid to a new keyid,
> > +	 * MOVDIR64B is required to clear/write the page with the new keyid
> > +	 * to prevent an integrity error when the page is read with the new
> > +	 * keyid.
> > +	 *
> > +	 * The cache line could be poisoned (even without MKTME-i); clear
> > +	 * the poison bit.
> > +	 */
> > +	for (i = 0; i < PAGE_SIZE; i += 64)
> > +		movdir64b((void *)(page + i), zero_page);
> 
> In patch 11, clflush is used on the movdir64b target. Should we also
> use clflush here?
> 
> I suppose the clflush is to prevent an unexpected cacheline writeback
> after the movdir64b.
No, clflush doesn't work here.  The cache lines tagged with the guest's
HKID need to be flushed, and (potential) poison needs to be cleared.  In
the host OS/VMM, clflush only flushes the lines cached with HKID=0 (or
the HKID of MKTME), and clflush doesn't clear poison.
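
For illustration only, a minimal sketch of the direct-store clear (this
is not the patch code; the function name is made up and the barrier
choice is my assumption, not taken from the patch):

/*
 * Illustrative sketch, not tdx_clear_page() itself.  MOVDIR64B is a
 * direct store: it writes the whole 64-byte line without reading the
 * destination first, so the line is (re)written with the CPU's current
 * KeyID and any poison on it is overwritten.  A clflush loop would only
 * write back/invalidate the lines as cached under the host's HKID=0
 * alias and would leave the poison in place.
 */
static void clear_page_by_direct_store(unsigned long page)
{
	const void *zero_page = __va(page_to_phys(ZERO_PAGE(0)));
	unsigned long i;

	for (i = 0; i < PAGE_SIZE; i += 64)
		movdir64b((void *)(page + i), zero_page);

	/*
	 * Direct stores are weakly ordered; fence before the page is
	 * handed out for reuse.
	 */
	mb();
}

-- 
Isaku Yamahata <isaku.yamahata@xxxxxxxxx>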