On Wed, Nov 23, 2022 at 12:36:59PM +0200, Zhi Wang <zhi.wang.linux@xxxxxxxxx> wrote:
> On Sat, 29 Oct 2022 23:22:17 -0700
> isaku.yamahata@xxxxxxxxx wrote:
> 
> > From: Sean Christopherson <sean.j.christopherson@xxxxxxxxx>
> > 
> > As the first step to create a TDX guest, create/destroy the VM struct.
> > Assign a TDX private Host Key ID (HKID) to the TDX guest for memory
> > encryption and allocate extra pages for the TDX guest. On
> > destruction, free the allocated pages and the HKID.
> > 
> > Before tearing down private page tables, TDX requires some resources
> > of the guest TD to be destroyed (e.g. the keyID must have been
> > reclaimed). Add a flush_shadow_all_private callback that is invoked
> > before tearing down private page tables for this purpose.
> > 
> > Add a second kvm_x86_ops hook in kvm_arch_destroy_vm() to support
> > TDX's destruction path, which needs to first put the VM into a
> > teardown state, then free per-vCPU resources, and finally free per-VM
> > resources.
> > 
> > Co-developed-by: Kai Huang <kai.huang@xxxxxxxxx>
> > Signed-off-by: Kai Huang <kai.huang@xxxxxxxxx>
> > Signed-off-by: Sean Christopherson <sean.j.christopherson@xxxxxxxxx>
> > Signed-off-by: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
> > ---
> >  arch/x86/include/asm/kvm-x86-ops.h |   2 +
> >  arch/x86/include/asm/kvm_host.h    |   2 +
> >  arch/x86/kvm/vmx/main.c            |  34 ++-
> >  arch/x86/kvm/vmx/tdx.c             | 411 +++++++++++++++++++++++++++++
> >  arch/x86/kvm/vmx/tdx.h             |   2 +
> >  arch/x86/kvm/vmx/x86_ops.h         |  11 +
> >  arch/x86/kvm/x86.c                 |   8 +
> >  7 files changed, 467 insertions(+), 3 deletions(-)
> > 
> > diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
> > index 8a5c5ae70bc5..3a29a6b31ee8 100644
> > --- a/arch/x86/include/asm/kvm-x86-ops.h
> > +++ b/arch/x86/include/asm/kvm-x86-ops.h
> > @@ -21,7 +21,9 @@ KVM_X86_OP(has_emulated_msr)
> >  KVM_X86_OP(vcpu_after_set_cpuid)
> >  KVM_X86_OP(is_vm_type_supported)
> >  KVM_X86_OP(vm_init)
> > +KVM_X86_OP_OPTIONAL(flush_shadow_all_private)
> >  KVM_X86_OP_OPTIONAL(vm_destroy)
> > +KVM_X86_OP_OPTIONAL(vm_free)
> >  KVM_X86_OP_OPTIONAL_RET0(vcpu_precreate)
> >  KVM_X86_OP(vcpu_create)
> >  KVM_X86_OP(vcpu_free)
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 2a41a93a80f3..2870155ce6fb 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -1472,7 +1472,9 @@ struct kvm_x86_ops {
> >  	bool (*is_vm_type_supported)(unsigned long vm_type);
> >  	unsigned int vm_size;
> >  	int (*vm_init)(struct kvm *kvm);
> > +	void (*flush_shadow_all_private)(struct kvm *kvm);
> >  	void (*vm_destroy)(struct kvm *kvm);
> > +	void (*vm_free)(struct kvm *kvm);
> >  
> >  	/* Create, but do not attach this VCPU */
> >  	int (*vcpu_precreate)(struct kvm *kvm);
> > diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
> > index 0900ff2f2390..d01a946a18cf 100644
> > --- a/arch/x86/kvm/vmx/main.c
> > +++ b/arch/x86/kvm/vmx/main.c
> > @@ -29,18 +29,44 @@ static __init int vt_hardware_setup(void)
> >  	return 0;
> >  }
> >  
> > +static void vt_hardware_unsetup(void)
> > +{
> > +	tdx_hardware_unsetup();
> > +	vmx_hardware_unsetup();
> > +}
> > +
> >  static int vt_vm_init(struct kvm *kvm)
> >  {
> >  	if (is_td(kvm))
> > -		return -EOPNOTSUPP;	/* Not ready to create guest TD yet. */
> > +		return tdx_vm_init(kvm);
> >  
> >  	return vmx_vm_init(kvm);
> >  }
> >  
> > +static void vt_flush_shadow_all_private(struct kvm *kvm)
> > +{
> > +	if (is_td(kvm))
> > +		return tdx_mmu_release_hkid(kvm);
> > +}
> > +
> > +static void vt_vm_destroy(struct kvm *kvm)
> > +{
> > +	if (is_td(kvm))
> > +		return;
> > +
> > +	vmx_vm_destroy(kvm);
> > +}
> > +
> > +static void vt_vm_free(struct kvm *kvm)
> > +{
> > +	if (is_td(kvm))
> > +		return tdx_vm_free(kvm);
> > +}
> > +
> >  struct kvm_x86_ops vt_x86_ops __initdata = {
> >  	.name = "kvm_intel",
> >  
> > -	.hardware_unsetup = vmx_hardware_unsetup,
> > +	.hardware_unsetup = vt_hardware_unsetup,
> >  	.check_processor_compatibility = vmx_check_processor_compatibility,
> >  
> >  	.hardware_enable = vmx_hardware_enable,
> > @@ -50,7 +76,9 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
> >  	.is_vm_type_supported = vt_is_vm_type_supported,
> >  	.vm_size = sizeof(struct kvm_vmx),
> >  	.vm_init = vt_vm_init,
> > -	.vm_destroy = vmx_vm_destroy,
> > +	.flush_shadow_all_private = vt_flush_shadow_all_private,
> > +	.vm_destroy = vt_vm_destroy,
> > +	.vm_free = vt_vm_free,
> >  
> >  	.vcpu_precreate = vmx_vcpu_precreate,
> >  	.vcpu_create = vmx_vcpu_create,
> > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > index 530e72f85762..ec88dde0d300 100644
> > --- a/arch/x86/kvm/vmx/tdx.c
> > +++ b/arch/x86/kvm/vmx/tdx.c
> > @@ -32,6 +32,401 @@ struct tdx_capabilities {
> >  /* Capabilities of KVM + the TDX module. */
> >  static struct tdx_capabilities tdx_caps;
> >  
> > +/*
> > + * Some TDX SEAMCALLs (TDH.MNG.CREATE, TDH.PHYMEM.CACHE.WB,
> > + * TDH.MNG.KEY.RECLAIMID, TDH.MNG.KEY.FREEID etc) try to acquire a global
> > + * lock internally in the TDX module.  If that fails, TDX_OPERAND_BUSY is
> > + * returned without spinning or waiting due to a constraint on execution
> > + * time.  It's the caller's responsibility to avoid races (or retry on
> > + * TDX_OPERAND_BUSY).  Use this mutex to avoid races in the TDX module
> > + * because the kernel knows better about scheduling.
> > + */
> > +static DEFINE_MUTEX(tdx_lock);
> > +static struct mutex *tdx_mng_key_config_lock;
> > +
> > +static __always_inline hpa_t set_hkid_to_hpa(hpa_t pa, u16 hkid)
> > +{
> > +	return pa | ((hpa_t)hkid << boot_cpu_data.x86_phys_bits);
> > +}
> > +
> > +static inline bool is_td_created(struct kvm_tdx *kvm_tdx)
> > +{
> > +	return kvm_tdx->tdr.added;
> > +}
> > +
> > +static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx)
> > +{
> > +	tdx_keyid_free(kvm_tdx->hkid);
> > +	kvm_tdx->hkid = -1;
> > +}
> > +
> > +static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
> > +{
> > +	return kvm_tdx->hkid > 0;
> > +}
> > +
> > +static void tdx_clear_page(unsigned long page)
> > +{
> > +	const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
> > +	unsigned long i;
> > +
> > +	/*
> > +	 * Zeroing the page is only necessary for systems with MKTME-i:
> > +	 * when re-assigning a page from an old keyid to a new keyid,
> > +	 * MOVDIR64B is required to clear/write the page with the new keyid
> > +	 * to prevent an integrity error when the page is read with the new
> > +	 * keyid.
> > +	 *
> > +	 * The cache line could be poisoned (even without MKTME-i); clear
> > +	 * the poison bit.
> > +	 */
> > +	for (i = 0; i < PAGE_SIZE; i += 64)
> > +		movdir64b((void *)(page + i), zero_page);
> 
> In patch 11, clflush is used on the movdir64b target. Should we also
> use clflush here?
> 
> I suppose the clflush is to prevent an unexpected cacheline writeback
> after the movdir64b.
No, clflush doesn't work here.  The cache lines tagged with the guest's
HKID need to be flushed, and (potential) poison needs to be cleared.  In
the host OS/VMM, clflush only flushes the lines cached with HKID=0 (or
the HKID of MKTME), and clflush doesn't clear poison.
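
For illustration only, a minimal sketch of the direct-store clear (this
is not the patch code; the function name is made up and the barrier
choice is my assumption, not taken from the patch):

/*
 * Illustrative sketch, not tdx_clear_page() itself.  MOVDIR64B is a
 * direct store: it writes the whole 64-byte line without reading the
 * destination first, so the line is (re)written with the CPU's current
 * KeyID and any poison on it is overwritten.  A clflush loop would only
 * write back/invalidate the lines as cached under the host's HKID=0
 * alias and would leave the poison in place.
 */
static void clear_page_by_direct_store(unsigned long page)
{
	const void *zero_page = __va(page_to_phys(ZERO_PAGE(0)));
	unsigned long i;

	for (i = 0; i < PAGE_SIZE; i += 64)
		movdir64b((void *)(page + i), zero_page);

	/*
	 * Direct stores are weakly ordered; fence before the page is
	 * handed out for reuse.
	 */
	mb();
}

-- 
Isaku Yamahata <isaku.yamahata@xxxxxxxxx>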