On Tue, Jan 23, 2024 at 06:41:57PM -0800, Yang Weijiang wrote: >Expose CET features to guest if KVM/host can support them, clear CPUID >feature bits if KVM/host cannot support. > >Set CPUID feature bits so that CET features are available in guest CPUID. >Add CR4.CET bit support in order to allow guest set CET master control >bit. > >Disable KVM CET feature if unrestricted_guest is unsupported/disabled as >KVM does not support emulating CET. > >The CET load-bits in VM_ENTRY/VM_EXIT control fields should be set to make >guest CET xstates isolated from host's. > >On platforms with VMX_BASIC[bit56] == 0, inject #CP at VMX entry with error >code will fail, and if VMX_BASIC[bit56] == 1, #CP injection with or without >error code is allowed. Disable CET feature bits if the MSR bit is cleared >so that nested VMM can inject #CP if and only if VMX_BASIC[bit56] == 1. > >Don't expose CET feature if either of {U,S}_CET xstate bits is cleared >in host XSS or if XSAVES isn't supported. > >CET MSR contents after reset, power-up and INIT are set to 0s, clears the >guest fpstate fields so that the guest MSRs are reset to 0s after the events. > >Signed-off-by: Yang Weijiang <weijiang.yang@xxxxxxxxx> >Reviewed-by: Maxim Levitsky <mlevitsk@xxxxxxxxxx> >--- > arch/x86/include/asm/kvm_host.h | 2 +- > arch/x86/include/asm/msr-index.h | 1 + > arch/x86/kvm/cpuid.c | 25 ++++++++++++++++++++----- > arch/x86/kvm/vmx/capabilities.h | 6 ++++++ > arch/x86/kvm/vmx/vmx.c | 28 +++++++++++++++++++++++++++- > arch/x86/kvm/vmx/vmx.h | 6 ++++-- > arch/x86/kvm/x86.c | 31 +++++++++++++++++++++++++++++-- > arch/x86/kvm/x86.h | 3 +++ > 8 files changed, 91 insertions(+), 11 deletions(-) > >diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h >index 6efaaaa15945..161d0552be5f 100644 >--- a/arch/x86/include/asm/kvm_host.h >+++ b/arch/x86/include/asm/kvm_host.h >@@ -134,7 +134,7 @@ > | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \ > | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \ > | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP \ >- | X86_CR4_LAM_SUP)) >+ | X86_CR4_LAM_SUP | X86_CR4_CET)) > > #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) > >diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h >index 1d51e1850ed0..233e00c01e62 100644 >--- a/arch/x86/include/asm/msr-index.h >+++ b/arch/x86/include/asm/msr-index.h >@@ -1102,6 +1102,7 @@ > #define VMX_BASIC_MEM_TYPE_MASK 0x003c000000000000LLU > #define VMX_BASIC_MEM_TYPE_WB 6LLU > #define VMX_BASIC_INOUT 0x0040000000000000LLU >+#define VMX_BASIC_NO_HW_ERROR_CODE_CC 0x0100000000000000LLU > > /* Resctrl MSRs: */ > /* - Intel: */ >diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c >index 95233b0879a3..fddc54991cd4 100644 >--- a/arch/x86/kvm/cpuid.c >+++ b/arch/x86/kvm/cpuid.c >@@ -150,14 +150,14 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu, > return -EINVAL; > } > /* >- * Prevent 32-bit guest launch if shadow stack is exposed as SSP >- * state is not defined for 32-bit SMRAM. >+ * CET is not supported for 32-bit guest, prevent guest launch if >+ * shadow stack or IBT is enabled for 32-bit guest. > */ > best = cpuid_entry2_find(entries, nent, 0x80000001, > KVM_CPUID_INDEX_NOT_SIGNIFICANT); > if (best && !(best->edx & F(LM))) { > best = cpuid_entry2_find(entries, nent, 0x7, 0); >- if (best && (best->ecx & F(SHSTK))) >+ if (best && ((best->ecx & F(SHSTK)) || (best->edx & F(IBT)))) IBT has nothing to do with SSP. why bother to do this? > return -EINVAL; > } > >@@ -665,7 +665,7 @@ void kvm_set_cpu_caps(void) > F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | > F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | > F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ | >- F(SGX_LC) | F(BUS_LOCK_DETECT) >+ F(SGX_LC) | F(BUS_LOCK_DETECT) | F(SHSTK) > ); > /* Set LA57 based on hardware capability. */ > if (cpuid_ecx(7) & F(LA57)) >@@ -683,7 +683,8 @@ void kvm_set_cpu_caps(void) > F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | > F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) | > F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) | >- F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D) >+ F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D) | >+ F(IBT) > ); > > /* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */ >@@ -696,6 +697,20 @@ void kvm_set_cpu_caps(void) > kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP); > if (boot_cpu_has(X86_FEATURE_AMD_SSBD)) > kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD); >+ /* >+ * Don't use boot_cpu_has() to check availability of IBT because the >+ * feature bit is cleared in boot_cpu_data when ibt=off is applied >+ * in host cmdline. >+ * >+ * As currently there's no HW bug which requires disabling IBT feature >+ * while CPU can enumerate it, host cmdline option ibt=off is most >+ * likely due to administrative reason on host side, so KVM refers to >+ * CPU CPUID enumeration to enable the feature. In future if there's >+ * actually some bug clobbered ibt=off option, then enforce additional >+ * check here to disable the support in KVM. >+ */ >+ if (cpuid_edx(7) & F(IBT)) >+ kvm_cpu_cap_set(X86_FEATURE_IBT); This can be done in a separate patch. And we don't know whether IBT is cleared due to ibt=off. It could be due to lack of IBT on some CPUs; advertising IBT in this case is incorrect. > > kvm_cpu_cap_mask(CPUID_7_1_EAX, > F(AVX_VNNI) | F(AVX512_BF16) | F(CMPCCXADD) | >diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h >index ee8938818c8a..e12bc233d88b 100644 >--- a/arch/x86/kvm/vmx/capabilities.h >+++ b/arch/x86/kvm/vmx/capabilities.h >@@ -79,6 +79,12 @@ static inline bool cpu_has_vmx_basic_inout(void) > return (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT); > } > >+static inline bool cpu_has_vmx_basic_no_hw_errcode(void) >+{ >+ return ((u64)vmcs_config.basic_cap << 32) & >+ VMX_BASIC_NO_HW_ERROR_CODE_CC; >+} >+ > static inline bool cpu_has_virtual_nmis(void) > { > return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS && >diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c >index 3993afbacd51..ef7aca954228 100644 >--- a/arch/x86/kvm/vmx/vmx.c >+++ b/arch/x86/kvm/vmx/vmx.c >@@ -2609,6 +2609,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, > { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER }, > { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS }, > { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL }, >+ { VM_ENTRY_LOAD_CET_STATE, VM_EXIT_LOAD_CET_STATE }, > }; > > memset(vmcs_conf, 0, sizeof(*vmcs_conf)); >@@ -4934,6 +4935,14 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) > > vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ > >+ if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) .. >+ vmcs_writel(GUEST_SSP, 0); >+ if (kvm_cpu_cap_has(X86_FEATURE_SHSTK) || >+ kvm_cpu_cap_has(X86_FEATURE_IBT)) >+ vmcs_writel(GUEST_S_CET, 0); >+ if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) >+ vmcs_writel(GUEST_INTR_SSP_TABLE, 0); At least this can be merged with the first if-statement. how about: if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) { vmcs_writel(GUEST_SSP, 0); vmcs_writel(GUEST_S_CET, 0); vmcs_writel(GUEST_INTR_SSP_TABLE, 0); } else if (kvm_cpu_cap_has(X86_FEATURE_IBT)) { vmcs_writel(GUEST_S_CET, 0); } > kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); > > vpid_sync_context(vmx->vpid); >@@ -6353,6 +6362,12 @@ void dump_vmcs(struct kvm_vcpu *vcpu) > if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0) > vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest); > >+ if (vmentry_ctl & VM_ENTRY_LOAD_CET_STATE) { >+ pr_err("S_CET = 0x%016lx\n", vmcs_readl(GUEST_S_CET)); >+ pr_err("SSP = 0x%016lx\n", vmcs_readl(GUEST_SSP)); >+ pr_err("INTR SSP TABLE = 0x%016lx\n", >+ vmcs_readl(GUEST_INTR_SSP_TABLE)); how about merging them into one line? >+ } > pr_err("*** Host State ***\n"); > pr_err("RIP = 0x%016lx RSP = 0x%016lx\n", > vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP)); >@@ -6430,6 +6445,12 @@ void dump_vmcs(struct kvm_vcpu *vcpu) > if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID) > pr_err("Virtual processor ID = 0x%04x\n", > vmcs_read16(VIRTUAL_PROCESSOR_ID)); >+ if (vmexit_ctl & VM_EXIT_LOAD_CET_STATE) { >+ pr_err("S_CET = 0x%016lx\n", vmcs_readl(HOST_S_CET)); >+ pr_err("SSP = 0x%016lx\n", vmcs_readl(HOST_SSP)); >+ pr_err("INTR SSP TABLE = 0x%016lx\n", >+ vmcs_readl(HOST_INTR_SSP_TABLE)); ditto. >+ } > } > > /* >@@ -7965,7 +7986,6 @@ static __init void vmx_set_cpu_caps(void) > kvm_cpu_cap_set(X86_FEATURE_UMIP); > > /* CPUID 0xD.1 */ >- kvm_caps.supported_xss = 0; > if (!cpu_has_vmx_xsaves()) > kvm_cpu_cap_clear(X86_FEATURE_XSAVES); > >@@ -7977,6 +7997,12 @@ static __init void vmx_set_cpu_caps(void) > > if (cpu_has_vmx_waitpkg()) > kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); >+ >+ if (!cpu_has_load_cet_ctrl() || !enable_unrestricted_guest || >+ !cpu_has_vmx_basic_no_hw_errcode()) { Can you add a comment here? This way, readers won't need to dig through git history to understand the reason. >+ kvm_cpu_cap_clear(X86_FEATURE_SHSTK); >+ kvm_cpu_cap_clear(X86_FEATURE_IBT); >+ } > } > > static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) >diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h >index e3b0985bb74a..d0cad2624564 100644 >--- a/arch/x86/kvm/vmx/vmx.h >+++ b/arch/x86/kvm/vmx/vmx.h >@@ -484,7 +484,8 @@ static inline u8 vmx_get_rvi(void) > VM_ENTRY_LOAD_IA32_EFER | \ > VM_ENTRY_LOAD_BNDCFGS | \ > VM_ENTRY_PT_CONCEAL_PIP | \ >- VM_ENTRY_LOAD_IA32_RTIT_CTL) >+ VM_ENTRY_LOAD_IA32_RTIT_CTL | \ >+ VM_ENTRY_LOAD_CET_STATE) > > #define __KVM_REQUIRED_VMX_VM_EXIT_CONTROLS \ > (VM_EXIT_SAVE_DEBUG_CONTROLS | \ >@@ -506,7 +507,8 @@ static inline u8 vmx_get_rvi(void) > VM_EXIT_LOAD_IA32_EFER | \ > VM_EXIT_CLEAR_BNDCFGS | \ > VM_EXIT_PT_CONCEAL_PIP | \ >- VM_EXIT_CLEAR_IA32_RTIT_CTL) >+ VM_EXIT_CLEAR_IA32_RTIT_CTL | \ >+ VM_EXIT_LOAD_CET_STATE) > > #define KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL \ > (PIN_BASED_EXT_INTR_MASK | \ >diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c >index 9596763fae8d..eb531823447a 100644 >--- a/arch/x86/kvm/x86.c >+++ b/arch/x86/kvm/x86.c >@@ -231,7 +231,8 @@ static struct kvm_user_return_msrs __percpu *user_return_msrs; > | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ > | XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE) > >-#define KVM_SUPPORTED_XSS 0 >+#define KVM_SUPPORTED_XSS (XFEATURE_MASK_CET_USER | \ >+ XFEATURE_MASK_CET_KERNEL) > > u64 __read_mostly host_efer; > EXPORT_SYMBOL_GPL(host_efer); >@@ -9921,6 +9922,20 @@ static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) > if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) > kvm_caps.supported_xss = 0; > >+ if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && >+ !kvm_cpu_cap_has(X86_FEATURE_IBT)) >+ kvm_caps.supported_xss &= ~(XFEATURE_MASK_CET_USER | >+ XFEATURE_MASK_CET_KERNEL); >+ >+ if ((kvm_caps.supported_xss & (XFEATURE_MASK_CET_USER | >+ XFEATURE_MASK_CET_KERNEL)) != >+ (XFEATURE_MASK_CET_USER | XFEATURE_MASK_CET_KERNEL)) { >+ kvm_cpu_cap_clear(X86_FEATURE_SHSTK); >+ kvm_cpu_cap_clear(X86_FEATURE_IBT); >+ kvm_caps.supported_xss &= ~(XFEATURE_MASK_CET_USER | >+ XFEATURE_MASK_CET_KERNEL); >+ } I am not sure why this is necessary. Could you please explain?