On Wed, Feb 07, 2024, Xin Li wrote: > Add FRED VMCS fields to nested VMX context management. > > Todo: change VMCS12_REVISION, as struct vmcs12 is changed. It actually doesn't, the comment is just stale. At this point, KVM must _never_ change VMCS12_REVISION as doing so will break backwards compatibility. I'll post this once I've written a changelog: --- arch/x86/kvm/vmx/vmcs12.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index edf7fcef8ccf..d67bebb9f1c2 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -207,11 +207,9 @@ struct __packed vmcs12 { }; /* - * VMCS12_REVISION is an arbitrary id that should be changed if the content or - * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and - * VMPTRLD verifies that the VMCS region that L1 is loading contains this id. + * VMCS12_REVISION is KVM's arbitrary id for the layout of struct vmcs12. * - * IMPORTANT: Changing this value will break save/restore compatibility with + * DO NOT change this value, as it will break save/restore compatibility with * older kvm releases. */ #define VMCS12_REVISION 0x11e57ed0 @@ -225,7 +223,8 @@ struct __packed vmcs12 { #define VMCS12_SIZE KVM_STATE_NESTED_VMX_VMCS_SIZE /* - * For save/restore compatibility, the vmcs12 field offsets must not change. + * For save/restore compatibility, the vmcs12 field offsets must not change, + * although appending fields and/or filling gaps is obviously allowed. */ #define CHECK_OFFSET(field, loc) \ ASSERT_STRUCT_OFFSET(struct vmcs12, field, loc) base-commit: 878fe4c2f7eead383f2b306cbafd300006dd518c -- > Signed-off-by: Xin Li <xin3.li@xxxxxxxxx> > Tested-by: Shan Kang <shan.kang@xxxxxxxxx> > --- > > Change since v1: > * Remove hyperv TLFS related changes (Jeremi Piotrowski). > * Use kvm_cpu_cap_has() instead of cpu_feature_enabled() (Chao Gao). 
> --- > Documentation/virt/kvm/x86/nested-vmx.rst | 18 +++++ > arch/x86/kvm/vmx/nested.c | 91 +++++++++++++++++++---- > arch/x86/kvm/vmx/vmcs12.c | 18 +++++ > arch/x86/kvm/vmx/vmcs12.h | 36 +++++++++ > arch/x86/kvm/vmx/vmcs_shadow_fields.h | 4 + > 5 files changed, 152 insertions(+), 15 deletions(-) > > diff --git a/Documentation/virt/kvm/x86/nested-vmx.rst b/Documentation/virt/kvm/x86/nested-vmx.rst > index e64ef231f310..87fa9f3877ab 100644 > --- a/Documentation/virt/kvm/x86/nested-vmx.rst > +++ b/Documentation/virt/kvm/x86/nested-vmx.rst > @@ -218,6 +218,24 @@ struct shadow_vmcs is ever changed. > u16 host_gs_selector; > u16 host_tr_selector; > u64 secondary_vm_exit_controls; > + u64 guest_ia32_fred_config; > + u64 guest_ia32_fred_rsp1; > + u64 guest_ia32_fred_rsp2; > + u64 guest_ia32_fred_rsp3; > + u64 guest_ia32_fred_stklvls; > + u64 guest_ia32_fred_ssp1; > + u64 guest_ia32_fred_ssp2; > + u64 guest_ia32_fred_ssp3; > + u64 host_ia32_fred_config; > + u64 host_ia32_fred_rsp1; > + u64 host_ia32_fred_rsp2; > + u64 host_ia32_fred_rsp3; > + u64 host_ia32_fred_stklvls; > + u64 host_ia32_fred_ssp1; > + u64 host_ia32_fred_ssp2; > + u64 host_ia32_fred_ssp3; > + u64 injected_event_data; > + u64 original_event_data; > }; > > > diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c > index 94da6a0a2f81..f9c1fbeac302 100644 > --- a/arch/x86/kvm/vmx/nested.c > +++ b/arch/x86/kvm/vmx/nested.c > @@ -686,6 +686,9 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, > > nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, > MSR_KERNEL_GS_BASE, MSR_TYPE_RW); > + > + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, > + MSR_IA32_FRED_RSP0, MSR_TYPE_RW); > #endif > nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, > MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); > @@ -2498,6 +2501,8 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 > vmcs12->vm_entry_instruction_len); > 
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, > vmcs12->guest_interruptibility_info); > + if (kvm_cpu_cap_has(X86_FEATURE_FRED)) This is wrong, vmcs02 should be set from vmcs12 if and only if the field is enabled in L1's VMX configuration, i.e. iff nested_cpu_has(vmcs12, ???). Note, the ??? should be tied to whatever VMX MSR feature flag enumerates INJECTED_EVENT_DATA. KVM's clearing of X86_FEATURE_FRED when one or more pieces are missing is a software decision, i.e. not architectural. > + vmcs_write64(INJECTED_EVENT_DATA, vmcs12->injected_event_data); > vmx->loaded_vmcs->nmi_known_unmasked = > !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); > } else { > @@ -2548,6 +2553,17 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) > vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); > vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); > > + if (kvm_cpu_cap_has(X86_FEATURE_FRED)) { Same thing here. > + vmcs_write64(GUEST_IA32_FRED_CONFIG, vmcs12->guest_ia32_fred_config); > + vmcs_write64(GUEST_IA32_FRED_RSP1, vmcs12->guest_ia32_fred_rsp1); > + vmcs_write64(GUEST_IA32_FRED_RSP2, vmcs12->guest_ia32_fred_rsp2); > + vmcs_write64(GUEST_IA32_FRED_RSP3, vmcs12->guest_ia32_fred_rsp3); > + vmcs_write64(GUEST_IA32_FRED_STKLVLS, vmcs12->guest_ia32_fred_stklvls); > + vmcs_write64(GUEST_IA32_FRED_SSP1, vmcs12->guest_ia32_fred_ssp1); > + vmcs_write64(GUEST_IA32_FRED_SSP2, vmcs12->guest_ia32_fred_ssp2); > + vmcs_write64(GUEST_IA32_FRED_SSP3, vmcs12->guest_ia32_fred_ssp3); > + } > + > vmx->segment_cache.bitmask = 0; > } > > @@ -3835,6 +3851,22 @@ vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) > vcpu->arch.cr4_guest_owned_bits)); > } > > +static inline unsigned long > +nested_vmx_get_event_data(struct kvm_vcpu *vcpu, bool for_ex_vmexit) Heh, two form letters for the price of one: #1 Do not use "inline" for functions that are visible only to the local compilation unit. 
"inline" is just a hint, and modern compilers are smart enough to inline functions when appropriate without a hint. A longer explanation/rant here: https://lore.kernel.org/all/ZAdfX+S323JVWNZC@xxxxxxxxxx #2 Do not wrap before the function name. Linus has a nice explanation/rant on this[*]. [*] https://lore.kernel.org/all/CAHk-=wjoLAYG446ZNHfg=GhjSY6nFmuB_wA8fYd5iLBNXjo9Bw@xxxxxxxxxxxxxx > +{ > + struct kvm_queued_exception *ex = for_ex_vmexit ? > + &vcpu->arch.exception_vmexit : &vcpu->arch.exception; > + > + if (ex->has_payload) > + return ex->payload; > + else if (ex->vector == PF_VECTOR) > + return vcpu->arch.cr2; > + else if (ex->vector == DB_VECTOR) > + return (vcpu->arch.dr6 & ~DR6_BT) ^ DR6_ACTIVE_LOW; > + else > + return 0; I'll circle back to this on the next version, i.e. after it's reworked to account for the suggested payload changes. I highly doubt it's correct as-is. > static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, > struct vmcs12 *vmcs12, > u32 vm_exit_reason, u32 exit_intr_info) > @@ -3842,6 +3874,8 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, > u32 idt_vectoring; > unsigned int nr; > > + vmcs12->original_event_data = 0; > + > /* > * Per the SDM, VM-Exits due to double and triple faults are never > * considered to occur during event delivery, even if the double/triple > @@ -3880,6 +3914,12 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, > vcpu->arch.exception.error_code; > } > > + idt_vectoring |= vcpu->arch.exception.nested ? > + INTR_INFO_NESTED_EXCEPTION_MASK : 0; Please stop using ternary operators this way. 
It's less readable and the same number of lines as: if (vcpu->arch.exception.nested) idt_vectoring |= INTR_INFO_NESTED_EXCEPTION_MASK; > + > + vmcs12->original_event_data = > + nested_vmx_get_event_data(vcpu, false); > + > vmcs12->idt_vectoring_info_field = idt_vectoring; > } else if (vcpu->arch.nmi_injected) { > vmcs12->idt_vectoring_info_field = > @@ -3970,19 +4010,7 @@ static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu) > struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit; > u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; > struct vmcs12 *vmcs12 = get_vmcs12(vcpu); > - unsigned long exit_qual; > - > - if (ex->has_payload) { > - exit_qual = ex->payload; > - } else if (ex->vector == PF_VECTOR) { > - exit_qual = vcpu->arch.cr2; > - } else if (ex->vector == DB_VECTOR) { > - exit_qual = vcpu->arch.dr6; > - exit_qual &= ~DR6_BT; > - exit_qual ^= DR6_ACTIVE_LOW; > - } else { > - exit_qual = 0; > - } > + unsigned long exit_qual = nested_vmx_get_event_data(vcpu, true); This can't possibly be correct, EXIT_QUAL and EVENT_DATA aren't equivalent, e.g. the former doesn't have XFD_ERR, but the latter does. > /* > * Unlike AMD's Paged Real Mode, which reports an error code on #PF > @@ -4003,10 +4031,12 @@ static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu) > intr_info |= INTR_INFO_DELIVER_CODE_MASK; > } > > - if (kvm_exception_is_soft(ex->vector)) > + if (kvm_exception_is_soft(ex->vector)) { > intr_info |= INTR_TYPE_SOFT_EXCEPTION; > - else > + } else { > intr_info |= INTR_TYPE_HARD_EXCEPTION; > + intr_info |= ex->nested ? 
INTR_INFO_NESTED_EXCEPTION_MASK : 0; Again, if (ex->nested) intr_info |= INTR_INFO_NESTED_EXCEPTION_MASK; > + if (kvm_cpu_cap_has(X86_FEATURE_FRED)) { And here > + vmcs12->guest_ia32_fred_config = vmcs_read64(GUEST_IA32_FRED_CONFIG); > + vmcs12->guest_ia32_fred_rsp1 = vmcs_read64(GUEST_IA32_FRED_RSP1); > + vmcs12->guest_ia32_fred_rsp2 = vmcs_read64(GUEST_IA32_FRED_RSP2); > + vmcs12->guest_ia32_fred_rsp3 = vmcs_read64(GUEST_IA32_FRED_RSP3); > + vmcs12->guest_ia32_fred_stklvls = vmcs_read64(GUEST_IA32_FRED_STKLVLS); > + vmcs12->guest_ia32_fred_ssp1 = vmcs_read64(GUEST_IA32_FRED_SSP1); > + vmcs12->guest_ia32_fred_ssp2 = vmcs_read64(GUEST_IA32_FRED_SSP2); > + vmcs12->guest_ia32_fred_ssp3 = vmcs_read64(GUEST_IA32_FRED_SSP3); > + } > + > vmcs12->guest_pending_dbg_exceptions = > vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); > > @@ -4625,6 +4675,17 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, > vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); > vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); > > + if (kvm_cpu_cap_has(X86_FEATURE_FRED)) { And here