On Sun, 2021-05-30 at 12:51 +0300, Maxim Levitsky wrote:
> On Fri, 2021-05-28 at 11:57 +0100, Ilias Stamatis wrote:
> > Currently vmx_vcpu_load_vmcs() writes the TSC_MULTIPLIER field of the
> > VMCS every time the VMCS is loaded. Instead of doing this, set this
> > field from common code on initialization and whenever the scaling ratio
> > changes.
> > 
> > Additionally remove vmx->current_tsc_ratio. This field is redundant as
> > vcpu->arch.tsc_scaling_ratio already tracks the current TSC scaling
> > ratio. The vmx->current_tsc_ratio field is only used for avoiding
> > unnecessary writes but it is no longer needed after removing the code
> > from the VMCS load path.
> > 
> > Suggested-by: Sean Christopherson <seanjc@xxxxxxxxxx>
> > Signed-off-by: Ilias Stamatis <ilstam@xxxxxxxxxx>
> > ---
> >  arch/x86/include/asm/kvm-x86-ops.h |  1 +
> >  arch/x86/include/asm/kvm_host.h    |  1 +
> >  arch/x86/kvm/svm/svm.c             |  6 ++++++
> >  arch/x86/kvm/vmx/nested.c          |  9 ++++-----
> >  arch/x86/kvm/vmx/vmx.c             | 11 ++++++-----
> >  arch/x86/kvm/vmx/vmx.h             |  8 --------
> >  arch/x86/kvm/x86.c                 | 30 +++++++++++++++++++++++-------
> >  7 files changed, 41 insertions(+), 25 deletions(-)
> > 
> > diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
> > index 029c9615378f..34ad7a17458a 100644
> > --- a/arch/x86/include/asm/kvm-x86-ops.h
> > +++ b/arch/x86/include/asm/kvm-x86-ops.h
> > @@ -90,6 +90,7 @@ KVM_X86_OP_NULL(has_wbinvd_exit)
> >  KVM_X86_OP(get_l2_tsc_offset)
> >  KVM_X86_OP(get_l2_tsc_multiplier)
> >  KVM_X86_OP(write_tsc_offset)
> > +KVM_X86_OP(write_tsc_multiplier)
> >  KVM_X86_OP(get_exit_info)
> >  KVM_X86_OP(check_intercept)
> >  KVM_X86_OP(handle_exit_irqoff)
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index f099277b993d..a334ce7741ab 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -1308,6 +1308,7 @@ struct kvm_x86_ops {
> >          u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu);
> >          u64 (*get_l2_tsc_multiplier)(struct kvm_vcpu *vcpu);
> >          void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
> > +        void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu, u64 multiplier);
> > 
> >          /*
> >           * Retrieve somewhat arbitrary exit information. Intended to be used
> > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> > index 8dfb2513b72a..cb701b42b08b 100644
> > --- a/arch/x86/kvm/svm/svm.c
> > +++ b/arch/x86/kvm/svm/svm.c
> > @@ -1103,6 +1103,11 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
> >          vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
> >  }
> > 
> > +static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
> > +{
> > +        wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
> > +}
> > +
> >  /* Evaluate instruction intercepts that depend on guest CPUID features. */
> >  static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
> >                                                struct vcpu_svm *svm)
> > @@ -4528,6 +4533,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
> >          .get_l2_tsc_offset = svm_get_l2_tsc_offset,
> >          .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier,
> >          .write_tsc_offset = svm_write_tsc_offset,
> > +        .write_tsc_multiplier = svm_write_tsc_multiplier,
> > 
> >          .load_mmu_pgd = svm_load_mmu_pgd,
> > 
> > diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> > index 6058a65a6ede..239154d3e4e7 100644
> > --- a/arch/x86/kvm/vmx/nested.c
> > +++ b/arch/x86/kvm/vmx/nested.c
> > @@ -2533,9 +2533,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
> >          }
> > 
> >          vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
> > -
> >          if (kvm_has_tsc_control)
> > -                decache_tsc_multiplier(vmx);
> > +                vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
> 
> We still end up writing TSC_MULTIPLIER in the vmcs02 on each nested VM
> entry, almost always for nothing, since for the vast majority of entries
> we will write the same value.
> 

Yes, I already addressed this, but as per Sean's response it should be
alright for now and can be "fixed" in a separate series. This patch
completely removes the vmwrite from the vmcs01 load path, which is called
more frequently anyway.

> It is probably OK for now to leave it like that, and then add some sort
> of 'dirty' tracking to track if userspace or L1 changed the TSC
> multiplier for L2 (L1 writes to vmcb12 are tracked by using the
> 'dirty_vmcs' flag, assuming we don't shadow the TSC_MULTIPLIER field).
> 
> So the above should later go to prepare_vmcs02_rare, and it should also
> be done if the host TSC multiplier changed (not a problem IMHO to have
> another piece of code doing that).
> 
> > 
> >          nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);
> > 
> > @@ -4501,12 +4500,12 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
> >          vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
> >          vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
> >          vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
> > +        if (kvm_has_tsc_control)
> > +                vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
> 
> This I think isn't needed, since this write is done after we have already
> switched to vmcs01, and it should have the L1 value it had prior to the
> nested entry.
> 

What if userspace changed L1's multiplier while L2 was active?
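
To spell out the (hypothetical) sequence I have in mind, using the helpers
introduced in this series:

        /* L2 is running, i.e. vmcs02 is the loaded VMCS */
        ioctl(KVM_SET_TSC_KHZ)                  /* userspace rescales L1 */
          kvm_set_tsc_khz()
            kvm_vcpu_write_tsc_multiplier(vcpu, new_l1_ratio)
              /*
               * is_guest_mode(vcpu) is true here, so tsc_scaling_ratio is
               * recomputed for L2 and the vmwrite below lands in vmcs02,
               * not vmcs01.
               */
              static_call(kvm_x86_write_tsc_multiplier)(vcpu, ...);
        ...
        nested_vmx_vmexit()
          /*
           * vmcs01 is loaded again, but without the hunk above its
           * TSC_MULTIPLIER field would still hold the stale ratio from
           * before the KVM_SET_TSC_KHZ call.
           */
          vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);

(new_l1_ratio above just stands for whatever ratio set_tsc_khz() computes.)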
> > +
> >          if (vmx->nested.l1_tpr_threshold != -1)
> >                  vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold);
> > 
> > -        if (kvm_has_tsc_control)
> > -                decache_tsc_multiplier(vmx);
> > -
> >          if (vmx->nested.change_vmcs01_virtual_apic_mode) {
> >                  vmx->nested.change_vmcs01_virtual_apic_mode = false;
> >                  vmx_set_virtual_apic_mode(vcpu);
> > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> > index 4b70431c2edd..bf845a08995e 100644
> > --- a/arch/x86/kvm/vmx/vmx.c
> > +++ b/arch/x86/kvm/vmx/vmx.c
> > @@ -1390,11 +1390,6 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
> > 
> >                  vmx->loaded_vmcs->cpu = cpu;
> >          }
> > -
> > -        /* Setup TSC multiplier */
> > -        if (kvm_has_tsc_control &&
> > -            vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
> > -                decache_tsc_multiplier(vmx);
> >  }
> > 
> >  /*
> > @@ -1813,6 +1808,11 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
> >          vmcs_write64(TSC_OFFSET, offset);
> >  }
> > 
> > +static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
> > +{
> > +        vmcs_write64(TSC_MULTIPLIER, multiplier);
> > +}
> > +
> >  /*
> >   * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
> >   * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
> > @@ -7707,6 +7707,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
> >          .get_l2_tsc_offset = vmx_get_l2_tsc_offset,
> >          .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
> >          .write_tsc_offset = vmx_write_tsc_offset,
> > +        .write_tsc_multiplier = vmx_write_tsc_multiplier,
> > 
> >          .load_mmu_pgd = vmx_load_mmu_pgd,
> > 
> > diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
> > index aa97c82e3451..3eaa86a0ba3e 100644
> > --- a/arch/x86/kvm/vmx/vmx.h
> > +++ b/arch/x86/kvm/vmx/vmx.h
> > @@ -322,8 +322,6 @@ struct vcpu_vmx {
> >          /* apic deadline value in host tsc */
> >          u64 hv_deadline_tsc;
> > 
> > -        u64 current_tsc_ratio;
> > -
> >          unsigned long host_debugctlmsr;
> > 
> >          /*
> > @@ -532,12 +530,6 @@ static inline struct vmcs *alloc_vmcs(bool shadow)
> >                                GFP_KERNEL_ACCOUNT);
> >  }
> > 
> > -static inline void decache_tsc_multiplier(struct vcpu_vmx *vmx)
> > -{
> > -        vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio;
> > -        vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
> > -}
> > -
> >  static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx)
> >  {
> >          return vmx->secondary_exec_control &
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 801fa1e8e915..c1e14dadad2d 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -2179,14 +2179,15 @@ static u32 adjust_tsc_khz(u32 khz, s32 ppm)
> >          return v;
> >  }
> > 
> > +static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier);
> > +
> >  static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
> >  {
> >          u64 ratio;
> > 
> >          /* Guest TSC same frequency as host TSC? */
> >          if (!scale) {
> > -                vcpu->arch.l1_tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
> > -                vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
> > +                kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio);
> >                  return 0;
> >          }
> > 
> > @@ -2212,7 +2213,7 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
> >                  return -1;
> >          }
> > 
> > -        vcpu->arch.l1_tsc_scaling_ratio = vcpu->arch.tsc_scaling_ratio = ratio;
> > +        kvm_vcpu_write_tsc_multiplier(vcpu, ratio);
> >          return 0;
> >  }
> > 
> > @@ -2224,8 +2225,7 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
> >          /* tsc_khz can be zero if TSC calibration fails */
> >          if (user_tsc_khz == 0) {
> >                  /* set tsc_scaling_ratio to a safe value */
> > -                vcpu->arch.l1_tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
> > -                vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
> > +                kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio);
> >                  return -1;
> >          }
> > 
> > @@ -2383,6 +2383,23 @@ static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset)
> >          static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset);
> >  }
> > 
> > +static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier)
> > +{
> > +        vcpu->arch.l1_tsc_scaling_ratio = l1_multiplier;
> > +
> > +        /* Userspace is changing the multiplier while L2 is active */
> 
> Nitpick about the comment:
> On SVM, the TSC multiplier is an MSR, so a crazy L1 can give L2 access to
> it, meaning L2 could in theory change its own TSC multiplier as well
> (I am not sure if this is even allowed by the SVM spec).
> 

If L1 chooses not to trap a WRMSR to the multiplier, this should still
change L1's multiplier, not L2's, right? It's the exact same case we have
in kvm_vcpu_write_tsc_offset(), and the comment there addresses exactly
this.

> > +        if (is_guest_mode(vcpu))
> > +                vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
> > +                        l1_multiplier,
> > +                        static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
> > +        else
> > +                vcpu->arch.tsc_scaling_ratio = l1_multiplier;
> > +
> > +        if (kvm_has_tsc_control)
> > +                static_call(kvm_x86_write_tsc_multiplier)(
> > +                        vcpu, vcpu->arch.tsc_scaling_ratio);
> > +}
> > +
> >  static inline bool kvm_check_tsc_unstable(void)
> >  {
> >  #ifdef CONFIG_X86_64
> > @@ -10343,8 +10360,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
> >          else
> >                  vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
> > 
> > -        kvm_set_tsc_khz(vcpu, max_tsc_khz);
> > -
> >          r = kvm_mmu_create(vcpu);
> >          if (r < 0)
> >                  return r;
> > @@ -10443,6 +10458,7 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
> >          if (mutex_lock_killable(&vcpu->mutex))
> >                  return;
> >          vcpu_load(vcpu);
> > +        kvm_set_tsc_khz(vcpu, max_tsc_khz);
> >          kvm_synchronize_tsc(vcpu, 0);
> >          vcpu_put(vcpu);
> > 
> 
> Best regards,
>         Maxim Levitsky
> 
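
P.S. For completeness: while L2 runs, the effective multiplier is the
fixed-point product of the two levels. Roughly paraphrasing
kvm_calc_nested_tsc_multiplier() from earlier in the series (just for
reference, not new code):

        /*
         * Both multipliers carry kvm_tsc_scaling_ratio_frac_bits
         * fractional bits, so the wide product is shifted back down.
         */
        effective = mul_u64_u64_shr(l1_multiplier, l2_multiplier,
                                    kvm_tsc_scaling_ratio_frac_bits);

which is why kvm_vcpu_write_tsc_multiplier() recomputes
vcpu->arch.tsc_scaling_ratio under is_guest_mode(vcpu) instead of
programming l1_multiplier directly.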