On Wed, 2021-05-19 at 00:05 +0000, Sean Christopherson wrote:
> On Wed, May 12, 2021, Ilias Stamatis wrote:
> > The write_l1_tsc_offset() callback has a misleading name. It does not
> > set L1's TSC offset, it rather updates the current TSC offset which
> > might be different if a nested guest is executing. Additionally, both
> > the vmx and svm implementations use the same logic for calculating the
> > current TSC before writing it to hardware.
> 
> I don't disagree, but the current name has the advantage of clarifying (well,
> hinting) that the offset is L1's offset. That hint is lost in this refactoring.
> Maybe rename "u64 offset" to "u64 l1_tsc_offset"?
> 
> > This patch renames the function and moves the common logic to the
> 
> Use imperative mood instead of "This patch". From
> Documentation/process/submitting-patches.rst:
> 
>   Describe your changes in imperative mood, e.g. "make xyzzy do frotz"
>   instead of "[This patch] makes xyzzy do frotz" or "[I] changed xyzzy
>   to do frotz", as if you are giving orders to the codebase to change
>   its behaviour.
> 
> > caller. The vmx/svm-specific code now merely sets the given offset to
> > the corresponding hardware structure.
> > 
> > Signed-off-by: Ilias Stamatis <ilstam@xxxxxxxxxx>
> > ---
> >  arch/x86/include/asm/kvm-x86-ops.h |  2 +-
> >  arch/x86/include/asm/kvm_host.h    |  3 +--
> >  arch/x86/kvm/svm/svm.c             | 21 ++++-----------------
> >  arch/x86/kvm/vmx/vmx.c             | 23 +++--------------------
> >  arch/x86/kvm/x86.c                 | 17 ++++++++++++++++-
> >  5 files changed, 25 insertions(+), 41 deletions(-)
> > 
> > diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
> > index 2063616fba1c..029c9615378f 100644
> > --- a/arch/x86/include/asm/kvm-x86-ops.h
> > +++ b/arch/x86/include/asm/kvm-x86-ops.h
> > @@ -89,7 +89,7 @@ KVM_X86_OP(load_mmu_pgd)
> >  KVM_X86_OP_NULL(has_wbinvd_exit)
> >  KVM_X86_OP(get_l2_tsc_offset)
> >  KVM_X86_OP(get_l2_tsc_multiplier)
> > -KVM_X86_OP(write_l1_tsc_offset)
> > +KVM_X86_OP(write_tsc_offset)
> >  KVM_X86_OP(get_exit_info)
> >  KVM_X86_OP(check_intercept)
> >  KVM_X86_OP(handle_exit_irqoff)
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 57a25d8e8b0f..61cf201c001a 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -1307,8 +1307,7 @@ struct kvm_x86_ops {
> > 
> >          u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu);
> >          u64 (*get_l2_tsc_multiplier)(struct kvm_vcpu *vcpu);
> > -        /* Returns actual tsc_offset set in active VMCS */
> > -        u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
> > +        void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
> > 
> >          /*
> >           * Retrieve somewhat arbitrary exit information. Intended to be used
> > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> > index 679b2fc1a3f9..b18f60463073 100644
> > --- a/arch/x86/kvm/svm/svm.c
> > +++ b/arch/x86/kvm/svm/svm.c
> > @@ -1094,26 +1094,13 @@ static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
> >          return kvm_default_tsc_scaling_ratio;
> >  }
> > 
> > -static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
> > +static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
> >  {
> >          struct vcpu_svm *svm = to_svm(vcpu);
> > -        u64 g_tsc_offset = 0;
> > -
> > -        if (is_guest_mode(vcpu)) {
> > -                /* Write L1's TSC offset. */
> > -                g_tsc_offset = svm->vmcb->control.tsc_offset -
> > -                               svm->vmcb01.ptr->control.tsc_offset;
> > -                svm->vmcb01.ptr->control.tsc_offset = offset;
> > -        }
> > -
> > -        trace_kvm_write_tsc_offset(vcpu->vcpu_id,
> > -                                   svm->vmcb->control.tsc_offset - g_tsc_offset,
> > -                                   offset);
> > -
> > -        svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
> > 
> > +        svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
> > +        svm->vmcb->control.tsc_offset = offset;
> >          vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
> > -        return svm->vmcb->control.tsc_offset;
> >  }
> > 
> >  /* Evaluate instruction intercepts that depend on guest CPUID features. */
> > @@ -4540,7 +4527,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
> > 
> >          .get_l2_tsc_offset = svm_get_l2_tsc_offset,
> >          .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier,
> > -        .write_l1_tsc_offset = svm_write_l1_tsc_offset,
> > +        .write_tsc_offset = svm_write_tsc_offset,
> > 
> >          .load_mmu_pgd = svm_load_mmu_pgd,
> > 
> > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> > index 575e13bddda8..3c4eb14a1e86 100644
> > --- a/arch/x86/kvm/vmx/vmx.c
> > +++ b/arch/x86/kvm/vmx/vmx.c
> > @@ -1810,26 +1810,9 @@ static u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
> >          return multiplier;
> >  }
> > 
> > -static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
> > +static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
> >  {
> > -        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> > -        u64 g_tsc_offset = 0;
> > -
> > -        /*
> > -         * We're here if L1 chose not to trap WRMSR to TSC. According
> > -         * to the spec, this should set L1's TSC; The offset that L1
> > -         * set for L2 remains unchanged, and still needs to be added
> > -         * to the newly set TSC to get L2's TSC.
> > -         */
> > -        if (is_guest_mode(vcpu) &&
> > -            (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING))
> > -                g_tsc_offset = vmcs12->tsc_offset;
> > -
> > -        trace_kvm_write_tsc_offset(vcpu->vcpu_id,
> > -                                   vcpu->arch.tsc_offset - g_tsc_offset,
> > -                                   offset);
> > -        vmcs_write64(TSC_OFFSET, offset + g_tsc_offset);
> > -        return offset + g_tsc_offset;
> > +        vmcs_write64(TSC_OFFSET, offset);
> >  }
> > 
> >  /*
> > @@ -7725,7 +7708,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
> > 
> >          .get_l2_tsc_offset = vmx_get_l2_tsc_offset,
> >          .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
> > -        .write_l1_tsc_offset = vmx_write_l1_tsc_offset,
> > +        .write_tsc_offset = vmx_write_tsc_offset,
> > 
> >          .load_mmu_pgd = vmx_load_mmu_pgd,
> > 
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 1db6cfc2079f..f3ba1be4d5b9 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -2377,8 +2377,23 @@ EXPORT_SYMBOL_GPL(kvm_set_02_tsc_multiplier);
> > 
> >  static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
> >  {
> > +        trace_kvm_write_tsc_offset(vcpu->vcpu_id,
> > +                                   vcpu->arch.l1_tsc_offset,
> > +                                   offset);
> > +
> >          vcpu->arch.l1_tsc_offset = offset;
> > -        vcpu->arch.tsc_offset = static_call(kvm_x86_write_l1_tsc_offset)(vcpu, offset);
> > +        vcpu->arch.tsc_offset = offset;
> > +
> > +        if (is_guest_mode(vcpu)) {
> 
> Unnecessary curly braces.

Really? Are we supposed to have a 6-line body without brackets? I'm not
opposing, I'm just surprised that that's the coding standard.

> 
> > +                /*
> > +                 * We're here if L1 chose not to trap WRMSR to TSC and
> > +                 * according to the spec this should set L1's TSC (as opposed
> > +                 * to setting L1's offset for L2).
> > +                 */
> 
> While we're shuffling code, can we improve this comment? It works for the WRMSR
> case, but makes no sense in the context of host TSC adjustments. It's not at all
> clear to me that it's even correct or relevant in those cases.
> 

Do you suggest removing it completely, or how do you want it to read? I don't
mind deleting it.

> > +                kvm_set_02_tsc_offset(vcpu);
> 
> I really dislike that kvm_set_02_tsc_offset() consumes a bunch of variables
> _and_ sets arch.tsc_offset, e.g. it's not at all obvious that moving this call
> around will break L2. Even more bizarre is that arch.tsc_offset is conditionally
> consumed. Oh, and kvm_set_02_tsc_offset() is not idempotent since it can do a
> RMW on arch.tsc_offset.
> 
> The below static_call() dependency doesn't appear to be easily solved, but we
> can at least strongly hint that vcpu->arch.tsc_offset is set. For everything
> else, I think we can clean things up by doing this (with the vendor calls
> providing the L2 variables directly):
> 
> void kvm_set_l2_tsc_offset(struct kvm_vcpu *vcpu, u64 l2_offset,
>                            u64 l2_multiplier)
> {
>         u64 l1_offset = vcpu->arch.l1_tsc_offset;
> 
>         if (l2_multiplier != kvm_default_tsc_scaling_ratio)
>                 l2_offset += mul_s64_u64_shr((s64)l1_offset, l2_multiplier,
>                                              kvm_tsc_scaling_ratio_frac_bits);
> 
>         vcpu->arch.tsc_offset = l2_offset;
> }
> EXPORT_SYMBOL_GPL(kvm_set_l2_tsc_offset);
> 
> static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
> {
>         trace_kvm_write_tsc_offset(vcpu->vcpu_id,
>                                    vcpu->arch.l1_tsc_offset,
>                                    offset);
> 
>         vcpu->arch.l1_tsc_offset = offset;
> 
>         if (is_guest_mode(vcpu))
>                 kvm_set_l2_tsc_offset(vcpu,
>                                       static_call(kvm_x86_get_l2_tsc_offset)(vcpu),
>                                       static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
>         else
>                 vcpu->arch.tsc_offset = offset;
> 
>         static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset);
> }
> 

OK, I will change it to what you suggest above.

> 
> An alternative would be to explicitly track L1 and L2, and _never_ track the
> current TSC values. I.e. always compute the correct value on the fly. I think
> the only hot path is the TSC deadline timer, and AFAICT that always runs with
> L1's timer. Speaking of which, at the end of this series, vmx_set_hv_timer()
> uses L1's TSC but the current scaling ratio; that can't be right.
> 

You are right, good catch, I will add this to patch 2. I think we should leave
the vcpu->arch.tsc_scaling_ratio variable as is for now.

> > +        }
> > +
> > +        static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset);
> >  }
> > 
> >  static inline bool kvm_check_tsc_unstable(void)
> > --
> > 2.17.1
> > 
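For the archives, the mul_s64_u64_shr() line in the snippet quoted above falls
out of how KVM models a scaled TSC: guest TSC = scale(host TSC) + offset, so
applying L1's and L2's pairs back to back collapses into a single pair with
r02 = r01 * r12 and o02 = o01 * r12 + o12. A tiny user-space sketch of that
arithmetic, purely illustrative (8 fractional bits instead of the 48 used by
the hardware, unsigned offsets to keep it short):

#include <stdint.h>
#include <stdio.h>

#define FRAC_BITS 8     /* stand-in for kvm_tsc_scaling_ratio_frac_bits */

/* Fixed-point multiply, the toy equivalent of scaling a TSC by a ratio. */
static uint64_t scale(uint64_t val, uint64_t ratio)
{
        return (val * ratio) >> FRAC_BITS;
}

int main(void)
{
        uint64_t host_tsc = 1000000;
        uint64_t r01 = 3 << FRAC_BITS, r12 = 2 << FRAC_BITS;  /* L1, L2 ratios */
        uint64_t o01 = 500, o12 = 700;                         /* L1, L2 offsets */

        /* Applying the two levels one after the other... */
        uint64_t l1_tsc = scale(host_tsc, r01) + o01;
        uint64_t l2_tsc = scale(l1_tsc, r12) + o12;

        /* ...equals a single combined ratio/offset pair, which is what ends
         * up programmed into the hardware while L2 runs. */
        uint64_t r02 = scale(r01, r12);
        uint64_t o02 = scale(o01, r12) + o12;

        printf("%llu == %llu\n", (unsigned long long)l2_tsc,
               (unsigned long long)(scale(host_tsc, r02) + o02));
        return 0;
}

(The real code needs mul_u64_u64_shr()/mul_s64_u64_shr() because the 64x64
products don't fit in 64 bits and the offsets are signed.)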
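And one note to make the vmx_set_hv_timer() observation concrete for myself:
the deadline delta there is computed in L1 TSC cycles, so converting it back
to host cycles has to divide out L1's scaling ratio rather than the current
(possibly L2) one. Roughly, and only as a sketch of the intended conversion,
not the actual kernel code (the real code needs a wider intermediate since
the shifted delta can overflow 64 bits):

        /*
         * guest_tsc = scale(host_tsc) + offset, so a delta expressed in L1
         * cycles maps back to host cycles by dividing by L1's ratio. Using
         * the current ratio instead gives a wrong host delta whenever L2
         * runs with a different multiplier.
         */
        host_delta = (l1_delta << frac_bits) / l1_ratio;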