On 08/11/18 01:24, Jim Mattson wrote: > On Tue, Nov 6, 2018 at 2:14 AM, Liran Alon <liran.alon@xxxxxxxxxx> wrote: >> From: Leonid Shatz <leonid.shatz@xxxxxxxxxx> >> >> Since commit e79f245ddec1 ("X86/KVM: Properly update 'tsc_offset' to >> represent the running guest"), vcpu->arch.tsc_offset meaning was >> changed to always reflect the tsc_offset value set on active VMCS. >> Regardless if vCPU is currently running L1 or L2. >> >> However, above mentioned commit failed to also change >> kvm_vcpu_write_tsc_offset() to set vcpu->arch.tsc_offset correctly. >> This is because vmx_write_tsc_offset() could set the tsc_offset value >> in active VMCS to given offset parameter *plus vmcs12->tsc_offset*. >> However, kvm_vcpu_write_tsc_offset() just sets vcpu->arch.tsc_offset >> to given offset parameter. Without taking into account the possible >> addition of vmcs12->tsc_offset. (Same is true for SVM case). >> >> Fix this issue by changing kvm_x86_ops->write_tsc_offset() to return >> actually set tsc_offset in active VMCS and modify >> kvm_vcpu_write_tsc_offset() to set returned value in >> vcpu->arch.tsc_offset. >> In addition, rename write_tsc_offset() callback to write_l1_tsc_offset() >> to make it clear that it is meant to set L1 TSC offset. > > I think write_l1_tsc_offset() is misleading, since the TSC offset > that's actually written in guest mode is L2's TSC offset. Perhaps it > would be more clear to simply rename the argument from 'offset' to > 'l1_tsc_offset'? > >> Fixes: e79f245ddec1 ("X86/KVM: Properly update 'tsc_offset' to represent the running guest") >> >> Reviewed-by: Liran Alon <liran.alon@xxxxxxxxxx> >> Reviewed-by: Mihai Carabas <mihai.carabas@xxxxxxxxxx> >> Reviewed-by: Krish Sadhukhan <krish.sadhukhan@xxxxxxxxxx> >> Signed-off-by: Leonid Shatz <leonid.shatz@xxxxxxxxxx> >> --- >> arch/x86/include/asm/kvm_host.h | 3 ++- >> arch/x86/kvm/svm.c | 5 +++-- >> arch/x86/kvm/vmx.c | 18 +++++++++--------- >> arch/x86/kvm/x86.c | 6 +++--- >> 4 files changed, 17 insertions(+), 15 deletions(-) >> >> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h >> index 55e51ff7e421..fbda5a917c5b 100644 >> --- a/arch/x86/include/asm/kvm_host.h >> +++ b/arch/x86/include/asm/kvm_host.h >> @@ -1094,7 +1094,8 @@ struct kvm_x86_ops { >> bool (*has_wbinvd_exit)(void); >> >> u64 (*read_l1_tsc_offset)(struct kvm_vcpu *vcpu); >> - void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); >> + /* Returns actual tsc_offset set in active VMCS */ > > 'VMCS' is Intel-centric. Perhaps 'Returns the actual TSC offset set > for the active guest'? > >> + u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); >> >> void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); >> >> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c >> index 0e21ccc46792..db788dc5f1e7 100644 >> --- a/arch/x86/kvm/svm.c >> +++ b/arch/x86/kvm/svm.c >> @@ -1446,7 +1446,7 @@ static u64 svm_read_l1_tsc_offset(struct kvm_vcpu *vcpu) >> return vcpu->arch.tsc_offset; >> } >> >> -static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) >> +static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) >> { >> struct vcpu_svm *svm = to_svm(vcpu); >> u64 g_tsc_offset = 0; >> @@ -1464,6 +1464,7 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) >> svm->vmcb->control.tsc_offset = offset + g_tsc_offset; >> >> mark_dirty(svm->vmcb, VMCB_INTERCEPTS); >> + return svm->vmcb->control.tsc_offset; >> } >> >> static void avic_init_vmcb(struct vcpu_svm *svm) >> @@ -7149,7 +7150,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { >> .has_wbinvd_exit = svm_has_wbinvd_exit, >> >> .read_l1_tsc_offset = svm_read_l1_tsc_offset, >> - .write_tsc_offset = svm_write_tsc_offset, >> + .write_l1_tsc_offset = svm_write_l1_tsc_offset, >> >> .set_tdp_cr3 = set_tdp_cr3, >> >> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c >> index 4555077d69ce..59633175fe6c 100644 >> --- a/arch/x86/kvm/vmx.c >> +++ b/arch/x86/kvm/vmx.c >> @@ -3455,8 +3455,9 @@ static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu) >> /* >> * writes 'offset' into guest's timestamp counter offset register >> */ > > The comment above needs some clarification, since 'offset' is only > written to the guest's TSC offset field if L1 is active. When L2 is > active, a different value is calculated and written to the guest's TSC > offset field. I just removed the comment, it's more misleading than anything else. I don't like that much the new kvm_x86_ops name, but at least it's consistent with read_l1_tsc_offset. With some cleanup and making the trace event available for L2 writes too, I'm squashing this in: diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2085539149ed..f03c46c183d8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3464,27 +3464,26 @@ static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu) return vcpu->arch.tsc_offset; } -/* - * writes 'offset' into guest's timestamp counter offset register - */ static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + u64 old_offset = vcpu->arch.tsc_offset; u64 active_offset = offset; - if (is_guest_mode(vcpu)) { + + if (is_guest_mode(vcpu) && + (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)) { /* * We're here if L1 chose not to trap WRMSR to TSC. According * to the spec, this should set L1's TSC; The offset that L1 * set for L2 remains unchanged, and still needs to be added * to the newly set TSC to get L2's TSC. */ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING)) - active_offset += vmcs12->tsc_offset; - } else { - trace_kvm_write_tsc_offset(vcpu->vcpu_id, - vmcs_read64(TSC_OFFSET), offset); + old_offset -= vmcs12->tsc_offset; + active_offset += vmcs12->tsc_offset; } + trace_kvm_write_tsc_offset(vcpu->vcpu_id, old_offset, offset); vmcs_write64(TSC_OFFSET, active_offset); return active_offset; } Paolo >> -static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) >> +static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) >> { >> + u64 active_offset = offset; >> if (is_guest_mode(vcpu)) { >> /* >> * We're here if L1 chose not to trap WRMSR to TSC. According >> @@ -3464,17 +3465,16 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) >> * set for L2 remains unchanged, and still needs to be added >> * to the newly set TSC to get L2's TSC. >> */ >> - struct vmcs12 *vmcs12; >> - /* recalculate vmcs02.TSC_OFFSET: */ >> - vmcs12 = get_vmcs12(vcpu); >> - vmcs_write64(TSC_OFFSET, offset + >> - (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ? >> - vmcs12->tsc_offset : 0)); >> + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); >> + if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING)) >> + active_offset += vmcs12->tsc_offset; >> } else { >> trace_kvm_write_tsc_offset(vcpu->vcpu_id, >> vmcs_read64(TSC_OFFSET), offset); > > Why do we only trace for L1? > >> - vmcs_write64(TSC_OFFSET, offset); >> } >> + >> + vmcs_write64(TSC_OFFSET, active_offset); >> + return active_offset; >> }