On Tue, Jun 21, 2011 at 05:23:01AM -0700, Zachary Amsden wrote: > > > -------- Original Message -------- > Subject: [KVM TSC emulation 9/9] Add software TSC emulation > Date: Mon, 20 Jun 2011 16:59:37 -0700 > From: Zachary Amsden <zamsden@xxxxxxxxxx> > To: Avi Kivity <avi@xxxxxxxxxx>, Marcelo Tosatti > <mtosatti@xxxxxxxxxx>, Glauber Costa <glommer@xxxxxxxxxx>, Frank > Arnold <farnold@xxxxxxxxxx>, Joerg Roedel <joerg.roedel@xxxxxxx>, > Jan Kiszka <jan.kiszka@xxxxxxxxxxx>, linux-kvm@xxxxxxxxxxxxxxx, > linux-kernel@xxxxxxxxxxxxxxx, Zachary Amsden <zamsden@xxxxxxxxx>, > Avi Kivity <avi@xxxxxxxxxx>, Marcelo Tosatti <mtosatti@xxxxxxxxxx>, > Glauber Costa <glommer@xxxxxxxxxx>, Frank Arnold > <farnold@xxxxxxxxxx>, Joerg Roedel <joerg.roedel@xxxxxxx>, Jan > Kiszka <jan.kiszka@xxxxxxxxxxx>, linux-kvm@xxxxxxxxxxxxxxx > CC: Zachary Amsden <zamsden@xxxxxxxxxx>, Zachary Amsden > <zamsden@xxxxxxxxx> > > > > When hardware assistance is unavailable to scale the TSC, or it is > not possible to keep in sync, add a software virtualization mode > where the TSC is trapped and thus guaranteed to always have perfect > synchronization. > > Currently this behavior defaults to on; how and when the decision to > use trapping is made is likely to be a matter of debate. For now, > just make it possible. How is management software supposed to use these interfaces? Does it allow migration to hosts with a slower TSC, or does it enable TSC trapping instead, etc.? It would be good to have a document with guidelines; otherwise, I believe the interface will not be used. The patchset looks good, except for the fact that there is no clear picture for management of how to use this. 
> Signed-off-by: Zachary Amsden<zamsden@xxxxxxxxxx> > --- > arch/x86/kvm/svm.c | 26 +++++++++++++++++++++++++- > arch/x86/kvm/vmx.c | 28 +++++++++++++++++++++++++++- > arch/x86/kvm/x86.c | 34 +++++++++++++++++++++++----------- > arch/x86/kvm/x86.h | 5 +++++ > 4 files changed, 80 insertions(+), 13 deletions(-) > > diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c > index dcab00e..fc4583d 100644 > --- a/arch/x86/kvm/svm.c > +++ b/arch/x86/kvm/svm.c > @@ -185,6 +185,7 @@ module_param(nested, int, S_IRUGO); > > static void svm_flush_tlb(struct kvm_vcpu *vcpu); > static void svm_complete_interrupts(struct vcpu_svm *svm); > +static void svm_set_tsc_trapping(struct kvm_vcpu *vcpu, bool trap); > > static int nested_svm_exit_handled(struct vcpu_svm *svm); > static int nested_svm_intercept(struct vcpu_svm *svm); > @@ -912,13 +913,18 @@ static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) > u64 khz; > > /* Guest TSC same frequency as host TSC? */ > - if (!scale) { > + if (!scale&& !check_tsc_unstable()) { > svm->tsc_ratio = TSC_RATIO_DEFAULT; > return; > } > > /* TSC scaling supported? */ > if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { > + if (kvm_software_tsc) { > + pr_debug("kvm: using TSC trapping\n"); > + svm_set_tsc_trapping(vcpu, true); > + return; > + } > if (user_tsc_khz> tsc_khz) { > vcpu->arch.tsc_catchup = 1; > vcpu->arch.tsc_always_catchup = 1; If TSC catchup can be used, there is no need to enable TSC trapping, right? 
> @@ -1184,6 +1190,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) > svm->vmcb_pa = page_to_pfn(page)<< PAGE_SHIFT; > svm->asid_generation = 0; > init_vmcb(svm); > + kvm_set_tsc_khz(&svm->vcpu, kvm_max_tsc_khz); > kvm_write_tsc(&svm->vcpu, 0); > > err = fx_init(&svm->vcpu); > @@ -1303,6 +1310,15 @@ static void svm_clear_vintr(struct vcpu_svm *svm) > clr_intercept(svm, INTERCEPT_VINTR); > } > > +static void svm_set_tsc_trapping(struct kvm_vcpu *vcpu, bool trap) > +{ > + struct vcpu_svm *svm = to_svm(vcpu); > + if (trap) > + set_intercept(svm, INTERCEPT_RDTSC); > + else > + clr_intercept(svm, INTERCEPT_RDTSC); > +} > + > static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) > { > struct vmcb_save_area *save =&to_svm(vcpu)->vmcb->save; > @@ -2732,6 +2748,13 @@ static int task_switch_interception(struct vcpu_svm *svm) > return 1; > } > > +static int rdtsc_interception(struct vcpu_svm *svm) > +{ > + svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; > + kvm_read_tsc(&svm->vcpu); > + return 1; > +} > + > static int cpuid_interception(struct vcpu_svm *svm) > { > svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; > @@ -3178,6 +3201,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { > [SVM_EXIT_SMI] = nop_on_interception, > [SVM_EXIT_INIT] = nop_on_interception, > [SVM_EXIT_VINTR] = interrupt_window_interception, > + [SVM_EXIT_RDTSC] = rdtsc_interception, > [SVM_EXIT_CPUID] = cpuid_interception, > [SVM_EXIT_IRET] = iret_interception, > [SVM_EXIT_INVD] = emulate_on_interception, > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > index 780fe12..65066b4 100644 > --- a/arch/x86/kvm/vmx.c > +++ b/arch/x86/kvm/vmx.c > @@ -606,6 +606,7 @@ static void kvm_cpu_vmxon(u64 addr); > static void kvm_cpu_vmxoff(void); > static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); > static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); > +static void vmx_set_tsc_trapping(struct kvm_vcpu *vcpu, bool trap); > > static 
DEFINE_PER_CPU(struct vmcs *, vmxarea); > static DEFINE_PER_CPU(struct vmcs *, current_vmcs); > @@ -1756,9 +1757,14 @@ static u64 guest_read_tsc(void) > */ > static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) > { > - if (!scale) > + if (!scale&& !check_tsc_unstable()) > return; > > + if (kvm_software_tsc) { > + pr_debug("kvm: using TSC trapping\n"); > + vmx_set_tsc_trapping(vcpu, true); > + return; > + } > if (user_tsc_khz> tsc_khz) { > vcpu->arch.tsc_catchup = 1; > vcpu->arch.tsc_always_catchup = 1; > @@ -3695,6 +3701,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) > vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); > set_cr4_guest_host_mask(vmx); > > + kvm_set_tsc_khz(&vmx->vcpu, kvm_max_tsc_khz); > kvm_write_tsc(&vmx->vcpu, 0); > > return 0; > @@ -3997,6 +4004,18 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) > return 0; > } > > +static void vmx_set_tsc_trapping(struct kvm_vcpu *vcpu, bool trap) > +{ > + u32 cpu_based_vm_exec_control; > + > + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); > + if (trap) > + cpu_based_vm_exec_control |= CPU_BASED_RDTSC_EXITING; > + else > + cpu_based_vm_exec_control&= ~CPU_BASED_RDTSC_EXITING; > + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); > +} > + > static int handle_rmode_exception(struct kvm_vcpu *vcpu, > int vec, u32 err_code) > { > @@ -4497,6 +4516,12 @@ static int handle_invlpg(struct kvm_vcpu *vcpu) > return 1; > } > > +static int handle_rdtsc(struct kvm_vcpu *vcpu) > +{ > + kvm_read_tsc(vcpu); > + return 1; > +} > + > static int handle_wbinvd(struct kvm_vcpu *vcpu) > { > skip_emulated_instruction(vcpu); > @@ -5421,6 +5446,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { > [EXIT_REASON_HLT] = handle_halt, > [EXIT_REASON_INVD] = handle_invd, > [EXIT_REASON_INVLPG] = handle_invlpg, > + [EXIT_REASON_RDTSC] = handle_rdtsc, > [EXIT_REASON_VMCALL] = handle_vmcall, > [EXIT_REASON_VMCLEAR] = handle_vmclear, > 
[EXIT_REASON_VMLAUNCH] = handle_vmlaunch, > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index 09e67fb..1a07796 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -99,6 +99,10 @@ EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); > static u32 tsc_tolerance_ppm = 250; > module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); > > +int kvm_software_tsc = 1; > +module_param_named(software_tsc_emulation, kvm_software_tsc, bool, 0644); > +EXPORT_SYMBOL_GPL(kvm_software_tsc); > + > #define KVM_NR_SHARED_MSRS 16 > > struct kvm_shared_msrs_global { > @@ -993,7 +997,8 @@ static inline u64 get_kernel_ns(void) > } > > static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); > -unsigned long max_tsc_khz; > +unsigned long kvm_max_tsc_khz; > +EXPORT_SYMBOL_GPL(kvm_max_tsc_khz); > > static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) > { > @@ -1001,7 +1006,7 @@ static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) > vcpu->arch.virtual_tsc_shift); > } > > -static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) > +void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) > { > u32 thresh_lo, thresh_hi; > int use_scaling = 0; > @@ -1026,6 +1031,7 @@ static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) > } > kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling); > } > +EXPORT_SYMBOL_GPL(kvm_set_tsc_khz); > > static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) > { > @@ -1117,6 +1123,18 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) > > EXPORT_SYMBOL_GPL(kvm_write_tsc); > > +void kvm_read_tsc(struct kvm_vcpu *vcpu) > +{ > + u64 tsc; > + s64 kernel_ns = get_kernel_ns(); > + > + tsc = compute_guest_tsc(vcpu, kernel_ns); > + kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)tsc); > + kvm_register_write(vcpu, VCPU_REGS_RDX, tsc>> 32); > + kvm_x86_ops->skip_emulated_instruction(vcpu); > +} > +EXPORT_SYMBOL_GPL(kvm_read_tsc); > + > static int kvm_guest_time_update(struct kvm_vcpu *v) > { > unsigned 
long flags; > @@ -4931,7 +4949,7 @@ static void kvm_timer_init(void) > { > int cpu; > > - max_tsc_khz = tsc_khz; > + kvm_max_tsc_khz = tsc_khz; > register_hotcpu_notifier(&kvmclock_cpu_notifier_block); > if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { > #ifdef CONFIG_CPU_FREQ > @@ -4940,13 +4958,13 @@ static void kvm_timer_init(void) > cpu = get_cpu(); > cpufreq_get_policy(&policy, cpu); > if (policy.cpuinfo.max_freq) > - max_tsc_khz = policy.cpuinfo.max_freq; > + kvm_max_tsc_khz = policy.cpuinfo.max_freq; > put_cpu(); > #endif > cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, > CPUFREQ_TRANSITION_NOTIFIER); > } > - pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz); > + pr_debug("kvm: max_tsc_khz = %ld\n", kvm_max_tsc_khz); > for_each_online_cpu(cpu) > smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); > } > @@ -6194,10 +6212,6 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) > struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, > unsigned int id) > { > - if (check_tsc_unstable()&& atomic_read(&kvm->online_vcpus) != 0) > - printk_once(KERN_WARNING > - "kvm: SMP vm created on host with unstable TSC; " > - "guest TSC will not be reliable\n"); > return kvm_x86_ops->vcpu_create(kvm, id); > } > > @@ -6385,8 +6399,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) > } > vcpu->arch.pio_data = page_address(page); > > - kvm_set_tsc_khz(vcpu, max_tsc_khz); > - > r = kvm_mmu_create(vcpu); > if (r< 0) > goto fail_free_pio_data; > diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h > index 256da82..94780df 100644 > --- a/arch/x86/kvm/x86.h > +++ b/arch/x86/kvm/x86.h > @@ -80,6 +80,10 @@ void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); > int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); > > void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data); > +void kvm_read_tsc(struct kvm_vcpu *vcpu); > +void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz); > +extern int kvm_software_tsc; > +extern unsigned long kvm_max_tsc_khz; > > 
int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, > gva_t addr, void *val, unsigned int bytes, > @@ -89,4 +93,5 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, > gva_t addr, void *val, unsigned int bytes, > struct x86_exception *exception); > > + > #endif > -- > 1.7.1 > > > -- > To unsubscribe from this list: send the line "unsubscribe kvm" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html