On 09/09/2016 00:13, David Matlack wrote: > Hi Paolo, > > On Tue, Sep 6, 2016 at 3:29 PM, Paolo Bonzini <pbonzini@xxxxxxxxxx> wrote: >> Bad things happen if a guest using the TSC deadline timer is migrated. >> The guest doesn't re-calibrate the TSC after migration, and the >> TSC frequency can and will change unless your processor supports TSC >> scaling (on Intel this is only Skylake) or your data center is perfectly >> homogeneous. > > Sorry, I forgot to follow up on our discussion in v1. One thing we > discussed there was using the APIC Timer to workaround a changing TSC > rate. You pointed out KVM's TSC deadline timer got a nice performance > boost recently, which makes it preferable. Does it makes sense to > apply the same optimization (using the VMX preemption timer) to the > APIC Timer, instead of creating a new dependency on kvmclock? Hi, yes it does. If we go that way kvmclock.c should be patched to blacklist the TSC deadline timer. However, I won't have time to work on it anytime soon, so _if I get reviews_ I'll take this patch first. Thanks, Paolo >> >> The solution in this patch is to skip tsc_khz, and instead derive the >> frequency from kvmclock's (mult, shift) pair. Because kvmclock >> parameters convert from tsc to nanoseconds, this needs a division >> but that's the least of our problems when the TSC_DEADLINE_MSR write >> costs 2000 clock cycles. Luckily tsc_khz is really used by very little >> outside the tsc clocksource (which kvmclock replaces) and the TSC >> deadline timer. Because KVM's local APIC doesn't need quirks, we >> provide a paravirt clockevent that still uses the deadline timer >> under the hood (as suggested by Andy Lutomirski). >> >> This patch does not handle the very first deadline, hoping that it >> is masked by the migration downtime (i.e. that the timer fires late >> anyway). >> >> Signed-off-by: Paolo Bonzini <pbonzini@xxxxxxxxxx> >> --- >> arch/x86/include/asm/apic.h | 1 + >> arch/x86/kernel/apic/apic.c | 2 +- >> arch/x86/kernel/kvmclock.c | 156 ++++++++++++++++++++++++++++++++++++++++++++ >> 3 files changed, 158 insertions(+), 1 deletion(-) >> >> diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h >> index f6e0bad1cde2..c88b0dcfdf3a 100644 >> --- a/arch/x86/include/asm/apic.h >> +++ b/arch/x86/include/asm/apic.h >> @@ -53,6 +53,7 @@ extern unsigned int apic_verbosity; >> extern int local_apic_timer_c2_ok; >> >> extern int disable_apic; >> +extern int disable_apic_timer; >> extern unsigned int lapic_timer_frequency; >> >> #ifdef CONFIG_SMP >> diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c >> index 5b63bec7d0af..d0c6d1e3d627 100644 >> --- a/arch/x86/kernel/apic/apic.c >> +++ b/arch/x86/kernel/apic/apic.c >> @@ -169,7 +169,7 @@ __setup("apicpmtimer", setup_apicpmtimer); >> unsigned long mp_lapic_addr; >> int disable_apic; >> /* Disable local APIC timer from the kernel commandline or via dmi quirk */ >> -static int disable_apic_timer __initdata; >> +int disable_apic_timer __initdata; >> /* Local APIC timer works in C2 */ >> int local_apic_timer_c2_ok; >> EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); >> diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c >> index 1d39bfbd26bb..365fa6494dd3 100644 >> --- a/arch/x86/kernel/kvmclock.c >> +++ b/arch/x86/kernel/kvmclock.c >> @@ -17,6 +17,7 @@ >> */ >> >> #include <linux/clocksource.h> >> +#include <linux/clockchips.h> >> #include <linux/kvm_para.h> >> #include <asm/pvclock.h> >> #include <asm/msr.h> >> @@ -245,6 +246,155 @@ static void kvm_shutdown(void) >> native_machine_shutdown(); >> } >> >> +#ifdef CONFIG_X86_LOCAL_APIC >> +/* >> + * kvmclock-based clock event implementation, used only together with the >> + * TSC deadline timer. A subset of the normal LAPIC clockevent, but it >> + * uses kvmclock to convert nanoseconds to TSC. This is necessary to >> + * handle changes to the TSC frequency, e.g. from live migration. >> + */ >> + >> +static void kvmclock_lapic_timer_setup(unsigned lvtt_value) >> +{ >> + lvtt_value |= LOCAL_TIMER_VECTOR | APIC_LVT_TIMER_TSCDEADLINE; >> + apic_write(APIC_LVTT, lvtt_value); >> +} >> + >> +static int kvmclock_lapic_timer_set_oneshot(struct clock_event_device *evt) >> +{ >> + kvmclock_lapic_timer_setup(0); >> + printk_once(KERN_DEBUG "kvmclock: TSC deadline timer enabled\n"); >> + >> + /* >> + * See Intel SDM: TSC-Deadline Mode chapter. In xAPIC mode, >> + * writing to the APIC LVTT and TSC_DEADLINE MSR isn't serialized. >> + * According to Intel, MFENCE can do the serialization here. >> + */ >> + asm volatile("mfence" : : : "memory"); >> + return 0; >> +} >> + >> +static int kvmclock_lapic_timer_stop(struct clock_event_device *evt) >> +{ >> + kvmclock_lapic_timer_setup(APIC_LVT_MASKED); >> + wrmsrl(MSR_IA32_TSC_DEADLINE, -1); >> + return 0; >> +} >> + >> +/* >> + * We already have the inverse of the (mult,shift) pair, though this means >> + * we need a division. To avoid it we could compute a multiplicative inverse >> + * every time src->version changes. >> + */ >> +#define KVMCLOCK_TSC_DEADLINE_MAX_BITS 38 >> +#define KVMCLOCK_TSC_DEADLINE_MAX ((1ull << KVMCLOCK_TSC_DEADLINE_MAX_BITS) - 1) >> + >> +static int kvmclock_lapic_next_ktime(ktime_t expires, >> + struct clock_event_device *evt) >> +{ >> + u64 ns, tsc; >> + u32 version; >> + int cpu; >> + struct pvclock_vcpu_time_info *src; >> + >> + cpu = smp_processor_id(); >> + src = &hv_clock[cpu].pvti; >> + ns = ktime_to_ns(expires); >> + >> + do { >> + u64 delta_ns; >> + int shift; >> + >> + version = pvclock_read_begin(src); >> + if (unlikely(ns < src->system_time)) { >> + tsc = src->tsc_timestamp; >> + virt_rmb(); >> + continue; >> + } >> + >> + delta_ns = ns - src->system_time; >> + >> + /* Cap the wait to avoid overflow. */ >> + if (unlikely(delta_ns > KVMCLOCK_TSC_DEADLINE_MAX)) >> + delta_ns = KVMCLOCK_TSC_DEADLINE_MAX; >> + >> + /* >> + * delta_tsc = delta_ns << (32-tsc_shift) / tsc_to_system_mul. >> + * The shift is split in two steps so that a 38 bits (275 s) >> + * deadline fits into the 64-bit dividend. >> + */ >> + shift = 32 - src->tsc_shift; >> + >> + /* First shift step... */ >> + delta_ns <<= 64 - KVMCLOCK_TSC_DEADLINE_MAX_BITS; >> + shift -= 64 - KVMCLOCK_TSC_DEADLINE_MAX_BITS; >> + >> + /* ... division... */ >> + tsc = div_u64(delta_ns, src->tsc_to_system_mul); >> + >> + /* ... and second shift step for the remaining bits. */ >> + if (shift >= 0) >> + tsc <<= shift; >> + else >> + tsc >>= -shift; >> + >> + tsc += src->tsc_timestamp; >> + } while (pvclock_read_retry(src, version)); >> + >> + wrmsrl(MSR_IA32_TSC_DEADLINE, tsc); >> + return 0; >> +} >> + >> +/* >> + * The local apic timer can be used for any function which is CPU local. >> + */ >> +static struct clock_event_device kvm_clockevent = { >> + .name = "lapic", > > Should we encode "kvm" or "kvmclock" in the name? I'm not sure how > this name gets used, but it might be nice to distinguish it from the > native TSC deadline timer clock_event_device. > >> + /* Under KVM the LAPIC timer always runs in deep C-states. */ >> + .features = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_KTIME, >> + .set_state_shutdown = kvmclock_lapic_timer_stop, >> + .set_state_oneshot = kvmclock_lapic_timer_set_oneshot, >> + .set_next_ktime = kvmclock_lapic_next_ktime, >> + .mult = 1, >> + /* Make LAPIC timer preferrable over percpu HPET */ >> + .rating = 150, >> + .irq = -1, >> +}; >> +static DEFINE_PER_CPU(struct clock_event_device, kvm_events); >> + >> +static void kvmclock_local_apic_timer_interrupt(void) >> +{ >> + int cpu = smp_processor_id(); >> + struct clock_event_device *evt = &per_cpu(kvm_events, cpu); >> + >> + /* >> + * Defer to the native clockevent if ours hasn't been setup yet. >> + */ >> + if (!evt->event_handler) { >> + native_local_apic_timer_interrupt(); >> + return; >> + } >> + >> + inc_irq_stat(apic_timer_irqs); >> + evt->event_handler(evt); >> +} >> + >> +/* >> + * Setup the local APIC timer for this CPU. Copy the initialized values >> + * of the boot CPU and register the clock event in the framework. >> + */ >> +static void setup_kvmclock_timer(void) >> +{ >> + struct clock_event_device *evt = this_cpu_ptr(&kvm_events); >> + >> + kvmclock_lapic_timer_stop(evt); >> + >> + memcpy(evt, &kvm_clockevent, sizeof(*evt)); >> + evt->cpumask = cpumask_of(smp_processor_id()); >> + clockevents_register_device(evt); >> +} >> +#endif >> + >> void __init kvmclock_init(void) >> { >> struct pvclock_vcpu_time_info *vcpu_time; >> @@ -292,6 +442,12 @@ void __init kvmclock_init(void) >> x86_platform.get_wallclock = kvm_get_wallclock; >> x86_platform.set_wallclock = kvm_set_wallclock; >> #ifdef CONFIG_X86_LOCAL_APIC >> + if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER) && >> + !disable_apic && !disable_apic_timer) { > > Request that this be a hypervisor-controllable feature. e.g. we could > add a new bit to KVM's CPUID leaf to indicate kvmclock is the > definitive source of TSC rate. > >> + pv_time_ops.local_apic_timer_interrupt = kvmclock_local_apic_timer_interrupt; >> + x86_init.timers.setup_percpu_clockev = setup_kvmclock_timer; >> + x86_cpuinit.setup_percpu_clockev = setup_kvmclock_timer; >> + } >> x86_cpuinit.early_percpu_clock_init = >> kvm_setup_secondary_clock; >> #endif >> -- >> 1.8.3.1 >> -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html