Bad things happen if a guest using the TSC deadline timer is migrated. The guest doesn't re-calibrate the TSC after migration, and the TSC frequency can and will change unless your processor supports TSC scaling (on Intel this is only Skylake) or your data center is perfectly homogeneous. The solution in this patch is to skip tsc_khz, and instead derive the frequency from kvmclock's (mult, shift) pair. Because kvmclock parameters convert from tsc to nanoseconds, this needs a division but that's the least of our problems when the TSC_DEADLINE_MSR write costs 2000 clock cycles. Luckily tsc_khz is really used by very little outside the tsc clocksource (which kvmclock replaces) and the TSC deadline timer. This patch does not handle the very first deadline, hoping that it is masked by the migration downtime (i.e. that the timer fires late anyway). I'd like a remark on the approach in general and ideas on how to handle the first deadline. It's also possible to just blacklist the TSC deadline timer of course, and it's probably the best thing to do for stable kernels. It would require extending to other modes the implementation of preemption-timer based APIC timer. It'd be a pity to lose the nice latency boost that the preemption timer offers. The patch is also quite ugly in the way it arranges for kvmclock to replace only a small part of setup_apic_timer; better ideas are welcome. Signed-off-by: Paolo Bonzini <pbonzini@xxxxxxxxxx> --- arch/x86/include/asm/apic.h | 2 ++ arch/x86/include/asm/x86_init.h | 5 +++ arch/x86/kernel/apic/apic.c | 15 +++++--- arch/x86/kernel/kvmclock.c | 78 +++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/x86_init.c | 1 + 5 files changed, 96 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index bc27611fa58f..cc4f45f66218 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -135,6 +135,7 @@ extern void init_apic_mappings(void); void register_lapic_address(unsigned long address); extern void setup_boot_APIC_clock(void); extern void setup_secondary_APIC_clock(void); +extern void setup_APIC_clockev(struct clock_event_device *levt); extern int APIC_init_uniprocessor(void); #ifdef CONFIG_X86_64 @@ -170,6 +171,7 @@ static inline void init_apic_mappings(void) { } static inline void disable_local_APIC(void) { } # define setup_boot_APIC_clock x86_init_noop # define setup_secondary_APIC_clock x86_init_noop +# define setup_APIC_clockev NULL #endif /* !CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_X2APIC diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 4dcdf74dfed8..d0f099ab4dba 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -7,6 +7,7 @@ struct mpc_bus; struct mpc_cpu; struct mpc_table; struct cpuinfo_x86; +struct clock_event_device; /** * struct x86_init_mpparse - platform specific mpparse ops @@ -84,11 +85,15 @@ struct x86_init_paging { * boot cpu * @timer_init: initialize the platform timer (default PIT/HPET) * @wallclock_init: init the wallclock device + * @setup_APIC_clockev: tweak the clock_event_device for the LAPIC timer, + * if setup_boot_APIC_clock and/or + * setup_secondary_APIC_clock are in use */ struct x86_init_timers { void (*setup_percpu_clockev)(void); void (*timer_init)(void); void (*wallclock_init)(void); + void (*setup_APIC_clockev)(struct clock_event_device *levt); }; /** diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 60078a67d7e3..b7a331f329d0 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -558,15 +558,20 @@ static void setup_APIC_timer(void) memcpy(levt, &lapic_clockevent, sizeof(*levt)); levt->cpumask = cpumask_of(smp_processor_id()); + x86_init.timers.setup_APIC_clockev(levt); + clockevents_register_device(levt); +} + +void setup_APIC_clockev(struct clock_event_device *levt) +{ if (this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) { levt->features &= ~(CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_DUMMY); + levt->min_delta_ticks = 0xF; + levt->max_delta_ticks = ~0UL; levt->set_next_event = lapic_next_deadline; - clockevents_config_and_register(levt, - (tsc_khz / TSC_DIVISOR) * 1000, - 0xF, ~0UL); - } else - clockevents_register_device(levt); + clockevents_config(levt, (tsc_khz / TSC_DIVISOR) * 1000); + } } /* diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 1d39bfbd26bb..61e094207339 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -17,6 +17,7 @@ */ #include <linux/clocksource.h> +#include <linux/clockchips.h> #include <linux/kvm_para.h> #include <asm/pvclock.h> #include <asm/msr.h> @@ -245,6 +246,82 @@ static void kvm_shutdown(void) native_machine_shutdown(); } +/* + * We already have the inverse of the (mult,shift) pair, though this means + * we need a division. To avoid it we could compute a multiplicative inverse + * every time src->version changes. + */ +#define KVMCLOCK_TSC_DEADLINE_MAX_BITS 38 +#define KVMCLOCK_TSC_DEADLINE_MAX ((1ull << KVMCLOCK_TSC_DEADLINE_MAX_BITS) - 1) + +static int lapic_next_ktime(ktime_t expires, + struct clock_event_device *evt) +{ + u64 ns, tsc; + u32 version; + int cpu; + struct pvclock_vcpu_time_info *src; + + cpu = smp_processor_id(); + src = &hv_clock[cpu].pvti; + ns = ktime_to_ns(expires); + do { + u64 delta_ns; + int shift; + + version = src->version; + virt_rmb(); + if (unlikely(ns < src->system_time)) { + tsc = src->tsc_timestamp; + virt_rmb(); + continue; + } + + delta_ns = ns - src->system_time; + + /* Cap the wait to avoid overflow. */ + if (unlikely(delta_ns > KVMCLOCK_TSC_DEADLINE_MAX)) + delta_ns = KVMCLOCK_TSC_DEADLINE_MAX; + + /* + * delta_tsc = delta_ns << (32-tsc_shift) / tsc_to_system_mul. + * The shift is split in two steps so that a 38 bits (275 s) + * deadline fits into the 64-bit dividend. + */ + shift = 32 - src->tsc_shift; + + /* First shift step... */ + delta_ns <<= 64 - KVMCLOCK_TSC_DEADLINE_MAX_BITS; + shift -= 64 - KVMCLOCK_TSC_DEADLINE_MAX_BITS; + + /* ... division... */ + tsc = div_u64(delta_ns, src->tsc_to_system_mul); + + /* ... and second shift step for the remaining bits. */ + if (shift >= 0) + tsc <<= shift; + else + tsc >>= -shift; + + tsc += src->tsc_timestamp; + virt_rmb(); + } while((src->version & 1) || version != src->version); + + wrmsrl(MSR_IA32_TSC_DEADLINE, tsc); + return 0; +} + + +static void kvm_setup_tsc_deadline_timer(struct clock_event_device *levt) +{ + if (this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) { + levt->features &= ~(CLOCK_EVT_FEAT_PERIODIC | + CLOCK_EVT_FEAT_DUMMY); + levt->features |= CLOCK_EVT_FEAT_KTIME; + levt->set_next_ktime = lapic_next_ktime; + } +} + void __init kvmclock_init(void) { struct pvclock_vcpu_time_info *vcpu_time; @@ -288,6 +365,7 @@ void __init kvmclock_init(void) kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT); put_cpu(); + x86_init.timers.setup_APIC_clockev = kvm_setup_tsc_deadline_timer; x86_platform.calibrate_tsc = kvm_get_tsc_khz; x86_platform.get_wallclock = kvm_get_wallclock; x86_platform.set_wallclock = kvm_set_wallclock; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index dad5fe9633a3..b85a323208dc 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -70,6 +70,7 @@ struct x86_init_ops x86_init __initdata = { .setup_percpu_clockev = setup_boot_APIC_clock, .timer_init = hpet_time_init, .wallclock_init = x86_init_noop, + .setup_APIC_clockev = setup_APIC_clockev, }, .iommu = { -- 1.8.3.1 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html