On Tue, Jul 5, 2016 at 10:36 AM, Paolo Bonzini <pbonzini@xxxxxxxxxx> wrote: > Bad things happen if a guest using the TSC deadline timer is migrated. > The guest doesn't re-calibrate the TSC after migration, and the > TSC frequency can and will change unless your processor supports TSC > scaling (on Intel this is only Skylake) or your data center is perfectly > homogeneous. > > The solution in this patch is to skip tsc_khz, and instead derive the > frequency from kvmclock's (mult, shift) pair. Because kvmclock > parameters convert from tsc to nanoseconds, this needs a division > but that's the least of our problems when the TSC_DEADLINE_MSR write > costs 2000 clock cycles. Luckily tsc_khz is really used by very little > outside the tsc clocksource (which kvmclock replaces) and the TSC > deadline timer. Two other ways to solve the problem, which I don't know whether you've considered: * Constrain the set of hosts a given VM can run on based on the TSC rate. (So you don't need a perfectly homogeneous fleet, just need each VM to be constrained to a homogeneous subset). * Disable the TSC deadline timer from QEMU by assigning a CPUID with the TSC deadline capability zeroed (at least among VMs which could migrate to hosts with different TSC rates). These VMs will use the APIC timer, which runs at a nice fixed rate. > > This patch does not handle the very first deadline, hoping that it > is masked by the migration downtime (i.e. that the timer fires late > anyway). I'd like a remark on the approach in general and ideas on how > to handle the first deadline. It's also possible to just blacklist the > TSC deadline timer of course, and it's probably the best thing to do for > stable kernels. > It would require extending to other modes the > implementation of preemption-timer based APIC timer. This would be nice to have. > It'd be a pity to > lose the nice latency boost that the preemption timer offers. 
> > The patch is also quite ugly in the way it arranges for kvmclock to > replace only a small part of setup_apic_timer; better ideas are welcome. > > Signed-off-by: Paolo Bonzini <pbonzini@xxxxxxxxxx> > --- > arch/x86/include/asm/apic.h | 2 ++ > arch/x86/include/asm/x86_init.h | 5 +++ > arch/x86/kernel/apic/apic.c | 15 +++++--- > arch/x86/kernel/kvmclock.c | 78 +++++++++++++++++++++++++++++++++++++++++ > arch/x86/kernel/x86_init.c | 1 + > 5 files changed, 96 insertions(+), 5 deletions(-) > > diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h > index bc27611fa58f..cc4f45f66218 100644 > --- a/arch/x86/include/asm/apic.h > +++ b/arch/x86/include/asm/apic.h > @@ -135,6 +135,7 @@ extern void init_apic_mappings(void); > void register_lapic_address(unsigned long address); > extern void setup_boot_APIC_clock(void); > extern void setup_secondary_APIC_clock(void); > +extern void setup_APIC_clockev(struct clock_event_device *levt); > extern int APIC_init_uniprocessor(void); > > #ifdef CONFIG_X86_64 > @@ -170,6 +171,7 @@ static inline void init_apic_mappings(void) { } > static inline void disable_local_APIC(void) { } > # define setup_boot_APIC_clock x86_init_noop > # define setup_secondary_APIC_clock x86_init_noop > +# define setup_APIC_clockev NULL > #endif /* !CONFIG_X86_LOCAL_APIC */ > > #ifdef CONFIG_X86_X2APIC > diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h > index 4dcdf74dfed8..d0f099ab4dba 100644 > --- a/arch/x86/include/asm/x86_init.h > +++ b/arch/x86/include/asm/x86_init.h > @@ -7,6 +7,7 @@ struct mpc_bus; > struct mpc_cpu; > struct mpc_table; > struct cpuinfo_x86; > +struct clock_event_device; > > /** > * struct x86_init_mpparse - platform specific mpparse ops > @@ -84,11 +85,15 @@ struct x86_init_paging { > * boot cpu > * @timer_init: initialize the platform timer (default PIT/HPET) > * @wallclock_init: init the wallclock device > + * @setup_APIC_clockev: tweak the clock_event_device for the LAPIC timer, > + * if 
setup_boot_APIC_clock and/or > + * setup_secondary_APIC_clock are in use > */ > struct x86_init_timers { > void (*setup_percpu_clockev)(void); > void (*timer_init)(void); > void (*wallclock_init)(void); > + void (*setup_APIC_clockev)(struct clock_event_device *levt); > }; > > /** > diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c > index 60078a67d7e3..b7a331f329d0 100644 > --- a/arch/x86/kernel/apic/apic.c > +++ b/arch/x86/kernel/apic/apic.c > @@ -558,15 +558,20 @@ static void setup_APIC_timer(void) > memcpy(levt, &lapic_clockevent, sizeof(*levt)); > levt->cpumask = cpumask_of(smp_processor_id()); > > + x86_init.timers.setup_APIC_clockev(levt); > + clockevents_register_device(levt); > +} > + > +void setup_APIC_clockev(struct clock_event_device *levt) > +{ > if (this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) { > levt->features &= ~(CLOCK_EVT_FEAT_PERIODIC | > CLOCK_EVT_FEAT_DUMMY); > + levt->min_delta_ticks = 0xF; > + levt->max_delta_ticks = ~0UL; > levt->set_next_event = lapic_next_deadline; > - clockevents_config_and_register(levt, > - (tsc_khz / TSC_DIVISOR) * 1000, > - 0xF, ~0UL); > - } else > - clockevents_register_device(levt); > + clockevents_config(levt, (tsc_khz / TSC_DIVISOR) * 1000); > + } > } > > /* > diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c > index 1d39bfbd26bb..61e094207339 100644 > --- a/arch/x86/kernel/kvmclock.c > +++ b/arch/x86/kernel/kvmclock.c > @@ -17,6 +17,7 @@ > */ > > #include <linux/clocksource.h> > +#include <linux/clockchips.h> > #include <linux/kvm_para.h> > #include <asm/pvclock.h> > #include <asm/msr.h> > @@ -245,6 +246,82 @@ static void kvm_shutdown(void) > native_machine_shutdown(); > } > > +/* > + * We already have the inverse of the (mult,shift) pair, though this means > + * we need a division. To avoid it we could compute a multiplicative inverse > + * every time src->version changes. 
> + */ > +#define KVMCLOCK_TSC_DEADLINE_MAX_BITS 38 > +#define KVMCLOCK_TSC_DEADLINE_MAX ((1ull << KVMCLOCK_TSC_DEADLINE_MAX_BITS) - 1) > + > +static int lapic_next_ktime(ktime_t expires, > + struct clock_event_device *evt) > +{ > + u64 ns, tsc; > + u32 version; > + int cpu; > + struct pvclock_vcpu_time_info *src; > + > + cpu = smp_processor_id(); > + src = &hv_clock[cpu].pvti; > + ns = ktime_to_ns(expires); > + do { > + u64 delta_ns; > + int shift; > + > + version = src->version; > + virt_rmb(); > + if (unlikely(ns < src->system_time)) { > + tsc = src->tsc_timestamp; > + virt_rmb(); > + continue; > + } > + > + delta_ns = ns - src->system_time; > + > + /* Cap the wait to avoid overflow. */ > + if (unlikely(delta_ns > KVMCLOCK_TSC_DEADLINE_MAX)) > + delta_ns = KVMCLOCK_TSC_DEADLINE_MAX; > + > + /* > + * delta_tsc = delta_ns << (32-tsc_shift) / tsc_to_system_mul. > + * The shift is split in two steps so that a 38 bits (275 s) > + * deadline fits into the 64-bit dividend. > + */ > + shift = 32 - src->tsc_shift; > + > + /* First shift step... */ > + delta_ns <<= 64 - KVMCLOCK_TSC_DEADLINE_MAX_BITS; > + shift -= 64 - KVMCLOCK_TSC_DEADLINE_MAX_BITS; > + > + /* ... division... */ > + tsc = div_u64(delta_ns, src->tsc_to_system_mul); > + > + /* ... and second shift step for the remaining bits. 
*/ > + if (shift >= 0) > + tsc <<= shift; > + else > + tsc >>= -shift; > + > + tsc += src->tsc_timestamp; > + virt_rmb(); > + } while((src->version & 1) || version != src->version); > + > + wrmsrl(MSR_IA32_TSC_DEADLINE, tsc); > + return 0; > +} > + > + > +static void kvm_setup_tsc_deadline_timer(struct clock_event_device *levt) > +{ > + if (this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) { > + levt->features &= ~(CLOCK_EVT_FEAT_PERIODIC | > + CLOCK_EVT_FEAT_DUMMY); > + levt->features |= CLOCK_EVT_FEAT_KTIME; > + levt->set_next_ktime = lapic_next_ktime; > + } > +} > + > void __init kvmclock_init(void) > { > struct pvclock_vcpu_time_info *vcpu_time; > @@ -288,6 +365,7 @@ void __init kvmclock_init(void) > kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT); > put_cpu(); > > + x86_init.timers.setup_APIC_clockev = kvm_setup_tsc_deadline_timer; > x86_platform.calibrate_tsc = kvm_get_tsc_khz; > x86_platform.get_wallclock = kvm_get_wallclock; > x86_platform.set_wallclock = kvm_set_wallclock; > diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c > index dad5fe9633a3..b85a323208dc 100644 > --- a/arch/x86/kernel/x86_init.c > +++ b/arch/x86/kernel/x86_init.c > @@ -70,6 +70,7 @@ struct x86_init_ops x86_init __initdata = { > .setup_percpu_clockev = setup_boot_APIC_clock, > .timer_init = hpet_time_init, > .wallclock_init = x86_init_noop, > + .setup_APIC_clockev = setup_APIC_clockev, > }, > > .iommu = { > -- > 1.8.3.1 > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html