Re: [RFC PATCH] x86, kvm: use kvmclock to compute TSC deadline value

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, Jul 5, 2016 at 10:36 AM, Paolo Bonzini <pbonzini@xxxxxxxxxx> wrote:
> Bad things happen if a guest using the TSC deadline timer is migrated.
> The guest doesn't re-calibrate the TSC after migration, and the
> TSC frequency can and will change unless your processor supports TSC
> scaling (on Intel this is only Skylake) or your data center is perfectly
> homogeneous.
>
> The solution in this patch is to skip tsc_khz, and instead derive the
> frequency from kvmclock's (mult, shift) pair.  Because kvmclock
> parameters convert from tsc to nanoseconds, this needs a division
> but that's the least of our problems when the TSC_DEADLINE_MSR write
> costs 2000 clock cycles.  Luckily tsc_khz is really used by very little
> outside the tsc clocksource (which kvmclock replaces) and the TSC
> deadline timer.

Two other ways to solve the problem, I don't know if you've considered:
* Constrain the set of hosts a given VM can run on based on the TSC
rate. (So don't need a perfectly homogenous fleet, just need each VM
to be constrained to a homogenous subset).
* Disable the TSC deadline timer from QEMU by assigning a CPUID with
the TSC capability zeroed (at least among VMs which could migrate to
hosts with different TSC rates). These VMs will use the APIC timer
which runs at a nice fixed rate.

>
> This patch does not handle the very first deadline, hoping that it
> is masked by the migration downtime (i.e. that the timer fires late
> anyway).  I'd like a remark on the approach in general and ideas on how
> to handle the first deadline.  It's also possible to just blacklist the
> TSC deadline timer of course, and it's probably the best thing to do for
> stable kernels.

> It would require extending to other modes the
> implementation of preemption-timer based APIC timer.

This would be nice to have.

> It'd be a pity to
> lose the nice latency boost that the preemption timer offers.
>
> The patch is also quite ugly in the way it arranges for kvmclock to
> replace only a small part of setup_apic_timer; better ideas are welcome.
>
> Signed-off-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>
> ---
>  arch/x86/include/asm/apic.h     |  2 ++
>  arch/x86/include/asm/x86_init.h |  5 +++
>  arch/x86/kernel/apic/apic.c     | 15 +++++---
>  arch/x86/kernel/kvmclock.c      | 78 +++++++++++++++++++++++++++++++++++++++++
>  arch/x86/kernel/x86_init.c      |  1 +
>  5 files changed, 96 insertions(+), 5 deletions(-)
>
> diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
> index bc27611fa58f..cc4f45f66218 100644
> --- a/arch/x86/include/asm/apic.h
> +++ b/arch/x86/include/asm/apic.h
> @@ -135,6 +135,7 @@ extern void init_apic_mappings(void);
>  void register_lapic_address(unsigned long address);
>  extern void setup_boot_APIC_clock(void);
>  extern void setup_secondary_APIC_clock(void);
> +extern void setup_APIC_clockev(struct clock_event_device *levt);
>  extern int APIC_init_uniprocessor(void);
>
>  #ifdef CONFIG_X86_64
> @@ -170,6 +171,7 @@ static inline void init_apic_mappings(void) { }
>  static inline void disable_local_APIC(void) { }
>  # define setup_boot_APIC_clock x86_init_noop
>  # define setup_secondary_APIC_clock x86_init_noop
> +# define setup_APIC_clockev NULL
>  #endif /* !CONFIG_X86_LOCAL_APIC */
>
>  #ifdef CONFIG_X86_X2APIC
> diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
> index 4dcdf74dfed8..d0f099ab4dba 100644
> --- a/arch/x86/include/asm/x86_init.h
> +++ b/arch/x86/include/asm/x86_init.h
> @@ -7,6 +7,7 @@ struct mpc_bus;
>  struct mpc_cpu;
>  struct mpc_table;
>  struct cpuinfo_x86;
> +struct clock_event_device;
>
>  /**
>   * struct x86_init_mpparse - platform specific mpparse ops
> @@ -84,11 +85,15 @@ struct x86_init_paging {
>   *                             boot cpu
>   * @timer_init:                        initialize the platform timer (default PIT/HPET)
>   * @wallclock_init:            init the wallclock device
> + * @setup_APIC_clockev:         tweak the clock_event_device for the LAPIC timer,
> + *                              if setup_boot_APIC_clock and/or
> + *                              setup_secondary_APIC_clock are in use
>   */
>  struct x86_init_timers {
>         void (*setup_percpu_clockev)(void);
>         void (*timer_init)(void);
>         void (*wallclock_init)(void);
> +       void (*setup_APIC_clockev)(struct clock_event_device *levt);
>  };
>
>  /**
> diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
> index 60078a67d7e3..b7a331f329d0 100644
> --- a/arch/x86/kernel/apic/apic.c
> +++ b/arch/x86/kernel/apic/apic.c
> @@ -558,15 +558,20 @@ static void setup_APIC_timer(void)
>         memcpy(levt, &lapic_clockevent, sizeof(*levt));
>         levt->cpumask = cpumask_of(smp_processor_id());
>
> +       x86_init.timers.setup_APIC_clockev(levt);
> +       clockevents_register_device(levt);
> +}
> +
> +void setup_APIC_clockev(struct clock_event_device *levt)
> +{
>         if (this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) {
>                 levt->features &= ~(CLOCK_EVT_FEAT_PERIODIC |
>                                     CLOCK_EVT_FEAT_DUMMY);
> +               levt->min_delta_ticks = 0xF;
> +               levt->max_delta_ticks = ~0UL;
>                 levt->set_next_event = lapic_next_deadline;
> -               clockevents_config_and_register(levt,
> -                                               (tsc_khz / TSC_DIVISOR) * 1000,
> -                                               0xF, ~0UL);
> -       } else
> -               clockevents_register_device(levt);
> +               clockevents_config(levt, (tsc_khz / TSC_DIVISOR) * 1000);
> +       }
>  }
>
>  /*
> diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
> index 1d39bfbd26bb..61e094207339 100644
> --- a/arch/x86/kernel/kvmclock.c
> +++ b/arch/x86/kernel/kvmclock.c
> @@ -17,6 +17,7 @@
>  */
>
>  #include <linux/clocksource.h>
> +#include <linux/clockchips.h>
>  #include <linux/kvm_para.h>
>  #include <asm/pvclock.h>
>  #include <asm/msr.h>
> @@ -245,6 +246,82 @@ static void kvm_shutdown(void)
>         native_machine_shutdown();
>  }
>
> +/*
> + * We already have the inverse of the (mult,shift) pair, though this means
> + * we need a division.  To avoid it we could compute a multiplicative inverse
> + * every time src->version changes.
> + */
> +#define KVMCLOCK_TSC_DEADLINE_MAX_BITS 38
> +#define KVMCLOCK_TSC_DEADLINE_MAX      ((1ull << KVMCLOCK_TSC_DEADLINE_MAX_BITS) - 1)
> +
> +static int lapic_next_ktime(ktime_t expires,
> +                           struct clock_event_device *evt)
> +{
> +       u64 ns, tsc;
> +       u32 version;
> +       int cpu;
> +       struct pvclock_vcpu_time_info *src;
> +
> +       cpu = smp_processor_id();
> +       src = &hv_clock[cpu].pvti;
> +       ns = ktime_to_ns(expires);
> +       do {
> +               u64 delta_ns;
> +               int shift;
> +
> +               version = src->version;
> +               virt_rmb();
> +               if (unlikely(ns < src->system_time)) {
> +                       tsc = src->tsc_timestamp;
> +                       virt_rmb();
> +                       continue;
> +               }
> +
> +               delta_ns = ns - src->system_time;
> +
> +               /* Cap the wait to avoid overflow.  */
> +               if (unlikely(delta_ns > KVMCLOCK_TSC_DEADLINE_MAX))
> +                       delta_ns = KVMCLOCK_TSC_DEADLINE_MAX;
> +
> +               /*
> +                * delta_tsc = delta_ns << (32-tsc_shift) / tsc_to_system_mul.
> +                * The shift is split in two steps so that a 38 bits (275 s)
> +                * deadline fits into the 64-bit dividend.
> +                */
> +               shift = 32 - src->tsc_shift;
> +
> +               /* First shift step... */
> +               delta_ns <<= 64 - KVMCLOCK_TSC_DEADLINE_MAX_BITS;
> +               shift -= 64 - KVMCLOCK_TSC_DEADLINE_MAX_BITS;
> +
> +               /* ... division... */
> +               tsc = div_u64(delta_ns, src->tsc_to_system_mul);
> +
> +               /* ... and second shift step for the remaining bits.  */
> +               if (shift >= 0)
> +                       tsc <<= shift;
> +               else
> +                       tsc >>= -shift;
> +
> +               tsc += src->tsc_timestamp;
> +               virt_rmb();
> +       } while((src->version & 1) || version != src->version);
> +
> +       wrmsrl(MSR_IA32_TSC_DEADLINE, tsc);
> +       return 0;
> +}
> +
> +
> +static void kvm_setup_tsc_deadline_timer(struct clock_event_device *levt)
> +{
> +       if (this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) {
> +               levt->features &= ~(CLOCK_EVT_FEAT_PERIODIC |
> +                                       CLOCK_EVT_FEAT_DUMMY);
> +               levt->features |= CLOCK_EVT_FEAT_KTIME;
> +               levt->set_next_ktime = lapic_next_ktime;
> +       }
> +}
> +
>  void __init kvmclock_init(void)
>  {
>         struct pvclock_vcpu_time_info *vcpu_time;
> @@ -288,6 +365,7 @@ void __init kvmclock_init(void)
>         kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
>         put_cpu();
>
> +       x86_init.timers.setup_APIC_clockev = kvm_setup_tsc_deadline_timer;
>         x86_platform.calibrate_tsc = kvm_get_tsc_khz;
>         x86_platform.get_wallclock = kvm_get_wallclock;
>         x86_platform.set_wallclock = kvm_set_wallclock;
> diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
> index dad5fe9633a3..b85a323208dc 100644
> --- a/arch/x86/kernel/x86_init.c
> +++ b/arch/x86/kernel/x86_init.c
> @@ -70,6 +70,7 @@ struct x86_init_ops x86_init __initdata = {
>                 .setup_percpu_clockev   = setup_boot_APIC_clock,
>                 .timer_init             = hpet_time_init,
>                 .wallclock_init         = x86_init_noop,
> +               .setup_APIC_clockev     = setup_APIC_clockev,
>         },
>
>         .iommu = {
> --
> 1.8.3.1
>
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]
  Powered by Linux