Resync all CPUs periodically to measure TSC skew. Use the measured skew
to adjust the resync interval (not done yet; a heuristic is needed).

Signed-off-by: Zachary Amsden <zamsden@xxxxxxxxxx>
---
 arch/x86/kvm/x86.c |   93 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 90 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 792c895..3a854ec 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -750,9 +750,10 @@ struct cpu_tsc_vars
 	u64 last_ref;
 };
 static DEFINE_PER_CPU(struct cpu_tsc_vars, cpu_tsc_vars);
-
 static int tsc_base_cpu = -1;
 static unsigned long ref_tsc_khz;
+static u64 tsc_drift;
+static struct timer_list resync_timer;
 
 static inline int cpu_is_tsc_synchronized(int cpu)
 {
@@ -935,6 +936,7 @@ static void sync_tsc_helper(int measure_cpu, s64 *delta, atomic_t *ready)
  * Average and trim the samples of any outliers; we use > 2 x sigma
  */
 static u64 tsc_deviation;
+static u64 tsc_skew;
 static s64 average_samples(s64 *samples, unsigned num_samples)
 {
 	unsigned i, j;
@@ -993,10 +995,24 @@ static void kvm_sync_tsc(void *cpup)
 	s64 *delta1, *delta2;
 	static atomic_t ready ____cacheline_aligned = ATOMIC_INIT(1);
 	struct cpu_tsc_vars *cv = &per_cpu(cpu_tsc_vars, new_cpu);
+	static u64 old_base;
+	static s64 old_offset;
+	static unsigned long old_multiplier;
+	static unsigned int old_shift;
 
 	BUG_ON(tsc_base_cpu == -1);
 	local_irq_save(flags);
+
+	/*
+	 * First, the new CPU may be just coming up to sync or might have
+	 * changed frequency, which means the measurement base must be
+	 * adjusted.  If not, we can use it to compute a skew estimate.
+	 */
 	if (raw_smp_processor_id() == new_cpu) {
+		old_multiplier = cv->tsc_multiplier;
+		old_shift = cv->tsc_shift;
+		old_base = cv->tsc_measure_base;
+		old_offset = cv->tsc_offset;
 		cv->tsc_measure_base = native_read_tsc();
 		cv->tsc_offset = 0;
 		compute_best_multiplier(ref_tsc_khz, cv->tsc_khz,
@@ -1005,10 +1021,12 @@ static void kvm_sync_tsc(void *cpup)
 			" tsc_base_cpu = %d\n", __func__, new_cpu,
 			cv->tsc_khz, cv->tsc_measure_base, tsc_base_cpu);
 	}
+
 	delta1 = per_cpu(delta_array, tsc_base_cpu).delta;
 	delta2 = per_cpu(delta_array, new_cpu).delta;
 	sync_tsc_helper(tsc_base_cpu, delta1, &ready);
 	sync_tsc_helper(new_cpu, delta2, &ready);
+
 	if (raw_smp_processor_id() == new_cpu) {
 		s64 accumulator = 0;
 
@@ -1024,8 +1042,40 @@ static void kvm_sync_tsc(void *cpup)
 		accumulator += average_samples(&delta1[2], SYNC_TRIES-3);
 		accumulator -= average_samples(&delta2[2], SYNC_TRIES-3);
 		accumulator /= 2;
-
 		cv->tsc_offset = accumulator;
+
+		/*
+		 * Skew can be computed over a constant multiplier as follows:
+		 *
+		 * ref_new = (tsc_new - base_new) * mult + off_new
+		 * ref_old = (tsc_old - base_old) * mult + off_old
+		 *
+		 * skew = ref_new - (ref_old + delta_ref)
+		 *
+		 * skew = off_new - off_old + mult(tsc_new - tsc_old)
+		 *	  - mult(base_new - base_old) - delta_ref
+		 *
+		 * The tsc_old / tsc_new values are not recoverable, but
+		 * observe that mult(tsc_new - tsc_old) == delta_ref, so
+		 *
+		 * skew = delta(off) - mult(delta base)
+		 *
+		 * To avoid problems with signed computation, we multiply
+		 * unsigned numbers first before switching to signed arithmetic
+		 */
+		if (old_multiplier == cv->tsc_multiplier &&
+		    old_shift == cv->tsc_shift) {
+			u64 sbo = old_base, sbn = cv->tsc_measure_base;
+			s64 skew;
+			sbo = mult_precise(sbo, old_multiplier, old_shift);
+			sbn = mult_precise(sbn, old_multiplier, old_shift);
+			skew = cv->tsc_offset - old_offset + (sbo - sbn);
+			if (skew < 0)
+				skew = -skew;
+			if (skew > tsc_skew)
+				tsc_skew = skew;
+		}
+
 	smp_wmb();
 	++cv->tsc_generation;
 	atomic_set(&cv->tsc_synchronized, 1);
@@ -3611,6 +3661,8 @@ static long resync(void *unused)
 	struct cpu_tsc_vars *cv = &__get_cpu_var(cpu_tsc_vars);
 	u64 tsc = 0;
 	int cpu;
+	static unsigned long jif_old;
+	unsigned long jif_delta;
 
 	/*
 	 * First, make sure we are on the right CPU; between when the work got
@@ -3643,17 +3695,28 @@ static long resync(void *unused)
 	cv->tsc_generation++; // XXX needed?
 	*/
 	compute_best_multiplier(ref_tsc_khz, cv->tsc_khz, &cv->tsc_multiplier,
 				&cv->tsc_shift);
+	tsc_skew = 0;
 	atomic_set(&cv->tsc_synchronized, 1);
+	smp_wmb();
 	for_each_online_cpu(cpu)
 		kvm_do_sync_tsc(cpu);
+	for_each_online_cpu(cpu)
+		while (!cpu_is_tsc_synchronized(cpu))
+			cpu_relax();
+
+	smp_rmb();
+	jif_delta = jiffies - jif_old;
+	pr_debug("max TSC skew now estimated at %llu over %lu jiffies\n",
+		 tsc_skew, jif_delta);
+	jif_old = jiffies;
+	mod_timer(&resync_timer, jiffies + HZ * 50);
 
 	put_cpu();
 	return 0;
 }
 
 static DEFINE_MUTEX(resync_lock);
-
 static void resync_all(void)
 {
 	mutex_lock(&resync_lock);
@@ -3662,6 +3725,18 @@ static void resync_all(void)
 	mutex_unlock(&resync_lock);
 }
 
+static struct work_struct resync_work;
+static void resync_work_fn(struct work_struct *work)
+{
+	resync_all();
+}
+
+static void resync_callout(unsigned long unused)
+{
+	INIT_WORK(&resync_work, resync_work_fn);
+	schedule_work(&resync_work);
+}
+
 static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
 				     void *data)
 {
@@ -3836,6 +3911,15 @@ static void kvm_timer_init(void)
 		for_each_possible_cpu(cpu)
 			per_cpu(cpu_tsc_vars, cpu).tsc_khz = tsc_khz;
 	}
+
+	/*
+	 * Now, pick a CPU to make the master and synchronize all other
+	 * CPUs to its clock.  Periodically check for drift as well.
+	 * Our initial drift estimate is 1 ppm / sec.
+	 */
+	tsc_drift = ref_tsc_khz / 1000;
+	init_timer(&resync_timer);
+	resync_timer.function = resync_callout;
 	tsc_base_cpu = get_cpu();
 	put_cpu();
 	resync_all();
@@ -3898,6 +3982,9 @@ void kvm_arch_exit(void)
 		pci_write_config_byte(*nb, 0x87, disabled_c1_ramp);
 	}
 #endif
+	mutex_lock(&resync_lock);
+	del_timer(&resync_timer);
+	mutex_unlock(&resync_lock);
 }
 
 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
-- 
1.6.5.2
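
A note for reviewers, not part of the patch: the skew identity in the
kvm_sync_tsc() comment is easy to sanity-check in userspace. In the
sketch below, mult_precise() is modeled as a plain (val * mult) >> shift
(the demo inputs are small enough that the 64-bit product cannot
overflow; the real helper is assumed to preserve the full product), and
all base/offset numbers are made up.

/* skew_demo.c: userspace check of skew = delta(off) - mult(delta base) */
#include <stdio.h>
#include <stdint.h>

/* Model of mult_precise(): (val * mult) >> shift.  Demo inputs are small
 * enough that the 64-bit product cannot overflow. */
static uint64_t mult_precise(uint64_t val, unsigned long mult, unsigned int shift)
{
	return (val * mult) >> shift;
}

int main(void)
{
	/* Made-up numbers: this mult/shift pair scales raw TSC units by 0.5. */
	unsigned long mult = 1UL << 15;
	unsigned int shift = 16;

	uint64_t base_old = 1000000, base_new = 3000000;
	int64_t off_old = 500, off_new = 1000700;

	/* Multiply unsigned first, then go signed, as the patch does. */
	uint64_t sbo = mult_precise(base_old, mult, shift);
	uint64_t sbn = mult_precise(base_new, mult, shift);
	int64_t skew = off_new - off_old + (int64_t)(sbo - sbn);

	if (skew < 0)
		skew = -skew;
	/* Prints 200: this CPU gained 200 reference units between syncs. */
	printf("skew = %lld\n", (long long)skew);
	return 0;
}

Note the guard in the patch: if compute_best_multiplier() picked a new
multiplier or shift since the last sync (e.g. after a frequency change),
the constant-multiplier assumption behind the identity breaks, so
tsc_skew is only updated when old_multiplier and old_shift still match.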
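
The smp_wmb()/smp_rmb() pair added in resync() is the usual
publish/consume pattern: the writer clears tsc_skew and publishes via
tsc_synchronized; the reader spins on cpu_is_tsc_synchronized() and then
issues a read barrier before consuming tsc_skew. A userspace model using
C11 acquire/release (slightly stronger than the raw wmb/rmb pairing, but
self-contained and runnable; build with cc -pthread):

/* barrier_demo.c: publish/consume pattern modeled with C11 atomics. */
#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

static unsigned long long payload;	/* stands in for the per-cpu TSC vars */
static atomic_int published;		/* stands in for cv->tsc_synchronized */

static void *writer(void *unused)
{
	payload = 42;			/* write the data first... */
	/* ...then publish; the release store plays the smp_wmb() role */
	atomic_store_explicit(&published, 1, memory_order_release);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, writer, NULL);
	/* spin like the resync loop; the acquire load plays the smp_rmb() role */
	while (!atomic_load_explicit(&published, memory_order_acquire))
		;
	printf("payload = %llu\n", payload);	/* guaranteed to print 42 */
	pthread_join(t, NULL);
	return 0;
}

The same reasoning explains resync_callout(): a timer callback runs in
atomic context and cannot take resync_lock or sleep, so it defers to
process context through schedule_work().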
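
Regarding the "heuristic needed" in the changelog: one possible shape
for it, sketched in userspace with made-up names (next_resync(),
SKEW_BUDGET) and an assumed HZ, is to scale the next interval so the
skew expected to accumulate before the next check stays under a fixed
budget:

/* resync_heuristic.c: one possible interval heuristic (not in the patch). */
#include <stdio.h>
#include <stdint.h>

#define HZ 250			/* assumed tick rate for the demo */
#define SKEW_BUDGET 10000ULL	/* tolerated skew, in reference units */

static unsigned long next_resync(uint64_t skew, unsigned long jif_delta)
{
	uint64_t rate, secs;

	/* Observed skew accumulation per second, rounded up. */
	rate = (skew * HZ + jif_delta - 1) / jif_delta;
	if (rate == 0)
		return 300 * HZ;	/* no measurable skew: back off */

	/* Seconds until the budget would be exhausted, clamped to [1s, 300s]. */
	secs = SKEW_BUDGET / rate;
	if (secs < 1)
		secs = 1;
	if (secs > 300)
		secs = 300;
	return (unsigned long)(secs * HZ);
}

int main(void)
{
	/* e.g. 2000 units of skew measured over 50 seconds of jiffies */
	printf("next resync in %lu jiffies\n", next_resync(2000, 50UL * HZ));
	return 0;
}

How tsc_skew should age between passes (the patch resets it to 0 on
every resync) and whether the unused tsc_drift estimate should seed the
first interval are left open here, just as they are in the patch.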