Most of the codes were borrowed from arxh/x86/kernel/kvmclock.c. A special bit: PV_CLOCK_CYCLE_RAW_TEST_BIT is used to notify the driver to return unadjusted cycles. Signed-off-by: Jason Wang <jasowang@xxxxxxxxxx> --- x86/kvmclock.c | 279 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ x86/kvmclock.h | 60 ++++++++++++ 2 files changed, 339 insertions(+), 0 deletions(-) create mode 100644 x86/kvmclock.c create mode 100644 x86/kvmclock.h diff --git a/x86/kvmclock.c b/x86/kvmclock.c new file mode 100644 index 0000000..0624da3 --- /dev/null +++ b/x86/kvmclock.c @@ -0,0 +1,279 @@ +#include "libcflat.h" +#include "smp.h" +#include "atomic.h" +#include "processor.h" +#include "kvmclock.h" + +#define unlikely(x) __builtin_expect(!!(x), 0) +#define likely(x) __builtin_expect(!!(x), 1) + + +struct pvclock_vcpu_time_info __attribute__((aligned(4))) hv_clock[MAX_CPU]; +struct pvclock_wall_clock wall_clock; +static unsigned char valid_flags = 0; +static atomic64_t last_value = ATOMIC64_INIT(0); + +/* + * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, + * yielding a 64-bit result. + */ +static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) +{ + u64 product; +#ifdef __i386__ + u32 tmp1, tmp2; +#endif + + if (shift < 0) + delta >>= -shift; + else + delta <<= shift; + +#ifdef __i386__ + __asm__ ( + "mul %5 ; " + "mov %4,%%eax ; " + "mov %%edx,%4 ; " + "mul %5 ; " + "xor %5,%5 ; " + "add %4,%%eax ; " + "adc %5,%%edx ; " + : "=A" (product), "=r" (tmp1), "=r" (tmp2) + : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); +#elif defined(__x86_64__) + __asm__ ( + "mul %%rdx ; shrd $32,%%rdx,%%rax" + : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); +#else +#error implement me! +#endif + + return product; +} + +#ifdef __i386__ +# define do_div(n,base) ({ \ + u32 __base = (base); \ + u32 __rem; \ + __rem = ((u64)(n)) % __base; \ + (n) = ((u64)(n)) / __base; \ + __rem; \ + }) +#else +u32 __attribute__((weak)) __div64_32(u64 *n, u32 base) +{ + u64 rem = *n; + u64 b = base; + u64 res, d = 1; + u32 high = rem >> 32; + + /* Reduce the thing a bit first */ + res = 0; + if (high >= base) { + high /= base; + res = (u64) high << 32; + rem -= (u64) (high*base) << 32; + } + + while ((s64)b > 0 && b < rem) { + b = b+b; + d = d+d; + } + + do { + if (rem >= b) { + rem -= b; + res += d; + } + b >>= 1; + d >>= 1; + } while (d); + + *n = res; + return rem; +} + +# define do_div(n,base) ({ \ + u32 __base = (base); \ + u32 __rem; \ + (void)(((typeof((n)) *)0) == ((u64 *)0)); \ + if (likely(((n) >> 32) == 0)) { \ + __rem = (u32)(n) % __base; \ + (n) = (u32)(n) / __base; \ + } else \ + __rem = __div64_32(&(n), __base); \ + __rem; \ + }) +#endif + +/** + * set_normalized_timespec - set timespec sec and nsec parts and normalize + * + * @ts: pointer to timespec variable to be set + * @sec: seconds to set + * @nsec: nanoseconds to set + * + * Set seconds and nanoseconds field of a timespec variable and + * normalize to the timespec storage format + * + * Note: The tv_nsec part is always in the range of + * 0 <= tv_nsec < NSEC_PER_SEC + * For negative values only the tv_sec field is negative ! + */ +void set_normalized_timespec(struct timespec *ts, long sec, s64 nsec) +{ + while (nsec >= NSEC_PER_SEC) { + /* + * The following asm() prevents the compiler from + * optimising this loop into a modulo operation. See + * also __iter_div_u64_rem() in include/linux/time.h + */ + asm("" : "+rm"(nsec)); + nsec -= NSEC_PER_SEC; + ++sec; + } + while (nsec < 0) { + asm("" : "+rm"(nsec)); + nsec += NSEC_PER_SEC; + --sec; + } + ts->tv_sec = sec; + ts->tv_nsec = nsec; +} + +static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) +{ + u64 delta = rdtsc() - shadow->tsc_timestamp; + return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); +} + +/* + * Reads a consistent set of time-base values from hypervisor, + * into a shadow data area. + */ +static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst, + struct pvclock_vcpu_time_info *src) +{ + do { + dst->version = src->version; + rmb(); /* fetch version before data */ + dst->tsc_timestamp = src->tsc_timestamp; + dst->system_timestamp = src->system_time; + dst->tsc_to_nsec_mul = src->tsc_to_system_mul; + dst->tsc_shift = src->tsc_shift; + dst->flags = src->flags; + rmb(); /* test version after fetching data */ + } while ((src->version & 1) || (dst->version != src->version)); + + return dst->version; +} + +cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) +{ + struct pvclock_shadow_time shadow; + unsigned version; + cycle_t ret, offset; + u64 last; + + do { + version = pvclock_get_time_values(&shadow, src); + barrier(); + offset = pvclock_get_nsec_offset(&shadow); + ret = shadow.system_timestamp + offset; + barrier(); + } while (version != src->version); + + if ((valid_flags & PVCLOCK_RAW_CYCLE_BIT) || + ((valid_flags & PVCLOCK_TSC_STABLE_BIT) && + (shadow.flags & PVCLOCK_TSC_STABLE_BIT))) + return ret; + + /* + * Assumption here is that last_value, a global accumulator, always goes + * forward. If we are less than that, we should not be much smaller. + * We assume there is an error marging we're inside, and then the correction + * does not sacrifice accuracy. + * + * For reads: global may have changed between test and return, + * but this means someone else updated poked the clock at a later time. + * We just need to make sure we are not seeing a backwards event. + * + * For updates: last_value = ret is not enough, since two vcpus could be + * updating at the same time, and one of them could be slightly behind, + * making the assumption that last_value always go forward fail to hold. + */ + last = atomic64_read(&last_value); + do { + if (ret < last) + return last; + last = atomic64_cmpxchg(&last_value, last, ret); + } while (unlikely(last != ret)); + + return ret; +} + +cycle_t kvm_clock_read() +{ + struct pvclock_vcpu_time_info *src; + cycle_t ret; + int index = smp_id(); + + src = &hv_clock[index]; + ret = pvclock_clocksource_read(src); + return ret; +} + +void kvm_clock_init(void *data) +{ + int index = smp_id(); + struct pvclock_vcpu_time_info *hvc = &hv_clock[index]; + + printf("kvm-clock: cpu %d, msr 0x:%lx \n", index, hvc); + wrmsr(MSR_KVM_SYSTEM_TIME, (unsigned long)hvc | 1); +} + +void kvm_clock_clear(void *data) +{ + wrmsr(MSR_KVM_SYSTEM_TIME, 0LL); +} + +void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, + struct pvclock_vcpu_time_info *vcpu_time, + struct timespec *ts) +{ + u32 version; + u64 delta; + struct timespec now; + + /* get wallclock at system boot */ + do { + version = wall_clock->version; + rmb(); /* fetch version before time */ + now.tv_sec = wall_clock->sec; + now.tv_nsec = wall_clock->nsec; + rmb(); /* fetch time before checking version */ + } while ((wall_clock->version & 1) || (version != wall_clock->version)); + + delta = pvclock_clocksource_read(vcpu_time); /* time since system boot */ + delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; + + now.tv_nsec = do_div(delta, NSEC_PER_SEC); + now.tv_sec = delta; + + set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); +} + +void kvm_get_wallclock(struct timespec *ts) +{ + struct pvclock_vcpu_time_info *vcpu_time; + int index = smp_id(); + + wrmsr(MSR_KVM_WALL_CLOCK, (unsigned long)&wall_clock); + vcpu_time = &hv_clock[index]; + pvclock_read_wallclock(&wall_clock, vcpu_time, ts); +} + +void pvclock_set_flags(unsigned char flags) +{ + valid_flags = flags; +} diff --git a/x86/kvmclock.h b/x86/kvmclock.h new file mode 100644 index 0000000..166a338 --- /dev/null +++ b/x86/kvmclock.h @@ -0,0 +1,60 @@ +#ifndef KVMCLOCK_H +#define KVMCLOCK_H + +#define MSR_KVM_WALL_CLOCK 0x11 +#define MSR_KVM_SYSTEM_TIME 0x12 + +#define MAX_CPU 64 + +#define PVCLOCK_TSC_STABLE_BIT (1 << 0) +#define PVCLOCK_RAW_CYCLE_BIT (1 << 7) /* Get raw cycle */ + +# define NSEC_PER_SEC 1000000000ULL + +typedef u64 cycle_t; + +struct pvclock_vcpu_time_info { + u32 version; + u32 pad0; + u64 tsc_timestamp; + u64 system_time; + u32 tsc_to_system_mul; + s8 tsc_shift; + u8 flags; + u8 pad[2]; +} __attribute__((__packed__)); /* 32 bytes */ + +struct pvclock_wall_clock { + u32 version; + u32 sec; + u32 nsec; +} __attribute__((__packed__)); + +/* + * These are perodically updated + * xen: magic shared_info page + * kvm: gpa registered via msr + * and then copied here. + */ +struct pvclock_shadow_time { + u64 tsc_timestamp; /* TSC at last update of time vals. */ + u64 system_timestamp; /* Time, in nanosecs, since boot. */ + u32 tsc_to_nsec_mul; + int tsc_shift; + u32 version; + u8 flags; +}; + + +struct timespec { + long tv_sec; + long tv_nsec; +}; + +void pvclock_set_flags(unsigned char flags); +cycle_t kvm_clock_read(); +void kvm_get_wallclock(struct timespec *ts); +void kvm_clock_init(void *data); +void kvm_clock_clear(void *data); + +#endif -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html