Use kvmclock for tsc calibration when running on kvm. Without this the tsc frequency calibrated by seabios can be *way* off in case the virtual machine is booted on a loaded host. I've seen seabios calibrating 27 instead of ca. 2800 MHz, resulting in timeouts being too short by a factor of 100. Which in turn leads to disk I/O errors due to timeouts, especially as I/O requests tend to take a bit longer than usual on a loaded box ... Signed-off-by: Gerd Hoffmann <kraxel@xxxxxxxxxx> --- src/clock.c | 9 +++++ src/paravirt.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/paravirt.h | 1 + 3 files changed, 100 insertions(+), 0 deletions(-) diff --git a/src/clock.c b/src/clock.c index 69e9f17..5883b1a 100644 --- a/src/clock.c +++ b/src/clock.c @@ -13,6 +13,7 @@ #include "bregs.h" // struct bregs #include "biosvar.h" // GET_GLOBAL #include "usb-hid.h" // usb_check_event +#include "paravirt.h" // kvm clock // RTC register flags #define RTC_A_UIP 0x80 @@ -80,6 +81,14 @@ calibrate_tsc(void) return; } + if (kvm_para_available()) { + u32 khz = kvm_tsc_khz(); + if (khz != 0) { + SET_GLOBAL(cpu_khz, khz); + return; + } + } + // Setup "timer2" u8 orig = inb(PORT_PS2_CTRLB); outb((orig & ~PPCB_SPKR) | PPCB_T2GATE, PORT_PS2_CTRLB); diff --git a/src/paravirt.c b/src/paravirt.c index 2a98d53..942ce11 100644 --- a/src/paravirt.c +++ b/src/paravirt.c @@ -12,6 +12,7 @@ #include "ioport.h" // outw #include "paravirt.h" // qemu_cfg_port_probe #include "smbios.h" // struct smbios_structure_header +#include "biosvar.h" // GET_GLOBAL int qemu_cfg_present; @@ -346,3 +347,92 @@ void qemu_cfg_romfile_setup(void) dprintf(3, "Found fw_cfg file: %s (size=%d)\n", file->name, file->size); } } + +#define KVM_CPUID_SIGNATURE 0x40000000 +#define KVM_CPUID_FEATURES 0x40000001 +#define KVM_FEATURE_CLOCKSOURCE 0 +#define KVM_FEATURE_CLOCKSOURCE2 3 +#define MSR_KVM_SYSTEM_TIME 0x12 +#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 + +struct pvclock_vcpu_time_info { + u32 version; + u32 pad0; + u64
tsc_timestamp; + u64 system_time; + u32 tsc_to_system_mul; + s8 tsc_shift; + u8 flags; + u8 pad[2]; +} PACKED; + +/* + * do_div() is NOT a C function. It wants to return + * two values (the quotient and the remainder), but + * since that doesn't work very well in C, what it + * does is: + * + * - modifies the 64-bit dividend _in_place_ + * - returns the 32-bit remainder + * + * This ends up being the most efficient "calling + * convention" on x86. + */ +#define do_div(n, base) \ + ({ \ + unsigned long __upper, __low, __high, __mod, __base; \ + __base = (base); \ + asm("" : "=a" (__low), "=d" (__high) : "A" (n)); \ + __upper = __high; \ + if (__high) { \ + __upper = __high % (__base); \ + __high = __high / (__base); \ + } \ + asm("divl %2" : "=a" (__low), "=d" (__mod) \ + : "rm" (__base), "0" (__low), "1" (__upper)); \ + asm("" : "=A" (n) : "a" (__low), "d" (__high)); \ + __mod; \ + }) +/* Compute the TSC frequency in kHz from a pvclock record: (1000000 << 32) / tsc_to_system_mul, then shifted left by -tsc_shift when tsc_shift is negative, otherwise right by tsc_shift. The caller must ensure tsc_to_system_mul is non-zero. */ +static u64 pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) +{ + u64 pv_tsc_khz = 1000000ULL << 32; + + do_div(pv_tsc_khz, src->tsc_to_system_mul); /* quotient is stored back into pv_tsc_khz, remainder is ignored */ + if (src->tsc_shift < 0) + pv_tsc_khz <<= -src->tsc_shift; + else + pv_tsc_khz >>= src->tsc_shift; + return pv_tsc_khz; +} +/* Read the TSC frequency in kHz from kvmclock. Returns 0 when neither clocksource feature bit is set in the KVM feature cpuid leaf, or when the hypervisor did not fill in a usable record. The caller is expected to have checked kvm_para_available() first. */ +u64 kvm_tsc_khz(void) +{ + u32 eax, ebx, ecx, edx, msr; + struct pvclock_vcpu_time_info time; + u32 addr = (u32)(&time); + u64 khz; + + /* check presence and figure msr number; KVM_FEATURE_* values are bit numbers within eax, not masks, so shift before testing */ + cpuid(KVM_CPUID_FEATURES, &eax, &ebx, &ecx, &edx); + if (eax & (1 << KVM_FEATURE_CLOCKSOURCE2)) { + msr = MSR_KVM_SYSTEM_TIME_NEW; + } else if (eax & (1 << KVM_FEATURE_CLOCKSOURCE)) { + msr = MSR_KVM_SYSTEM_TIME; + } else { + return 0; + } + + /* ask kvm hypervisor to fill struct: write its address with bit 0 set, then write 0 again before the stack buffer goes out of scope */ + memset(&time, 0, sizeof(time)); + wrmsr(msr, addr | 1); + wrmsr(msr, 0); + if (time.version < 2 || time.tsc_to_system_mul == 0) /* still zeroed: nothing usable was written, and a zero multiplier would divide by zero below */ + return 0; + + /* go figure tsc frequency */ + khz = pvclock_tsc_khz(&time); + dprintf(1, "Using kvmclock, msr 0x%x, tsc %d MHz\n", + msr, (u32)khz / 1000); + return khz; +} diff --git a/src/paravirt.h b/src/paravirt.h index
a284c41..eedfcc3 100644 --- a/src/paravirt.h +++ b/src/paravirt.h @@ -27,6 +27,7 @@ static inline int kvm_para_available(void) return 0; } +extern u64 kvm_tsc_khz(void); #define QEMU_CFG_SIGNATURE 0x00 #define QEMU_CFG_ID 0x01 -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html