sched_clock() should time the vcpu run time.

Subtract stolen time from realtime pvclock.

Signed-off-by: Marcelo Tosatti <mtosatti@xxxxxxxxxx>

Index: kvm/arch/x86/kernel/kvmclock.c
===================================================================
--- kvm.orig/arch/x86/kernel/kvmclock.c
+++ kvm/arch/x86/kernel/kvmclock.c
@@ -38,7 +38,16 @@ static int parse_no_kvmclock(char *arg)
 early_param("no-kvmclock", parse_no_kvmclock);
 
 /* The hypervisor will put information about time periodically here */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock);
+struct time_info {
+	struct pvclock_vcpu_time_info hv_clock;
+	struct kvm_vcpu_runtime_info run_info;
+};
+
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct time_info, time_info);
+
+#define hv_clock time_info.hv_clock
+#define run_info time_info.run_info
+
 static struct pvclock_wall_clock wall_clock;
 
 /*
@@ -84,6 +93,40 @@ static cycle_t kvm_clock_get_cycles(stru
 	return kvm_clock_read();
 }
 
+cycle_t kvm_runtime_read(struct pvclock_vcpu_time_info *src,
+			 struct kvm_vcpu_runtime_info *rinfo)
+{
+	struct pvclock_shadow_time shadow;
+	unsigned version;
+	cycle_t ret, offset;
+	unsigned long long stolen;
+
+	do {
+		version = pvclock_get_time_values(&shadow, src);
+		barrier();
+		offset = pvclock_get_nsec_offset(&shadow);
+		stolen = rinfo->stolen_time;
+		ret = shadow.system_timestamp + offset - stolen;
+		barrier();
+	} while (version != src->version);
+
+	return ret;
+}
+
+static cycle_t kvm_clock_read_unstolen(void)
+{
+	struct pvclock_vcpu_time_info *src;
+	struct kvm_vcpu_runtime_info *rinfo;
+	cycle_t ret;
+
+	src = &get_cpu_var(hv_clock);
+	rinfo = &get_cpu_var(run_info);
+	ret = kvm_runtime_read(src, rinfo);
+	put_cpu_var(run_info);
+	put_cpu_var(hv_clock);
+	return ret;
+}
+
 /*
  * If we don't do that, there is the possibility that the guest
  * will calibrate under heavy load - thus, getting a lower lpj -
@@ -133,14 +176,30 @@ static int kvm_register_clock(char *txt)
 	return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);
 }
 
+static int kvm_register_run_info(char *txt)
+{
+	int cpu = smp_processor_id();
+	int low, high;
+
+	low = (int) __pa(&per_cpu(run_info, cpu)) | 1;
+	high = ((u64)__pa(&per_cpu(run_info, cpu)) >> 32);
+	printk(KERN_INFO "kvm-runtime-info: cpu %d, msr %x:%x, %s\n",
+	       cpu, high, low, txt);
+	return native_write_msr_safe(MSR_KVM_RUN_TIME, low, high);
+}
+
 #ifdef CONFIG_X86_LOCAL_APIC
 static void __cpuinit kvm_setup_secondary_clock(void)
 {
+	char *txt = "secondary cpu clock";
+
 	/*
 	 * Now that the first cpu already had this clocksource initialized,
 	 * we shouldn't fail.
 	 */
-	WARN_ON(kvm_register_clock("secondary cpu clock"));
+	WARN_ON(kvm_register_clock(txt));
+	if (kvm_para_has_feature(KVM_FEATURE_RUNTIME_INFO))
+		kvm_register_run_info(txt);
 	/* ok, done with our trickery, call native */
 	setup_secondary_APIC_clock();
 }
@@ -149,7 +208,11 @@ static void __cpuinit kvm_setup_secondar
 #ifdef CONFIG_SMP
 static void __init kvm_smp_prepare_boot_cpu(void)
 {
-	WARN_ON(kvm_register_clock("primary cpu clock"));
+	char *txt = "primary cpu clock";
+
+	WARN_ON(kvm_register_clock(txt));
+	if (kvm_para_has_feature(KVM_FEATURE_RUNTIME_INFO))
+		kvm_register_run_info(txt);
 	native_smp_prepare_boot_cpu();
 }
 #endif
@@ -204,4 +267,6 @@ void __init kvmclock_init(void)
 		pv_info.paravirt_enabled = 1;
 		pv_info.name = "KVM";
 	}
+	if (kvm_para_has_feature(KVM_FEATURE_RUNTIME_INFO))
+		pv_time_ops.sched_clock = kvm_clock_read_unstolen;
 }
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html