Hi Joe,

On 9/26/23 17:29, Joe Jin wrote:
> On 9/26/23 4:06 PM, Dongli Zhang wrote:
>> This is to minimize the kvmclock drift during CPU hotplug (or when the
>> master clock and pvclock_vcpu_time_info are updated). The drift is
>> because kvmclock and raw monotonic (tsc) use different
>> equation/mult/shift to calculate how many nanoseconds (given the tsc
>> as input) have passed.
>>
>> The calculation of the kvmclock is based on the pvclock_vcpu_time_info
>> provided by the host side.
>>
>> struct pvclock_vcpu_time_info {
>>         u32   version;
>>         u32   pad0;
>>         u64   tsc_timestamp;      --> by host raw monotonic
>>         u64   system_time;        --> by host raw monotonic
>>         u32   tsc_to_system_mul;  --> by host KVM
>>         s8    tsc_shift;          --> by host KVM
>>         u8    flags;
>>         u8    pad[2];
>> } __attribute__((__packed__));
>>
>> To calculate the current guest kvmclock:
>>
>> 1. Obtain the guest tsc = rdtsc().
>>
>> 2. If tsc_shift < 0:
>>        tmp = tsc >> -tsc_shift
>>    If tsc_shift >= 0:
>>        tmp = tsc << tsc_shift
>>
>> 3. The kvmclock value will be: (tmp * tsc_to_system_mul) >> 32
>>
>> Therefore, the current kvmclock will be either:
>>
>> (rdtsc() >> -tsc_shift) * tsc_to_system_mul >> 32
>>
>> ... or ...
>>
>> (rdtsc() << tsc_shift) * tsc_to_system_mul >> 32
>>
>> The 'tsc_to_system_mul' and 'tsc_shift' are calculated by the host KVM.
>>
>> When the master clock is actively used, the 'tsc_timestamp' and
>> 'system_time' are derived from the host raw monotonic time, which is
>> calculated based on the 'mult' and 'shift' of clocksource_tsc:
>>
>> elapsed_time = (tsc * mult) >> shift
>>
>> Since kvmclock and raw monotonic (clocksource_tsc) use different
>> equations/mult/shift to convert the tsc to nanoseconds, there may be a
>> clock drift issue during CPU hotplug (when the master clock is updated).
>>
>> 1. The guest boots and all vcpus have the same 'pvclock_vcpu_time_info'
>>    (suppose the master clock is used).
>>
>> 2. Since the master clock is never updated, the periodic kvmclock_sync_work
>>    does not update the values in 'pvclock_vcpu_time_info'.
>>
>> 3. Suppose a very long period has passed (e.g., 30 days).
>>
>> 4. The user adds another vcpu. Both the master clock and
>>    'pvclock_vcpu_time_info' are updated, based on the raw monotonic.
>>
>>    (Ideally, we expect the update to be based on 'tsc_to_system_mul' and
>>    'tsc_shift' from kvmclock.)
>>
>> Because kvmclock and raw monotonic (clocksource_tsc) use different
>> equations/mult/shift to convert the tsc to nanoseconds, there will be
>> drift between:
>>
>> (1) kvmclock based on the current rdtsc and the old 'pvclock_vcpu_time_info'
>> (2) kvmclock based on the current rdtsc and the new 'pvclock_vcpu_time_info'
>>
>> According to the test, there is a drift of 4502145ns between (1) and (2)
>> after about 40 hours. The longer the time, the larger the drift.
>>
>> This adds a module param to allow the user to configure how often to
>> refresh the master clock, in order to reduce the kvmclock drift based on
>> the user's requirement (e.g., from every 5 minutes to every day). The more
>> often the master clock is refreshed, the smaller the kvmclock drift during
>> vcpu hotplug.
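As a side note on the math above, the following is a small, self-contained
userspace sketch, only to illustrate how two fixed-point approximations of the
same tsc frequency diverge over a long interval. The function names and the
mult/shift values are made up (roughly a 2.5 GHz tsc); they are not taken from
KVM or from clocksource_tsc.

/*
 * Standalone sketch only: compares the kvmclock-style and the
 * clocksource_tsc-style tsc->ns conversions described above.
 * The mult/shift values are made up (roughly a 2.5 GHz tsc).
 */
#include <stdint.h>
#include <stdio.h>

/* kvmclock style: shift the tsc by tsc_shift, then (tmp * mul) >> 32 */
static uint64_t kvmclock_style_ns(uint64_t tsc, uint32_t tsc_to_system_mul,
                                  int8_t tsc_shift)
{
        if (tsc_shift < 0)
                tsc >>= -tsc_shift;
        else
                tsc <<= tsc_shift;
        return (uint64_t)(((__uint128_t)tsc * tsc_to_system_mul) >> 32);
}

/* clocksource_tsc style: (tsc * mult) >> shift */
static uint64_t raw_monotonic_style_ns(uint64_t tsc, uint32_t mult,
                                       uint32_t shift)
{
        return (uint64_t)(((__uint128_t)tsc * mult) >> shift);
}

int main(void)
{
        /* ~40 hours worth of cycles at 2.5 GHz */
        uint64_t tsc = 2500000000ULL * 3600 * 40;
        /* both pairs approximate 0.4 ns/cycle, but round differently */
        uint64_t a = kvmclock_style_ns(tsc, 3435973837U, -1);
        uint64_t b = raw_monotonic_style_ns(tsc, 6710886U, 24);

        printf("kvmclock-style: %llu ns, raw-monotonic-style: %llu ns, diff: %lld ns\n",
               (unsigned long long)a, (unsigned long long)b,
               (long long)(a - b));
        return 0;
}

The exact numbers do not matter; the point is that the difference between the
two roundings grows linearly with the elapsed tsc, which is why the drift gets
worse the longer the master clock goes without a refresh.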
>>
>> Cc: Joe Jin <joe.jin@xxxxxxxxxx>
>> Signed-off-by: Dongli Zhang <dongli.zhang@xxxxxxxxxx>
>> ---
>> Other options are to update the masterclock in:
>> - kvmclock_sync_work, or
>> - pvclock_gtod_notify()
>>
>>  arch/x86/include/asm/kvm_host.h |  1 +
>>  arch/x86/kvm/x86.c              | 34 +++++++++++++++++++++++++++++++++
>>  2 files changed, 35 insertions(+)
>>
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index 17715cb8731d..57409dce5d73 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -1331,6 +1331,7 @@ struct kvm_arch {
>>  	u64 master_cycle_now;
>>  	struct delayed_work kvmclock_update_work;
>>  	struct delayed_work kvmclock_sync_work;
>> +	struct delayed_work masterclock_sync_work;
>>
>>  	struct kvm_xen_hvm_config xen_hvm_config;
>>
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 9f18b06bbda6..0b71dc3785eb 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -157,6 +157,9 @@ module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
>>  static bool __read_mostly kvmclock_periodic_sync = true;
>>  module_param(kvmclock_periodic_sync, bool, S_IRUGO);
>>
>> +unsigned int __read_mostly masterclock_sync_period;
>> +module_param(masterclock_sync_period, uint, 0444);
>
> Can the mode be 0644, to allow it to be changed at runtime?

It can be RW. So far I have just copied from kvmclock_periodic_sync, as most
of the code follows the mechanism of kvmclock_periodic_sync.

static bool __read_mostly kvmclock_periodic_sync = true;
module_param(kvmclock_periodic_sync, bool, S_IRUGO);

Thank you very much!

Dongli Zhang

>
> Thanks,
> Joe

>> +
>>  /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
>>  static u32 __read_mostly tsc_tolerance_ppm = 250;
>>  module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
>> @@ -3298,6 +3301,31 @@ static void kvmclock_sync_fn(struct work_struct *work)
>>  					KVMCLOCK_SYNC_PERIOD);
>>  }
>>
>> +static void masterclock_sync_fn(struct work_struct *work)
>> +{
>> +	unsigned long i;
>> +	struct delayed_work *dwork = to_delayed_work(work);
>> +	struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
>> +					   masterclock_sync_work);
>> +	struct kvm *kvm = container_of(ka, struct kvm, arch);
>> +	struct kvm_vcpu *vcpu;
>> +
>> +	if (!masterclock_sync_period)
>> +		return;
>> +
>> +	kvm_for_each_vcpu(i, vcpu, kvm) {
>> +		/*
>> +		 * It is not required to kick the vcpu because it is not
>> +		 * expected to update the master clock immediately.
>> +		 */
>> +		kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
>> +	}
>> +
>> +	schedule_delayed_work(&ka->masterclock_sync_work,
>> +			      masterclock_sync_period * HZ);
>> +}
>> +
>> +
>>  /* These helpers are safe iff @msr is known to be an MCx bank MSR. */
>>  static bool is_mci_control_msr(u32 msr)
>>  {
>> @@ -11970,6 +11998,10 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
>>  	if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0)
>>  		schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
>>  						KVMCLOCK_SYNC_PERIOD);
>> +
>> +	if (masterclock_sync_period && vcpu->vcpu_idx == 0)
>> +		schedule_delayed_work(&kvm->arch.masterclock_sync_work,
>> +				      masterclock_sync_period * HZ);
>>  }
>>
>>  void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
>> @@ -12344,6 +12376,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
>>
>>  	INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
>>  	INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
>> +	INIT_DELAYED_WORK(&kvm->arch.masterclock_sync_work, masterclock_sync_fn);
>>
>>  	kvm_apicv_init(kvm);
>>  	kvm_hv_init_vm(kvm);
>> @@ -12383,6 +12416,7 @@ static void kvm_unload_vcpu_mmus(struct kvm *kvm)
>>
>>  void kvm_arch_sync_events(struct kvm *kvm)
>>  {
>> +	cancel_delayed_work_sync(&kvm->arch.masterclock_sync_work);
>>  	cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
>>  	cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
>>  	kvm_free_pit(kvm);
>
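For completeness, below is a simplified, userspace-style sketch of the usual
pvclock read sequence on the guest side. It is not the actual guest code: the
struct layout is the one quoted in the commit message above, the real
implementation uses proper barriers rather than the compiler barriers below,
and the values in main() are made up. It only shows why rewriting
tsc_timestamp/system_time/tsc_to_system_mul/tsc_shift on a masterclock refresh
changes the value the guest computes from then on.

/*
 * Simplified sketch of the usual pvclock read sequence: retry while the
 * version is odd (update in progress) or changes under us, then apply
 * the scale described in the commit message.
 */
#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>          /* __rdtsc() */

struct pvclock_vcpu_time_info {
        uint32_t version;
        uint32_t pad0;
        uint64_t tsc_timestamp;
        uint64_t system_time;
        uint32_t tsc_to_system_mul;
        int8_t   tsc_shift;
        uint8_t  flags;
        uint8_t  pad[2];
} __attribute__((__packed__));

static uint64_t pvclock_read_ns(const volatile struct pvclock_vcpu_time_info *src)
{
        uint32_t version;
        uint64_t delta, ns;

        do {
                version = src->version;
                __asm__ __volatile__("" ::: "memory");  /* read version first */

                delta = __rdtsc() - src->tsc_timestamp;
                if (src->tsc_shift < 0)
                        delta >>= -src->tsc_shift;
                else
                        delta <<= src->tsc_shift;
                ns = src->system_time +
                     (uint64_t)(((__uint128_t)delta * src->tsc_to_system_mul) >> 32);

                __asm__ __volatile__("" ::: "memory");  /* re-check version last */
        } while ((version & 1) || version != src->version);

        return ns;
}

int main(void)
{
        /* made-up snapshot, as if just refreshed by the host */
        struct pvclock_vcpu_time_info ti = {
                .version = 2,                           /* even: not being updated */
                .tsc_timestamp = __rdtsc(),
                .system_time = 0,
                .tsc_to_system_mul = 3435973837U,       /* ~0.4 ns/cycle */
                .tsc_shift = -1,
        };

        printf("guest kvmclock now: %llu ns since snapshot\n",
               (unsigned long long)pvclock_read_ns(&ti));
        return 0;
}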