On Thu, Jul 29, 2021, Oliver Upton wrote:
> To date, VMM-directed TSC synchronization and migration has been a bit
> messy. KVM has some baked-in heuristics around TSC writes to infer if
> the VMM is attempting to synchronize. This is problematic, as it depends
> on host userspace writing to the guest's TSC within 1 second of the last
> write.
>
> A much cleaner approach to configuring the guest's views of the TSC is to
> simply migrate the TSC offset for every vCPU. Offsets are idempotent,
> and thus not subject to change depending on when the VMM actually
> reads/writes values from/to KVM. The VMM can then read the TSC once with
> KVM_GET_CLOCK to capture a (realtime, host_tsc) pair at the instant when
> the guest is paused.
>
> Cc: David Matlack <dmatlack@xxxxxxxxxx>
> Signed-off-by: Oliver Upton <oupton@xxxxxxxxxx>
> ---
>  Documentation/virt/kvm/devices/vcpu.rst |  57 ++++++++
>  arch/x86/include/asm/kvm_host.h         |   1 +
>  arch/x86/include/uapi/asm/kvm.h         |   4 +
>  arch/x86/kvm/x86.c                      | 167 ++++++++++++++++++++++++
>  4 files changed, 229 insertions(+)
>
> diff --git a/Documentation/virt/kvm/devices/vcpu.rst b/Documentation/virt/kvm/devices/vcpu.rst
> index 2acec3b9ef65..0f46f2588905 100644
> --- a/Documentation/virt/kvm/devices/vcpu.rst
> +++ b/Documentation/virt/kvm/devices/vcpu.rst
> @@ -161,3 +161,60 @@ Specifies the base address of the stolen time structure for this VCPU. The
>  base address must be 64 byte aligned and exist within a valid guest memory
>  region. See Documentation/virt/kvm/arm/pvtime.rst for more information
>  including the layout of the stolen time structure.
> +
> +4. GROUP: KVM_VCPU_TSC_CTRL
> +===========================
> +
> +:Architectures: x86
> +
> +4.1 ATTRIBUTE: KVM_VCPU_TSC_OFFSET
> +
> +:Parameters: 64-bit unsigned TSC offset
> +
> +Returns:
> +
> +	 ======= ======================================
> +	 -EFAULT Error reading/writing the provided
> +		 parameter address.

There's a rogue space in here (git show highlights it).

> +	 -ENXIO  Attribute not supported
> +	 ======= ======================================
> +
> +Specifies the guest's TSC offset relative to the host's TSC. The guest's
> +TSC is then derived by the following equation:
> +
> +  guest_tsc = host_tsc + KVM_VCPU_TSC_OFFSET
> +
> +This attribute is useful for the precise migration of a guest's TSC. The
> +following describes a possible algorithm to use for the migration of a
> +guest's TSC:
> +
> +From the source VMM process:
> +
> +1. Invoke the KVM_GET_CLOCK ioctl to record the host TSC (t_0),
> +   kvmclock nanoseconds (k_0), and realtime nanoseconds (r_0).
> +
> +2. Read the KVM_VCPU_TSC_OFFSET attribute for every vCPU to record the
> +   guest TSC offset (off_n).
> +
> +3. Invoke the KVM_GET_TSC_KHZ ioctl to record the frequency of the
> +   guest's TSC (freq).
> +
> +From the destination VMM process:
> +
> +4. Invoke the KVM_SET_CLOCK ioctl, providing the kvmclock nanoseconds
> +   (k_0) and realtime nanoseconds (r_0) in their respective fields.
> +   Ensure that the KVM_CLOCK_REALTIME flag is set in the provided
> +   structure. KVM will advance the VM's kvmclock to account for elapsed
> +   time since recording the clock values.
> +
> +5. Invoke the KVM_GET_CLOCK ioctl to record the host TSC (t_1) and
> +   kvmclock nanoseconds (k_1).
> +
> +6. Adjust the guest TSC offsets for every vCPU to account for (1) time
> +   elapsed since recording state and (2) difference in TSCs between the
> +   source and destination machine:
> +
> +   new_off_n = t_0 + off_n + (k_1 - k_0) * freq - t_1
> +
> +7. Write the KVM_VCPU_TSC_OFFSET attribute for every vCPU with the
> +   respective value derived in the previous step.
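Somewhat tangential: it might be worth giving readers a concrete userspace
sequence for the destination side, since the unit handling in step 6 is easy
to fumble (freq is in kHz, the clock deltas in nanoseconds). Rough, untested
sketch below; it assumes the extended kvm_clock_data (realtime/host_tsc
fields) from earlier in the series, and every name other than the
ioctls/attributes is made up:

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/*
	 * Destination-side fixup (steps 4-7).  t0/k0/r0 are the host TSC,
	 * kvmclock ns, and realtime ns captured on the source via
	 * KVM_GET_CLOCK, freq_khz comes from KVM_GET_TSC_KHZ, and src_off[]
	 * holds the per-vCPU offsets read via KVM_GET_DEVICE_ATTR.
	 */
	static int restore_guest_tsc(int vm_fd, const int *vcpu_fds,
				     int nr_vcpus, uint64_t t0, uint64_t k0,
				     uint64_t r0, uint64_t freq_khz,
				     const uint64_t *src_off)
	{
		struct kvm_clock_data data = {
			.clock    = k0,
			.realtime = r0,
			.flags    = KVM_CLOCK_REALTIME,
		};
		uint64_t t1, k1;
		int i;

		/* Step 4: KVM advances kvmclock by the realtime delta. */
		if (ioctl(vm_fd, KVM_SET_CLOCK, &data))
			return -1;

		/* Step 5: grab a (host TSC, kvmclock) pair on the dest. */
		if (ioctl(vm_fd, KVM_GET_CLOCK, &data))
			return -1;
		t1 = data.host_tsc;
		k1 = data.clock;

		for (i = 0; i < nr_vcpus; i++) {
			/*
			 * Step 6, with the kHz -> ticks conversion that the
			 * doc's "* freq" glosses over: ns * kHz / 10^6 ticks.
			 */
			uint64_t off = t0 + src_off[i] +
				       (k1 - k0) * freq_khz / 1000000 - t1;
			struct kvm_device_attr attr = {
				.group = KVM_VCPU_TSC_CTRL,
				.attr  = KVM_VCPU_TSC_OFFSET,
				.addr  = (uint64_t)(uintptr_t)&off,
			};

			/* Step 7: write back the adjusted offset. */
			if (ioctl(vcpu_fds[i], KVM_SET_DEVICE_ATTR, &attr))
				return -1;
		}
		return 0;
	}

Note that the multiply can overflow u64 if the VM is blacked out for hours,
so a real VMM probably wants 128-bit math there. Not asking for any of this
in the docs, just flagging it.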
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 65a20c64f959..855698923dd0 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1070,6 +1070,7 @@ struct kvm_arch {
>  	u64 last_tsc_nsec;
>  	u64 last_tsc_write;
>  	u32 last_tsc_khz;
> +	u64 last_tsc_offset;
>  	u64 cur_tsc_nsec;
>  	u64 cur_tsc_write;
>  	u64 cur_tsc_offset;
> diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
> index a6c327f8ad9e..0b22e1e84e78 100644
> --- a/arch/x86/include/uapi/asm/kvm.h
> +++ b/arch/x86/include/uapi/asm/kvm.h
> @@ -503,4 +503,8 @@ struct kvm_pmu_event_filter {
>  #define KVM_PMU_EVENT_ALLOW 0
>  #define KVM_PMU_EVENT_DENY 1
>
> +/* for KVM_{GET,SET,HAS}_DEVICE_ATTR */
> +#define KVM_VCPU_TSC_CTRL	0 /* control group for the timestamp counter (TSC) */
> +#define KVM_VCPU_TSC_OFFSET	0 /* attribute for the TSC offset */
> +
>  #endif /* _ASM_X86_KVM_H */
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 27435a07fb46..17d87a8d0c75 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2413,6 +2413,11 @@ static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset)
>  	static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset);
>  }
>
> +static u64 kvm_vcpu_read_tsc_offset(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.l1_tsc_offset;
> +}

This is not a helpful, uh, helper. Based on the name, I would expect it to
read the TSC_OFFSET in the context of L1 vs. L2. Assuming you really want
L1's offset, just read vcpu->arch.l1_tsc_offset directly. That will obviate
the "need" for a local offset (see below).

> +
>  static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier)
>  {
>  	vcpu->arch.l1_tsc_scaling_ratio = l1_multiplier;
> @@ -2469,6 +2474,7 @@ static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc,
>  	kvm->arch.last_tsc_nsec = ns;
>  	kvm->arch.last_tsc_write = tsc;
>  	kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
> +	kvm->arch.last_tsc_offset = offset;
>
>  	vcpu->arch.last_guest_tsc = tsc;
>
> @@ -4928,6 +4934,137 @@ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
>  	return 0;
>  }
>
> +static int kvm_arch_tsc_has_attr(struct kvm_vcpu *vcpu,
> +				 struct kvm_device_attr *attr)
> +{
> +	int r;
> +
> +	switch (attr->attr) {
> +	case KVM_VCPU_TSC_OFFSET:
> +		r = 0;
> +		break;
> +	default:
> +		r = -ENXIO;
> +	}
> +
> +	return r;
> +}
> +
> +static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu,
> +				 struct kvm_device_attr *attr)
> +{
> +	void __user *uaddr = (void __user *)attr->addr;
> +	int r;
> +
> +	switch (attr->attr) {
> +	case KVM_VCPU_TSC_OFFSET: {
> +		u64 offset;
> +
> +		offset = kvm_vcpu_read_tsc_offset(vcpu);
> +		r = -EFAULT;
> +		if (copy_to_user(uaddr, &offset, sizeof(offset)))
> +			break;

Use put_user(), then this all becomes short and sweet without curly braces:

	case KVM_VCPU_TSC_OFFSET:
		r = -EFAULT;
		if (put_user(vcpu->arch.l1_tsc_offset, uaddr))
			break;
		r = 0;
		break;
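While we're here, the userspace read side of this attribute (step 2 of the
documented flow) is pleasantly dumb. Purely illustrative, with a made-up
helper name and error handling left to the caller:

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Read a vCPU's L1 TSC offset via KVM_GET_DEVICE_ATTR. */
	static int get_tsc_offset(int vcpu_fd, uint64_t *offset)
	{
		struct kvm_device_attr attr = {
			.group = KVM_VCPU_TSC_CTRL,
			.attr  = KVM_VCPU_TSC_OFFSET,
			.addr  = (uint64_t)(uintptr_t)offset,
		};

		return ioctl(vcpu_fd, KVM_GET_DEVICE_ATTR, &attr);
	}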
> +
> +		r = 0;
> +		break;
> +	}
> +	default:
> +		r = -ENXIO;
> +	}
> +
> +	return r;
> +}
> +
> +static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
> +				 struct kvm_device_attr *attr)
> +{
> +	void __user *uaddr = (void __user *)attr->addr;
> +	struct kvm *kvm = vcpu->kvm;
> +	int r;
> +
> +	switch (attr->attr) {
> +	case KVM_VCPU_TSC_OFFSET: {
> +		u64 offset, tsc, ns;
> +		unsigned long flags;
> +		bool matched;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&offset, uaddr, sizeof(offset)))

This can be get_user().

> +			break;
> +
> +		raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
> +
> +		matched = (vcpu->arch.virtual_tsc_khz &&
> +			   kvm->arch.last_tsc_khz == vcpu->arch.virtual_tsc_khz &&
> +			   kvm->arch.last_tsc_offset == offset);
> +
> +		tsc = kvm_scale_tsc(vcpu, rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
> +		ns = get_kvmclock_base_ns();
> +
> +		__kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched);
> +		raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
> +
> +		r = 0;
> +		break;
> +	}
> +	default:
> +		r = -ENXIO;
> +	}
> +
> +	return r;
> +}

...

>  static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
>  				     struct kvm_enable_cap *cap)
>  {
> @@ -5382,6 +5519,36 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>  		r = __set_sregs2(vcpu, u.sregs2);
>  		break;
>  	}
> +	case KVM_HAS_DEVICE_ATTR: {
> +		struct kvm_device_attr attr;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&attr, argp, sizeof(attr)))
> +			goto out;
> +
> +		r = kvm_vcpu_ioctl_has_device_attr(vcpu, &attr);
> +		break;
> +	}
> +	case KVM_GET_DEVICE_ATTR: {
> +		struct kvm_device_attr attr;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&attr, argp, sizeof(attr)))
> +			goto out;
> +
> +		r = kvm_vcpu_ioctl_get_device_attr(vcpu, &attr);
> +		break;
> +	}
> +	case KVM_SET_DEVICE_ATTR: {
> +		struct kvm_device_attr attr;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&attr, argp, sizeof(attr)))
> +			goto out;
> +
> +		r = kvm_vcpu_ioctl_set_device_attr(vcpu, &attr);
> +		break;

That's a lot of copy-paste code, and I omitted a big chunk, too. What about
something like:

	static int kvm_vcpu_ioctl_do_device_attr(struct kvm_vcpu *vcpu,
						 unsigned int ioctl,
						 void __user *argp)
	{
		struct kvm_device_attr attr;
		int r;

		if (copy_from_user(&attr, argp, sizeof(attr)))
			return -EFAULT;

		if (attr.group != KVM_VCPU_TSC_CTRL)
			return -ENXIO;

		switch (ioctl) {
		case KVM_HAS_DEVICE_ATTR:
			r = kvm_arch_tsc_has_attr(vcpu, &attr);
			break;
		case KVM_GET_DEVICE_ATTR:
			r = kvm_arch_tsc_get_attr(vcpu, &attr);
			break;
		case KVM_SET_DEVICE_ATTR:
			r = kvm_arch_tsc_set_attr(vcpu, &attr);
			break;
		default:
			BUG();
		}
		return r;
	}

and then:

	case KVM_GET_DEVICE_ATTR:
	case KVM_HAS_DEVICE_ATTR:
	case KVM_SET_DEVICE_ATTR:
		r = kvm_vcpu_ioctl_do_device_attr(vcpu, ioctl, argp);
		break;

or if we want to plan for the future a bit more:

	static int kvm_vcpu_ioctl_do_device_attr(struct kvm_vcpu *vcpu,
						 unsigned int ioctl,
						 void __user *argp)
	{
		struct kvm_device_attr attr;
		int r;

		if (copy_from_user(&attr, argp, sizeof(attr)))
			return -EFAULT;

		r = -ENXIO;

		switch (ioctl) {
		case KVM_HAS_DEVICE_ATTR:
			if (attr.group == KVM_VCPU_TSC_CTRL)
				r = kvm_arch_tsc_has_attr(vcpu, &attr);
			break;
		case KVM_GET_DEVICE_ATTR:
			if (attr.group == KVM_VCPU_TSC_CTRL)
				r = kvm_arch_tsc_get_attr(vcpu, &attr);
			break;
		case KVM_SET_DEVICE_ATTR:
			if (attr.group == KVM_VCPU_TSC_CTRL)
				r = kvm_arch_tsc_set_attr(vcpu, &attr);
			break;
		}
		return r;
	}

> +	}
>  	default:
>  		r = -EINVAL;
>  	}
> --
> 2.32.0.432.gabb21c7263-goog
>