[RFC PATCH v3] ptp: Add vDSO-style vmclock support

David Woodhouse <dwmw2@xxxxxxxxxxxxx> · Sat, 06 Jul 2024 16:14:39 +0100

From: David Woodhouse <dwmw@xxxxxxxxxxxx>

The vmclock "device" provides a shared memory region with precision clock
information. By using shared memory, it is safe across Live Migration.

Like the KVM PTP clock, this can convert TSC-based cross timestamps into
KVM clock values. Unlike the KVM PTP clock, it does so only when such is
actually helpful.

The memory region of the device is also exposed to userspace so it can be
read or memory mapped by applications which need reliable notification of
clock disruptions.

The shared memory structure is intended to be adopted into the nascent
virtio-rtc specification (since one might consider a virtio-rtc
specification that doesn't fix the live migration problem to be not fit
for purpose). It can also be presented via a simple ACPI device.

Signed-off-by: David Woodhouse <dwmw@xxxxxxxxxxxx>
---
QEMU implementation at 
https://git.infradead.org/users/dwmw2/qemu.git/shortlog/refs/heads/vmclock

Remaining questions/TODO for virtio-rtc adoption:
 • Use of signed integer for tai_offset field
 • Explicit little-endianness
 • Is picoseconds the right unit for absolute error (I was going to make
   this (seconds>>64) but that actually reduces the *range* that can be
   expressed).
 • Are the clock_status values sensible?

v3:
 • Add esterror fields
 • Reduce seq_count to 32 bits
 • Expand size to permit 64KiB pages
 • Align with virtio-rtc fields, values and leap handling
 • Drop gettime() method (since we have gettimex())
 • Add leap second smearing hint
 • Use a real _CRS on the ACPI device

v2: 
 • Add gettimex64() support
 • Convert TSC values to KVM clock when appropriate
 • Require int128 support
 • Add counter_period_shift
 • Add timeout when seq_count is invalid
 • Add flags field
 • Better comments in vmclock ABI structure
 • Explicitly forbid smearing (as clock rates would need to change)

 drivers/ptp/Kconfig          |  13 +
 drivers/ptp/Makefile         |   1 +
 drivers/ptp/ptp_vmclock.c    | 516 +++++++++++++++++++++++++++++++++++
 include/uapi/linux/vmclock.h | 138 ++++++++++
 4 files changed, 668 insertions(+)
 create mode 100644 drivers/ptp/ptp_vmclock.c
 create mode 100644 include/uapi/linux/vmclock.h

diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig
index 604541dcb320..e98c9767e0ef 100644
--- a/drivers/ptp/Kconfig
+++ b/drivers/ptp/Kconfig
@@ -131,6 +131,19 @@ config PTP_1588_CLOCK_KVM
 	  To compile this driver as a module, choose M here: the module
 	  will be called ptp_kvm.
 
+config PTP_1588_CLOCK_VMCLOCK
+	tristate "Virtual machine PTP clock"
+	depends on X86_TSC || ARM_ARCH_TIMER
+	depends on PTP_1588_CLOCK && ACPI && ARCH_SUPPORTS_INT128
+	default y
+	help
+	  This driver adds support for using a virtual precision clock
+	  advertised by the hypervisor. This clock is only useful in virtual
+	  machines where such a device is present.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called ptp_vmclock.
+
 config PTP_1588_CLOCK_IDT82P33
 	tristate "IDT 82P33xxx PTP clock"
 	depends on PTP_1588_CLOCK && I2C
diff --git a/drivers/ptp/Makefile b/drivers/ptp/Makefile
index 68bf02078053..01b5cd91eb61 100644
--- a/drivers/ptp/Makefile
+++ b/drivers/ptp/Makefile
@@ -11,6 +11,7 @@ obj-$(CONFIG_PTP_1588_CLOCK_DTE)	+= ptp_dte.o
 obj-$(CONFIG_PTP_1588_CLOCK_INES)	+= ptp_ines.o
 obj-$(CONFIG_PTP_1588_CLOCK_PCH)	+= ptp_pch.o
 obj-$(CONFIG_PTP_1588_CLOCK_KVM)	+= ptp_kvm.o
+obj-$(CONFIG_PTP_1588_CLOCK_VMCLOCK)	+= ptp_vmclock.o
 obj-$(CONFIG_PTP_1588_CLOCK_QORIQ)	+= ptp-qoriq.o
 ptp-qoriq-y				+= ptp_qoriq.o
 ptp-qoriq-$(CONFIG_DEBUG_FS)		+= ptp_qoriq_debugfs.o
diff --git a/drivers/ptp/ptp_vmclock.c b/drivers/ptp/ptp_vmclock.c
new file mode 100644
index 000000000000..e19c2eed8009
--- /dev/null
+++ b/drivers/ptp/ptp_vmclock.c
@@ -0,0 +1,516 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Virtual PTP 1588 clock for use with LM-safe VMclock device.
+ *
+ * Copyright © 2024 Amazon.com, Inc. or its affiliates.
+ */
+
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/miscdevice.h>
+#include <linux/acpi.h>
+#include <uapi/linux/vmclock.h>
+
+#include <linux/ptp_clock_kernel.h>
+
+#ifdef CONFIG_X86
+#include <asm/pvclock.h>
+#include <asm/kvmclock.h>
+#endif
+
+static DEFINE_IDA(vmclock_ida);
+
+ACPI_MODULE_NAME("vmclock");
+
+struct vmclock_state {
+	phys_addr_t phys_addr;			/* physical address of the shared memory */
+	struct vmclock_abi *clk;		/* kernel mapping of that memory */
+	struct miscdevice miscdev;		/* registered only if clk->size >= PAGE_SIZE */
+	struct ptp_clock_info ptp_clock_info;	/* per-device copy of ptp_vmclock_info */
+	struct ptp_clock *ptp_clock;		/* NULL unless a PTP clock is registered */
+	enum clocksource_ids cs_id, sys_cs_id;	/* counter's id / id used for cross-stamps */
+	int index;				/* instance number from vmclock_ida */
+	char *name;				/* "vmclock<index>" */
+};
+
+#define VMCLOCK_MAX_WAIT ms_to_ktime(100)
+
+/*
+ * Multiply a 64-bit count by a 64-bit tick 'period', shift the 128-bit
+ * product right by 'shift' and add the fractional second part of the
+ * reference time.
+ *
+ * The top 64 bits of the 128-bit result are whole seconds (stored in
+ * *res_hi); the low 64 bits, in units of (seconds >> 64), are returned.
+ * Needs unsigned __int128, hence the ARCH_SUPPORTS_INT128 dependency.
+ * NOTE(review): 'shift' must be < 128 or the shift is undefined; confirm.
+ */
+static inline uint64_t mul_u64_u64_shr_add_u64(uint64_t *res_hi, uint64_t delta,
+					       uint64_t period, uint8_t shift,
+					       uint64_t frac_sec)
+{
+	unsigned __int128 res = (unsigned __int128)delta * period;
+
+	res >>= shift;
+	res += frac_sec;
+	*res_hi = res >> 64;
+	return (uint64_t)res;
+}
+
+static int vmclock_get_crosststamp(struct vmclock_state *st,
+				   struct ptp_system_timestamp *sts,
+				   struct system_counterval_t *system_counter,
+				   struct timespec64 *tspec)
+{
+	ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT); /* bounds the retries */
+	struct system_time_snapshot systime_snapshot; /* only touched when sts != NULL */
+	uint64_t cycle, delta, seq, frac_sec;
+
+#ifdef CONFIG_X86
+	/*
+	 * We'd expect the hypervisor to know this and to report the clock
+	 * status as VMCLOCK_STATUS_UNRELIABLE. But be paranoid.
+	 */
+	if (check_tsc_unstable())
+		return -EINVAL;
+#endif /* CONFIG_X86 */
+
+	while (1) {
+		seq = st->clk->seq_count & ~1ULL; /* odd count can never match the recheck */
+		virt_rmb();
+
+		if (st->clk->clock_status == VMCLOCK_STATUS_UNRELIABLE)
+			return -EINVAL;
+
+		/*
+		 * When invoked for gettimex64(), fill in the pre/post system
+		 * times. The simple case is when system time is based on the
+		 * same counter as st->cs_id, in which case all three times
+		 * will be derived from the *same* counter value.
+		 *
+		 * If the system isn't using the same counter, then the value
+		 * from ktime_get_snapshot() will still be used as pre_ts, and
+		 * ptp_read_system_postts() is called to populate postts after
+		 * calling get_cycles().
+		 *
+		 * The conversion to timespec64 happens further down, outside
+		 * the seq_count loop.
+		 */
+		if (sts) {
+			ktime_get_snapshot(&systime_snapshot);
+			if (systime_snapshot.cs_id == st->cs_id) {
+				cycle = systime_snapshot.cycles;
+			} else {
+				cycle = get_cycles();
+				ptp_read_system_postts(sts);
+			}
+		} else
+			cycle = get_cycles();
+
+		delta = cycle - st->clk->counter_value; /* ticks since the reference pair */
+
+		frac_sec = mul_u64_u64_shr_add_u64(&tspec->tv_sec, delta,
+						   st->clk->counter_period_frac_sec,
+						   st->clk->counter_period_shift,
+						   st->clk->utc_time_frac_sec);
+		tspec->tv_nsec = mul_u64_u64_shr(frac_sec, NSEC_PER_SEC, 64);
+		tspec->tv_sec += st->clk->utc_time_sec;
+
+		virt_rmb();
+		if (seq == st->clk->seq_count) /* unchanged: the fields read form one snapshot */
+			break;
+
+		if (ktime_after(ktime_get(), deadline))
+			return -ETIMEDOUT;
+	}
+
+	if (system_counter) {
+		system_counter->cycles = cycle;
+		system_counter->cs_id = st->cs_id;
+	}
+
+	if (sts) {
+		sts->pre_ts = ktime_to_timespec64(systime_snapshot.real);
+		if (systime_snapshot.cs_id == st->cs_id)
+			sts->post_ts = sts->pre_ts;
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_X86
+/*
+ * When the system clocksource is the KVM clock, convert the TSC reading into
+ * a KVM clock value so that get_device_system_crosststamp() is handed a pair
+ * it can cope with. @system_counter must not be NULL; it is written through.
+ */
+static int vmclock_get_crosststamp_kvmclock(struct vmclock_state *st,
+					    struct ptp_system_timestamp *sts,
+					    struct system_counterval_t *system_counter,
+					    struct timespec64 *tspec)
+{
+	struct pvclock_vcpu_time_info *pvti;
+	unsigned int pvti_ver;
+	int ret;
+
+	preempt_disable_notrace();
+	pvti = this_cpu_pvti();	/* per-CPU: look it up only once we cannot migrate */
+	do {
+		pvti_ver = pvclock_read_begin(pvti);
+
+		ret = vmclock_get_crosststamp(st, sts, system_counter, tspec);
+		if (ret)
+			break;
+
+		system_counter->cycles = __pvclock_read_cycles(pvti,
+							       system_counter->cycles);
+		system_counter->cs_id = CSID_X86_KVM_CLK;
+
+		/*
+		 * This retry should never really happen; if the TSC is
+		 * stable and reliable enough across vCPUS that it is sane
+		 * for the hypervisor to expose a VMCLOCK device which uses
+		 * it as the reference counter, then the KVM clock should be
+		 * in 'master clock mode' and basically never changed. But
+		 * the KVM clock is a fickle and often broken thing, so do
+		 * it "properly" just in case.
+		 */
+	} while (pvclock_read_retry(pvti, pvti_ver));
+
+	preempt_enable_notrace();
+
+	return ret;
+}
+#endif
+
+static int ptp_vmclock_get_time_fn(ktime_t *device_time,
+				   struct system_counterval_t *system_counter,
+				   void *ctx)
+{
+	struct vmclock_state *st = ctx;	/* handed over by ptp_vmclock_getcrosststamp() */
+	struct timespec64 tspec;
+	int ret;
+
+#ifdef CONFIG_X86
+	if (READ_ONCE(st->sys_cs_id) == CSID_X86_KVM_CLK)
+		ret = vmclock_get_crosststamp_kvmclock(st, NULL, system_counter,
+						       &tspec);
+	else
+#endif /* when compiled in, the else above binds to the statement below */
+		ret = vmclock_get_crosststamp(st, NULL, system_counter, &tspec);
+
+	if (!ret)
+		*device_time = timespec64_to_ktime(tspec);	/* untouched on error */
+
+	return ret;
+}
+
+
+static int ptp_vmclock_getcrosststamp(struct ptp_clock_info *ptp,
+				      struct system_device_crosststamp *xtstamp)
+{
+	struct vmclock_state *st = container_of(ptp, struct vmclock_state,
+						ptp_clock_info);
+	int ret = get_device_system_crosststamp(ptp_vmclock_get_time_fn, st,
+						NULL, xtstamp);
+#ifdef CONFIG_X86
+	/*
+	 * On x86 the system clock may be the KVM clock (or the TSC) rather
+	 * than what sys_cs_id last recorded. On -ENODEV, refresh sys_cs_id
+	 * from a snapshot and retry so the callback returns a usable pair.
+	 */
+	if (ret == -ENODEV) {
+		struct system_time_snapshot systime_snapshot;
+		ktime_get_snapshot(&systime_snapshot);
+
+		if (systime_snapshot.cs_id == CSID_X86_TSC ||
+		    systime_snapshot.cs_id == CSID_X86_KVM_CLK) {
+			WRITE_ONCE(st->sys_cs_id, systime_snapshot.cs_id);
+			ret = get_device_system_crosststamp(ptp_vmclock_get_time_fn,
+							    st, NULL, xtstamp);
+		}
+	}
+#endif /* CONFIG_X86 */
+	return ret;
+}
+
+/*
+ * PTP clock operations; adjust, set and enable all fail with -EOPNOTSUPP.
+ */
+
+static int ptp_vmclock_adjfine(struct ptp_clock_info *ptp, long delta)
+{
+	return -EOPNOTSUPP;
+}
+
+static int ptp_vmclock_adjtime(struct ptp_clock_info *ptp, s64 delta)
+{
+	return -EOPNOTSUPP;
+}
+
+static int ptp_vmclock_settime(struct ptp_clock_info *ptp,
+			   const struct timespec64 *ts)
+{
+	return -EOPNOTSUPP;
+}
+
+static int ptp_vmclock_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts)
+{
+	struct vmclock_state *st = container_of(ptp, struct vmclock_state,
+						ptp_clock_info);
+
+	return vmclock_get_crosststamp(st, NULL, NULL, ts);	/* device time only */
+}
+
+static int ptp_vmclock_gettimex(struct ptp_clock_info *ptp, struct timespec64 *ts,
+				struct ptp_system_timestamp *sts)
+{
+	struct vmclock_state *st = container_of(ptp, struct vmclock_state,
+						ptp_clock_info);
+
+	return vmclock_get_crosststamp(st, sts, NULL, ts);	/* also fills pre/post ts */
+}
+
+static int ptp_vmclock_enable(struct ptp_clock_info *ptp,
+			  struct ptp_clock_request *rq, int on)
+{
+	return -EOPNOTSUPP;
+}
+
+static const struct ptp_clock_info ptp_vmclock_info = {	/* copied per device in probe */
+	.owner		= THIS_MODULE,
+	.max_adj	= 0,
+	.n_ext_ts	= 0,
+	.n_pins		= 0,
+	.pps		= 0,
+	.adjfine	= ptp_vmclock_adjfine,	/* stub: -EOPNOTSUPP */
+	.adjtime	= ptp_vmclock_adjtime,	/* stub: -EOPNOTSUPP */
+	.gettime64	= ptp_vmclock_gettime,
+	.gettimex64	= ptp_vmclock_gettimex,
+	.settime64	= ptp_vmclock_settime,	/* stub: -EOPNOTSUPP */
+	.enable		= ptp_vmclock_enable,	/* stub: -EOPNOTSUPP */
+	.getcrosststamp = ptp_vmclock_getcrosststamp,
+};
+
+static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma)
+{
+	struct vmclock_state *st = container_of(fp->private_data,
+						struct vmclock_state, miscdev);
+	unsigned long len = vma->vm_end - vma->vm_start;
+
+	/* Only a read-only mapping of exactly one page at offset 0 is allowed */
+	if (!(vma->vm_flags & VM_READ) || (vma->vm_flags & VM_WRITE))
+		return -EROFS;
+	if (len != PAGE_SIZE || vma->vm_pgoff)
+		return -EINVAL;
+
+	if (io_remap_pfn_range(vma, vma->vm_start, st->phys_addr >> PAGE_SHIFT,
+			       PAGE_SIZE, vma->vm_page_prot))
+		return -EAGAIN;
+
+	return 0;
+}
+
+/* read(): copy out the shared page, retried until seq_count is stable. */
+static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf,
+				    size_t count, loff_t *ppos)
+{
+	struct vmclock_state *st = container_of(fp->private_data,
+						struct vmclock_state, miscdev);
+	ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT);
+	uint64_t seq;	/* as wide as seq_count, so the recheck cannot truncate */
+
+	if (*ppos >= PAGE_SIZE)
+		return 0;
+
+	/* Never read past the end of the page. */
+	if (count > PAGE_SIZE - *ppos)
+		count = PAGE_SIZE - *ppos;
+
+	while (1) {
+		seq = st->clk->seq_count & ~1ULL;
+		virt_rmb();
+
+		if (copy_to_user(buf, ((char *)st->clk) + *ppos, count))
+			return -EFAULT;
+
+		virt_rmb();
+		if (seq == st->clk->seq_count)
+			break;
+
+		if (ktime_after(ktime_get(), deadline))
+			return -ETIMEDOUT;
+	}
+
+	*ppos += count;
+	return count;
+}
+
+static const struct file_operations vmclock_miscdev_fops = {
+	.owner = THIS_MODULE,	/* pins the module while the device is open */
+	.mmap = vmclock_miscdev_mmap,
+	.read = vmclock_miscdev_read,
+};
+
+/* module operations */
+static void vmclock_remove(struct platform_device *pdev)
+{
+	struct vmclock_state *st = dev_get_drvdata(&pdev->dev);
+
+	if (st->ptp_clock)
+		ptp_clock_unregister(st->ptp_clock);
+
+	/* misc_register() rewrites .minor; .fops is only set if we registered */
+	if (st->miscdev.fops)
+		misc_deregister(&st->miscdev);
+}
+
+static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st)
+{
+	struct acpi_buffer parsed = { ACPI_ALLOCATE_BUFFER };
+	union acpi_object *obj;
+	acpi_status status;
+	int ret = -EINVAL;	/* until ADDR proves to be a { low32, high32 } integer pair */
+
+	status = acpi_evaluate_object(ACPI_COMPANION(dev)->handle, "ADDR", NULL, &parsed);
+	if (ACPI_FAILURE(status)) {
+		ACPI_EXCEPTION((AE_INFO, status, "Evaluating ADDR"));
+		return -ENODEV;
+	}
+	obj = parsed.pointer;
+	if (obj && obj->type == ACPI_TYPE_PACKAGE && obj->package.count == 2 &&
+	    obj->package.elements[0].type == ACPI_TYPE_INTEGER &&
+	    obj->package.elements[1].type == ACPI_TYPE_INTEGER) {
+		st->phys_addr = obj->package.elements[0].integer.value |
+			(obj->package.elements[1].integer.value << 32);
+		ret = 0;
+	}
+	kfree(obj);	/* ACPI_ALLOCATE_BUFFER: the result buffer is ours to free */
+	return ret;
+}
+
+/* devres action: hand the instance index back to vmclock_ida */
+static void vmclock_put_idx(void *data)
+{
+	/* data is the vmclock_state that was registered with the action */
+	ida_free(&vmclock_ida, ((struct vmclock_state *)data)->index);
+}
+
+/*
+ * Map and validate the shared memory, then register whatever it supports.
+ */
+static int vmclock_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct vmclock_state *st;
+	int ret;
+
+	st = devm_kzalloc(dev, sizeof(*st), GFP_KERNEL);
+	if (!st)
+		return -ENOMEM;
+
+	/* vmclock_remove() finds st via drvdata, also on the error path below */
+	dev_set_drvdata(dev, st);
+
+	if (has_acpi_companion(dev))
+		ret = vmclock_probe_acpi(dev, st);
+	else
+		ret = -EINVAL; /* Only ACPI for now */
+
+	if (ret) {
+		dev_info(dev, "Failed to obtain physical address: %d\n", ret);
+		return ret;
+	}
+
+	st->clk = devm_memremap(dev, st->phys_addr, sizeof(*st->clk),
+				MEMREMAP_WB);
+	if (IS_ERR(st->clk)) {
+		ret = PTR_ERR(st->clk);
+		dev_info(dev, "failed to map shared memory\n");
+		st->clk = NULL;
+		return ret;
+	}
+
+	if (st->clk->magic != VMCLOCK_MAGIC ||
+	    st->clk->size < sizeof(*st->clk) ||
+	    st->clk->version != 1) {
+		dev_info(dev, "vmclock magic fields invalid\n");
+		return -EINVAL;
+	}
+
+	if (IS_ENABLED(CONFIG_ARM64) &&
+	    st->clk->counter_id == VMCLOCK_COUNTER_ARM_VCNT) {
+		/* Can we check it's the virtual counter? */
+		st->cs_id = CSID_ARM_ARCH_COUNTER;
+	} else if (IS_ENABLED(CONFIG_X86) &&
+		   st->clk->counter_id == VMCLOCK_COUNTER_X86_TSC) {
+		st->cs_id = CSID_X86_TSC;
+	}
+	st->sys_cs_id = st->cs_id;
+
+	ret = ida_alloc(&vmclock_ida, GFP_KERNEL);
+	if (ret < 0)
+		return ret;
+
+	st->index = ret;
+	ret = devm_add_action_or_reset(dev, vmclock_put_idx, st);
+	if (ret)
+		return ret;
+
+	st->name = devm_kasprintf(dev, GFP_KERNEL, "vmclock%d", st->index);
+	if (!st->name)
+		return -ENOMEM;
+
+	/* If the structure is big enough, it can be mapped to userspace */
+	if (st->clk->size >= PAGE_SIZE) {
+		st->miscdev.minor = MISC_DYNAMIC_MINOR;
+		st->miscdev.fops = &vmclock_miscdev_fops;
+		st->miscdev.name = st->name;
+
+		ret = misc_register(&st->miscdev);
+		if (ret)
+			return ret;
+	}
+
+	/* If there is valid clock information, register a PTP clock */
+	if (st->cs_id) {
+		st->ptp_clock_info = ptp_vmclock_info;
+		/* strscpy() always NUL-terminates, unlike strncpy() */
+		strscpy(st->ptp_clock_info.name, st->name, sizeof(st->ptp_clock_info.name));
+		st->ptp_clock = ptp_clock_register(&st->ptp_clock_info, dev);
+		if (IS_ERR(st->ptp_clock)) {
+			ret = PTR_ERR(st->ptp_clock);
+			st->ptp_clock = NULL;
+			vmclock_remove(pdev);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+static const struct acpi_device_id vmclock_acpi_ids[] = {
+	{ "VMCLOCK", 0 },	/* ACPI ID this driver matches */
+	{}			/* terminating empty entry */
+};
+MODULE_DEVICE_TABLE(acpi, vmclock_acpi_ids);
+
+static struct platform_driver vmclock_platform_driver = {
+	.probe		= vmclock_probe,
+	.remove_new	= vmclock_remove,	/* also called by probe on PTP failure */
+	.driver	= {
+		.name	= "vmclock",
+		.acpi_match_table = vmclock_acpi_ids,
+	},
+};
+
+module_platform_driver(vmclock_platform_driver);
+
+MODULE_AUTHOR("David Woodhouse <dwmw2@xxxxxxxxxxxxx>");
+MODULE_DESCRIPTION("PTP clock using VMCLOCK");
+MODULE_LICENSE("GPL");	/* "GPL v2" is only a legacy alias of "GPL" */
diff --git a/include/uapi/linux/vmclock.h b/include/uapi/linux/vmclock.h
new file mode 100644
index 000000000000..cf0f22205e79
--- /dev/null
+++ b/include/uapi/linux/vmclock.h
@@ -0,0 +1,138 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */
+
+/*
+ * This structure provides a vDSO-style clock to VM guests, exposing the
+ * relationship (or lack thereof) between the CPU clock (TSC, timebase, arch
+ * counter, etc.) and real time. It is designed to address the problem of
+ * live migration, which other clock enlightenments do not.
+ *
+ * When a guest is live migrated, this affects the clock in two ways.
+ *
+ * First, even between identical hosts the actual frequency of the underlying
+ * counter will change within the tolerances of its specification (typically
+ * ±50PPM, or 4 seconds a day). The frequency also varies over time on the
+ * same host, but can be tracked by NTP as it generally varies slowly. With
+ * live migration there is a step change in the frequency, with no warning.
+ *
+ * Second, there may be a step change in the value of the counter itself, as
+ * its accuracy is limited by the precision of the NTP synchronization on the
+ * source and destination hosts.
+ *
+ * So any calibration (NTP, PTP, etc.) which the guest has done on the source
+ * host before migration is invalid, and needs to be redone on the new host.
+ *
+ * In its most basic mode, this structure provides only an indication to the
+ * guest that live migration has occurred. This allows the guest to know that
+ * its clock is invalid and take remedial action. For applications that need
+ * reliable accurate timestamps (e.g. distributed databases), the structure
+ * can be mapped all the way to userspace. This allows the application to see
+ * directly for itself that the clock is disrupted and take appropriate
+ * action, even when using a vDSO-style method to get the time instead of a
+ * system call.
+ *
+ * In its more advanced mode, this structure can also be used to expose the
+ * precise relationship of the CPU counter to real time, as calibrated by the
+ * host. This means that userspace applications can have accurate time
+ * immediately after live migration, rather than having to pause operations
+ * and wait for NTP to recover. This mode does, of course, rely on the
+ * counter being reliable and consistent across CPUs.
+ *
+ * Note that this must be true UTC, never with smeared leap seconds. If a
+ * guest wishes to construct a smeared clock, it can do so. Presenting a
+ * smeared clock through this interface would be problematic because it
+ * actually messes with the apparent counter *period*. A linear smearing
+ * of 1 ms per second would effectively tweak the counter period by 1000PPM
+ * at the start/end of the smearing period, while a sinusoidal smear would
+ * basically be impossible to represent.
+ */
+
+#ifndef __VMCLOCK_H__
+#define __VMCLOCK_H__
+
+#ifdef __KERNEL__
+#include <linux/types.h>
+#else
+#include <stdint.h>
+#endif
+
+struct vmclock_abi {
+	uint32_t magic;
+#define VMCLOCK_MAGIC	0x4b4c4356 /* "VCLK" */
+	uint16_t size;		/* Size of page containing this structure */
+	uint16_t version;	/* 1 */
+
+	/* Sequence lock. Low bit means an update is in progress. */
+	uint64_t seq_count;
+
+	/*
+	 * This field changes to another non-repeating value when the CPU
+	 * counter is disrupted, for example on live migration.
+	 */
+	uint64_t disruption_marker;
+
+	/*
+	 * By providing the TAI offset, the guest can know both UTC and TAI
+	 * reliably. There is no need to choose one *or* the other. Valid if
+	 * VMCLOCK_FLAG_TAI_OFFSET_VALID is set in flags.
+	 */
+	int16_t tai_offset_sec;
+
+	uint16_t flags;
+	/* Indicates that the tai_offset_sec field is valid */
+#define VMCLOCK_FLAG_TAI_OFFSET_VALID		(1 << 0)
+	/*
+	 * Optionally used to notify guests of pending maintenance events.
+	 * A guest may wish to remove itself from service if an event is
+	 * coming up. Two flags indicate the rough imminence of the event.
+	 */
+#define VMCLOCK_FLAG_DISRUPTION_SOON		(1 << 1) /* About a day */
+#define VMCLOCK_FLAG_DISRUPTION_IMMINENT	(1 << 2) /* About an hour */
+	/* Indicates that the utc_time_maxerror_picosec field is valid */
+#define VMCLOCK_FLAG_UTC_MAXERROR_VALID		(1 << 3)
+	/* Indicates counter_period_error_rate_frac_sec is valid */
+#define VMCLOCK_FLAG_UTC_PERIOD_ERROR_VALID	(1 << 4)
+
+	uint8_t clock_status;
+#define VMCLOCK_STATUS_UNKNOWN		0
+#define VMCLOCK_STATUS_INITIALIZING	1
+#define VMCLOCK_STATUS_SYNCHRONIZED	2
+#define VMCLOCK_STATUS_FREERUNNING	3
+#define VMCLOCK_STATUS_UNRELIABLE	4
+
+	uint8_t counter_id;
+#define VMCLOCK_COUNTER_INVALID		0
+#define VMCLOCK_COUNTER_X86_TSC		1
+#define VMCLOCK_COUNTER_ARM_VCNT	2
+
+	/* Bit shift for counter_period_frac_sec and its error rate */
+	uint8_t counter_period_shift;
+
+	/*
+	 * Unlike in NTP, this can indicate a leap second in the past. This
+	 * is needed to allow guests to derive an imprecise clock with
+	 * smeared leap seconds for themselves, as some modes of smearing
+	 * need the adjustments to continue even after the moment at which
+	 * the leap second should have occurred.
+	 */
+	int8_t leapsecond_direction;
+	uint64_t leapsecond_tai_sec; /* Since 1970-01-01 00:00:00z */
+
+	/*
+	 * Paired values of counter and UTC at a given point in time.
+	 */
+	uint64_t counter_value;
+	uint64_t utc_time_sec; /* Since 1970-01-01 00:00:00z */
+	uint64_t utc_time_frac_sec;
+
+	/*
+	 * Counter period, and its error rate. The unit of these fields is
+	 * seconds >> (64 + counter_period_shift)
+	 */
+	uint64_t counter_period_frac_sec;
+	uint64_t counter_period_error_rate_frac_sec;
+
+	/* Error margin of UTC reading above (± picoseconds) */
+	uint64_t utc_time_maxerror_picosec;
+};
+
+#endif /*  __VMCLOCK_H__ */
-- 
2.44.0


<<attachment: smime.p7s>>