We need to retrieve a VM's TSC offset in order to use the host's TSC to merge host and guest traces. This is explained in detail in this thread: [Qemu-devel] [RFC] host and guest kernel trace merging https://lists.nongnu.org/archive/html/qemu-devel/2016-03/msg00887.html Today, the only way to retrieve a VM's TSC offset is by using the kvm_write_tsc_offset tracepoint. This has a few problems. First, the tracepoint is only emitted when the VM boots, so a VM that is already running has to be rebooted to obtain it. Second, tracepoints are not supposed to be a stable ABI, so they are a poor fit when the data needs to be consumed by user-space tools. This commit exports a VM's TSC offset to user-space via debugfs. A new file called "tsc-offset" is created in the VM's debugfs directory. For example: /sys/kernel/debug/kvm/51696-10/tsc-offset This file contains one TSC offset per line, one line for each vCPU. For example: vcpu0: 18446742405270834952 vcpu1: 18446742405270834952 vcpu2: 18446742405270834952 vcpu3: 18446742405270834952 There are some important observations about this solution: - While all vCPUs' TSC offsets should be equal for the cases we care about (i.e. stable TSC and no write to the TSC MSR), I chose to follow the spec and export each vCPU's TSC offset (might also be helpful for debugging) - The TSC offset is only useful after the VM has booted - We'll probably need to export the TSC multiplier too. However, I've been using only the TSC offset for now.
So, let's get this merged first and do the TSC multiplier as a second step Signed-off-by: Luiz Capitulino <lcapitulino@xxxxxxxxxx> --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm.c | 1 + arch/x86/kvm/vmx.c | 8 ++++++++ arch/x86/kvm/x86.c | 30 ++++++++++++++++++++++++++++++ 4 files changed, 40 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 33ae3a4..5714bbd 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -952,6 +952,7 @@ struct kvm_x86_ops { bool (*has_wbinvd_exit)(void); u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu); + u64 (*read_cached_tsc_offset)(struct kvm_vcpu *vcpu); void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index af523d8..c851477 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5065,6 +5065,7 @@ static struct kvm_x86_ops svm_x86_ops = { .has_wbinvd_exit = svm_has_wbinvd_exit, .read_tsc_offset = svm_read_tsc_offset, + .read_cached_tsc_offset = svm_read_tsc_offset, .write_tsc_offset = svm_write_tsc_offset, .adjust_tsc_offset_guest = svm_adjust_tsc_offset_guest, .read_l1_tsc = svm_read_l1_tsc, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5cede40..82dfe42 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -616,6 +616,7 @@ struct vcpu_vmx { u64 hv_deadline_tsc; u64 current_tsc_ratio; + u64 cached_tsc_offset; bool guest_pkru_valid; u32 guest_pkru; @@ -2608,6 +2609,11 @@ static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu) return vmcs_read64(TSC_OFFSET); } +static u64 vmx_read_cached_tsc_offset(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->cached_tsc_offset; +} + /* * writes 'offset' into guest's timestamp counter offset register */ @@ -2632,6 +2638,7 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) vmcs_read64(TSC_OFFSET), offset); vmcs_write64(TSC_OFFSET, offset); } + 
to_vmx(vcpu)->cached_tsc_offset = offset; } static void vmx_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) @@ -11275,6 +11282,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, .read_tsc_offset = vmx_read_tsc_offset, + .read_cached_tsc_offset = vmx_read_cached_tsc_offset, .write_tsc_offset = vmx_write_tsc_offset, .adjust_tsc_offset_guest = vmx_adjust_tsc_offset_guest, .read_l1_tsc = vmx_read_l1_tsc, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 18dfbac..75a8e23 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -54,6 +54,7 @@ #include <linux/pvclock_gtod.h> #include <linux/kvm_irqfd.h> #include <linux/irqbypass.h> +#include <linux/debugfs.h> #include <trace/events/kvm.h> #include <asm/debugreg.h> @@ -7779,8 +7780,37 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) return 0; } +static int tsc_offset_show(struct seq_file *m, void *data) +{ + struct kvm *kvm = m->private; + struct kvm_vcpu *vcpu; + int i; + + kvm_for_each_vcpu(i, vcpu, kvm) + seq_printf(m, "vcpu%d: %llu\n", + vcpu->vcpu_id, kvm_x86_ops->read_cached_tsc_offset(vcpu)); + + return 0; +} + +static int tsc_offset_open(struct inode *inode, struct file *file) +{ + return single_open(file, tsc_offset_show, inode->i_private); +} + +static const struct file_operations tsc_offset_fops = { + .owner = THIS_MODULE, + .open = tsc_offset_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + int kvm_arch_create_vm_debugfs(struct kvm *kvm) { + if (!debugfs_create_file("tsc-offset", 0444, + kvm->debugfs_dentry, kvm, &tsc_offset_fops)) + return -ENOMEM; return 0; } -- 2.5.5 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html