From: Chao Peng <chao.p.peng@xxxxxxxxxxxxxxx> PT virtualization can work in one of 4 possible modes: a. system-wide: trace both host/guest and output to host buffer; b. host-only: only trace host and output to host buffer; c. guest-only: only trace guest and output to guest buffer; d. host-guest: trace host/guest simultaneously and output to their respective buffers. Signed-off-by: Chao Peng <chao.p.peng@xxxxxxxxxxxxxxx> Signed-off-by: Luwei Kang <luwei.kang@xxxxxxxxx> --- arch/x86/include/asm/intel_pt.h | 7 ++++ arch/x86/include/asm/msr-index.h | 1 + arch/x86/include/asm/vmx.h | 6 +++ arch/x86/kvm/vmx.c | 88 ++++++++++++++++++++++++++++++++++++++-- 4 files changed, 98 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index 73c8942..b5dd33c 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -1,6 +1,13 @@ #ifndef _ASM_X86_INTEL_PT_H #define _ASM_X86_INTEL_PT_H +enum pt_mode { + PT_MODE_SYSTEM = 0, + PT_MODE_HOST, + PT_MODE_GUEST, + PT_MODE_HOST_GUEST, +}; + enum pt_capabilities { PT_CAP_max_subleaf = 0, PT_CAP_cr3_filtering, diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 80b26e1..57433e4 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -751,6 +751,7 @@ #define VMX_BASIC_INOUT 0x0040000000000000LLU /* MSR_IA32_VMX_MISC bits */ +#define MSR_IA32_VMX_MISC_INTEL_PT (1ULL << 14) #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29) #define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F /* AMD-V MSRs */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index caec841..80e3e22 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -76,7 +76,9 @@ #define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 #define SECONDARY_EXEC_RDSEED 0x00010000 #define SECONDARY_EXEC_ENABLE_PML 0x00020000 +#define SECONDARY_EXEC_PT_CONCEAL_VMX 0x00080000 #define SECONDARY_EXEC_XSAVES 
0x00100000 +#define SECONDARY_EXEC_PT_USE_GPA 0x01000000 #define SECONDARY_EXEC_TSC_SCALING 0x02000000 #define PIN_BASED_EXT_INTR_MASK 0x00000001 @@ -97,6 +99,8 @@ #define VM_EXIT_LOAD_IA32_EFER 0x00200000 #define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000 #define VM_EXIT_CLEAR_BNDCFGS 0x00800000 +#define VM_EXIT_PT_SUPPRESS_PIP 0x01000000 +#define VM_EXIT_CLEAR_IA32_RTIT_CTL 0x02000000 #define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff @@ -108,6 +112,8 @@ #define VM_ENTRY_LOAD_IA32_PAT 0x00004000 #define VM_ENTRY_LOAD_IA32_EFER 0x00008000 #define VM_ENTRY_LOAD_BNDCFGS 0x00010000 +#define VM_ENTRY_PT_SUPPRESS_PIP 0x00020000 +#define VM_ENTRY_LOAD_IA32_RTIT_CTL 0x00040000 #define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 95a0160..f0cae7c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -50,6 +50,7 @@ #include <asm/apic.h> #include <asm/irq_remapping.h> #include <asm/mmu_context.h> +#include <asm/intel_pt.h> #include "trace.h" #include "pmu.h" @@ -178,6 +179,10 @@ static int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; module_param(ple_window_max, int, S_IRUGO); +/* Default is host guest mode. 
*/ +static int __read_mostly pt_mode = PT_MODE_HOST_GUEST; +module_param(pt_mode, int, S_IRUGO); + extern const ulong vmx_return; #define NR_AUTOLOAD_MSRS 8 @@ -1321,6 +1326,19 @@ static inline bool cpu_has_vmx_vmfunc(void) SECONDARY_EXEC_ENABLE_VMFUNC; } +static inline bool cpu_has_vmx_intel_pt(void) +{ + u64 vmx_msr; + + rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); + return vmx_msr & MSR_IA32_VMX_MISC_INTEL_PT; +} + +static inline bool cpu_has_vmx_pt_use_gpa(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PT_USE_GPA; +} + static inline bool report_flexpriority(void) { return flexpriority_enabled; @@ -3661,6 +3679,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_RDRAND | SECONDARY_EXEC_ENABLE_PML | SECONDARY_EXEC_TSC_SCALING | + SECONDARY_EXEC_PT_USE_GPA | + SECONDARY_EXEC_PT_CONCEAL_VMX | SECONDARY_EXEC_ENABLE_VMFUNC; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, @@ -3694,7 +3714,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; #endif opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT | - VM_EXIT_CLEAR_BNDCFGS; + VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_PT_SUPPRESS_PIP | + VM_EXIT_CLEAR_IA32_RTIT_CTL; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, &_vmexit_control) < 0) return -EIO; @@ -3713,11 +3734,25 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; min = VM_ENTRY_LOAD_DEBUG_CONTROLS; - opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS; + opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS | + VM_ENTRY_PT_SUPPRESS_PIP | VM_ENTRY_LOAD_IA32_RTIT_CTL; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, &_vmentry_control) < 0) return -EIO; + /* + * If one of them is set to 1, all must be set to 1. This ensures that + * Intel PT output will not switch between using GPAs and PPAs for + * output without first being disabled. 
+ */ + if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_PT_USE_GPA) || + !(_vmexit_control & VM_EXIT_CLEAR_IA32_RTIT_CTL) || + !(_vmentry_control & VM_ENTRY_LOAD_IA32_RTIT_CTL)) { + _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_PT_USE_GPA; + _vmexit_control &= ~VM_EXIT_CLEAR_IA32_RTIT_CTL; + _vmentry_control &= ~VM_ENTRY_LOAD_IA32_RTIT_CTL; + } + rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ @@ -5279,6 +5314,38 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) return exec_control; } +static u32 vmx_vmexit_control(struct vcpu_vmx *vmx) +{ + u32 vmexit_control = vmcs_config.vmexit_ctrl; + + /* + * Enable VMX-specific packet information and disable VMCS + * controls for IA32_RTIT_CTL MSR in system mode. + */ + if (pt_mode == PT_MODE_SYSTEM) + vmexit_control &= ~( + VM_EXIT_PT_SUPPRESS_PIP | + VM_EXIT_CLEAR_IA32_RTIT_CTL); + + return vmexit_control; +} + +static u32 vmx_vmentry_control(struct vcpu_vmx *vmx) +{ + u32 vmentry_control = vmcs_config.vmentry_ctrl; + + /* + * Enable VMX-specific packet information and disable VMCS + * controls for IA32_RTIT_CTL MSR in system mode. + */ + if (pt_mode == PT_MODE_SYSTEM) + vmentry_control &= ~( + VM_ENTRY_PT_SUPPRESS_PIP | + VM_ENTRY_LOAD_IA32_RTIT_CTL); + + return vmentry_control; +} + static bool vmx_rdrand_supported(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & @@ -5409,6 +5476,15 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) } } + /* + * Enable VMX-specific packet information and disable VMCS + * controls for IA32_RTIT_CTL MSR in system mode. 
+ */ + if (pt_mode == PT_MODE_SYSTEM) + exec_control &= ~( + SECONDARY_EXEC_PT_CONCEAL_VMX | + SECONDARY_EXEC_PT_USE_GPA); + vmx->secondary_exec_control = exec_control; } @@ -5521,10 +5597,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) } - vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl); + vm_exit_controls_init(vmx, vmx_vmexit_control(vmx)); /* 22.2.1, 20.8.1 */ - vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl); + vm_entry_controls_init(vmx, vmx_vmentry_control(vmx)); vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS; vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS); @@ -6865,6 +6941,10 @@ static __init int hardware_setup(void) kvm_mce_cap_supported |= MCG_LMCE_P; + if (!enable_ept || !pt_cap_get(PT_CAP_topa_output) || + !cpu_has_vmx_intel_pt() || !cpu_has_vmx_pt_use_gpa()) + pt_mode = PT_MODE_SYSTEM; + return alloc_kvm_area(); out: -- 1.8.3.1