Implement the save/restore of PMU state for the passthrough PMU on Intel. In
passthrough mode, KVM exclusively owns the PMU hardware while control flow is
within the passthrough PMU scope. Thus, KVM needs to save the host PMU state
and take full ownership of the PMU hardware. Conversely, the host regains
ownership of the PMU hardware from KVM when control flow leaves the
passthrough PMU scope.

Implement the PMU context switch for Intel CPUs and opportunistically use
rdpmcl() instead of rdmsrl() when reading counters, since the former has
lower latency on Intel CPUs.

Co-developed-by: Dapeng Mi <dapeng1.mi@xxxxxxxxxxxxxxx>
Signed-off-by: Dapeng Mi <dapeng1.mi@xxxxxxxxxxxxxxx>
Signed-off-by: Mingwei Zhang <mizhang@xxxxxxxxxx>
Tested-by: Yongwei Ma <yongwei.ma@xxxxxxxxx>
---
 arch/x86/kvm/pmu.c           | 46 ++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/vmx/pmu_intel.c | 41 +++++++++++++++++++++++++++++++-
 2 files changed, 86 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 782b564bdf96..9bb733384069 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -1068,14 +1068,60 @@ void kvm_pmu_passthrough_pmu_msrs(struct kvm_vcpu *vcpu)
 
 void kvm_pmu_save_pmu_context(struct kvm_vcpu *vcpu)
 {
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+	struct kvm_pmc *pmc;
+	u32 i;
+
 	lockdep_assert_irqs_disabled();
 
 	static_call_cond(kvm_x86_pmu_save_pmu_context)(vcpu);
+
+	/*
+	 * Clear the hardware selector MSRs and counters to avoid leaking
+	 * guest state and to prevent these guest GP counters from being
+	 * accidentally enabled when the host enables global ctrl.
+	 */
+	for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
+		pmc = &pmu->gp_counters[i];
+		rdpmcl(i, pmc->counter);
+		rdmsrl(pmc->msr_eventsel, pmc->eventsel);
+		if (pmc->counter)
+			wrmsrl(pmc->msr_counter, 0);
+		if (pmc->eventsel)
+			wrmsrl(pmc->msr_eventsel, 0);
+	}
+
+	for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
+		pmc = &pmu->fixed_counters[i];
+		rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i, pmc->counter);
+		if (pmc->counter)
+			wrmsrl(pmc->msr_counter, 0);
+	}
 }
 
 void kvm_pmu_restore_pmu_context(struct kvm_vcpu *vcpu)
 {
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+	struct kvm_pmc *pmc;
+	int i;
+
 	lockdep_assert_irqs_disabled();
 
 	static_call_cond(kvm_x86_pmu_restore_pmu_context)(vcpu);
+
+	/*
+	 * No need to zero out unexposed GP/fixed counters/selectors, since
+	 * RDPMC is intercepted in that case and accesses to these counters
+	 * and selectors #GP in the guest.
+	 */
+	for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
+		pmc = &pmu->gp_counters[i];
+		wrmsrl(pmc->msr_counter, pmc->counter);
+		wrmsrl(pmc->msr_eventsel, pmu->gp_counters[i].eventsel);
+	}
+
+	for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
+		pmc = &pmu->fixed_counters[i];
+		wrmsrl(pmc->msr_counter, pmc->counter);
+	}
 }
 
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 0de918dc14ea..89c8f73a48c8 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -572,7 +572,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
 	}
 
 	for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
-		pmu->fixed_counters[i].msr_eventsel = MSR_CORE_PERF_FIXED_CTR_CTRL;
+		pmu->fixed_counters[i].msr_eventsel = 0;
 		pmu->fixed_counters[i].msr_counter = MSR_CORE_PERF_FIXED_CTR0 + i;
 	}
 }
@@ -799,6 +799,43 @@ static void intel_passthrough_pmu_msrs(struct kvm_vcpu *vcpu)
 	vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL, MSR_TYPE_RW, msr_intercept);
 }
 
+static void intel_save_guest_pmu_context(struct kvm_vcpu *vcpu)
+{
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+	/* Global ctrl register is already saved at VM-exit. */
+	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, pmu->global_status);
+	/* Clear hardware MSR_CORE_PERF_GLOBAL_STATUS MSR, if non-zero. */
+	if (pmu->global_status)
+		wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, pmu->global_status);
+
+	rdmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl);
+	/*
+	 * Clear the hardware FIXED_CTR_CTRL MSR to avoid information leakage
+	 * and to prevent these guest fixed counters from being accidentally
+	 * enabled when the host enables global ctrl.
+	 */
+	if (pmu->fixed_ctr_ctrl)
+		wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
+}
+
+static void intel_restore_guest_pmu_context(struct kvm_vcpu *vcpu)
+{
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+	u64 global_status, toggle;
+
+	/* Clear the host's global_ctrl MSR unconditionally. */
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, global_status);
+	toggle = pmu->global_status ^ global_status;
+	if (global_status & toggle)
+		wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status & toggle);
+	if (pmu->global_status & toggle)
+		wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status & toggle);
+
+	wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl);
+}
+
 struct kvm_pmu_ops intel_pmu_ops __initdata = {
 	.rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc,
 	.msr_idx_to_pmc = intel_msr_idx_to_pmc,
@@ -812,6 +849,8 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = {
 	.cleanup = intel_pmu_cleanup,
 	.is_rdpmc_passthru_allowed = intel_is_rdpmc_passthru_allowed,
 	.passthrough_pmu_msrs = intel_passthrough_pmu_msrs,
+	.save_pmu_context = intel_save_guest_pmu_context,
+	.restore_pmu_context = intel_restore_guest_pmu_context,
 	.EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT,
 	.MAX_NR_GP_COUNTERS = KVM_INTEL_PMC_MAX_GENERIC,
 	.MIN_NR_GP_COUNTERS = 1,
-- 
2.46.0.rc1.232.g9752f9e123-goog
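For reviewers, a minimal usage sketch of how the two context-switch helpers
added here are expected to bracket the passthrough PMU scope. This is
illustrative only and not part of the patch: the caller name
run_vcpu_with_passthrough_pmu() is hypothetical, and the actual call sites
live in the vcpu run path elsewhere in the series; only
kvm_pmu_save_pmu_context(), kvm_pmu_restore_pmu_context(), and the IRQ
helpers are real.

	/*
	 * Hypothetical caller sketch (not part of this patch). Both
	 * helpers assert via lockdep_assert_irqs_disabled() that they
	 * run with interrupts off, so the pair must sit inside an
	 * IRQ-disabled region.
	 */
	static void run_vcpu_with_passthrough_pmu(struct kvm_vcpu *vcpu)
	{
		unsigned long flags;

		local_irq_save(flags);

		/* Load guest counters/selectors into hardware before VM-entry. */
		kvm_pmu_restore_pmu_context(vcpu);

		/* ... VM-entry, guest runs, VM-exit ... */

		/*
		 * Read guest state back (via rdpmcl() for the counters) and
		 * zero the hardware so nothing leaks to the host.
		 */
		kvm_pmu_save_pmu_context(vcpu);

		local_irq_restore(flags);
	}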