Re: [PATCH 1/2] KVM: x86: Update vPMCs when retiring instructions

Like Xu <like.xu.linux@xxxxxxxxx> · Tue, 16 Nov 2021 20:43:52 +0800

Hi Jim,

On 13/11/2021 7:52 am, Jim Mattson wrote:
When KVM retires a guest instruction through emulation, increment any
vPMCs that are configured to monitor "instructions retired," and
update the sample period of those counters so that they will overflow
at the right time.

Signed-off-by: Eric Hankland <ehankland@xxxxxxxxxx>
[jmattson:
   - Split the code to increment "branch instructions retired" into a
     separate commit.
   - Added 'static' to kvm_pmu_incr_counter() definition.
   - Modified kvm_pmu_incr_counter() to check pmc->perf_event->state ==
     PERF_EVENT_STATE_ACTIVE.
]
Signed-off-by: Jim Mattson <jmattson@xxxxxxxxxx>
Fixes: f5132b01386b ("KVM: Expose a version 2 architectural PMU to a guests")
---
  arch/x86/kvm/pmu.c | 31 +++++++++++++++++++++++++++++++
  arch/x86/kvm/pmu.h |  1 +
  arch/x86/kvm/x86.c |  3 +++
  3 files changed, 35 insertions(+)

diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 09873f6488f7..153c488032a5 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -490,6 +490,37 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
  	kvm_pmu_reset(vcpu);
  }
  
+static void kvm_pmu_incr_counter(struct kvm_pmc *pmc, u64 evt)
+{
+	u64 counter_value, sample_period;
+
+	if (pmc->perf_event &&

We need to incr pmc->counter whether it has a perf_event or not.

+	    pmc->perf_event->attr.type == PERF_TYPE_HARDWARE &&

We need to cover PERF_TYPE_RAW as well, for example,
it has the basic bits for "{ 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },"
plus HSW_IN_TX or ARCH_PERFMON_EVENTSEL_EDGE stuff.

We just need to focus on checking the select and umask bits:

static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
	unsigned int perf_hw_id)
{
	u64 old_eventsel = pmc->eventsel;
	unsigned int config;

	pmc->eventsel &=
		(ARCH_PERFMON_EVENTSEL_EVENT | ARCH_PERFMON_EVENTSEL_UMASK);
	config = kvm_x86_ops.pmu_ops->find_perf_hw_id(pmc);
	pmc->eventsel = old_eventsel;
	return config == perf_hw_id;
}

+	    pmc->perf_event->state == PERF_EVENT_STATE_ACTIVE &&

Again, we should not care the pmc->perf_event.

+	    pmc->perf_event->attr.config == evt) {

So how about the emulated instructions for
ARCH_PERFMON_EVENTSEL_USR and ARCH_PERFMON_EVENTSEL_USR ?

+		pmc->counter++;
+		counter_value = pmc_read_counter(pmc);
+		sample_period = get_sample_period(pmc, counter_value);
+		if (!counter_value)
+			perf_event_overflow(pmc->perf_event, NULL, NULL);

We need to call kvm_perf_overflow() or kvm_perf_overflow_intr().
And the patch set doesn't export the perf_event_overflow() SYMBOL.

+		if (local64_read(&pmc->perf_event->hw.period_left) >
+		    sample_period)
+			perf_event_period(pmc->perf_event, sample_period);
+	}
+}

Not cc PeterZ or perf reviewers for this part of code is not a good thing.

How about this:

static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);

	pmc->counter++;
	reprogram_counter(pmu, pmc->idx);
	if (!pmc_read_counter(pmc))
		// https://lore.kernel.org/kvm/20211116122030.4698-1-likexu@xxxxxxxxxxx/T/#t
		kvm_pmu_counter_overflow(pmc, need_overflow_intr(pmc));
}

+
+void kvm_pmu_record_event(struct kvm_vcpu *vcpu, u64 evt)

s/kvm_pmu_record_event/kvm_pmu_trigger_event/

+{
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+	int i;
+
+	for (i = 0; i < pmu->nr_arch_gp_counters; i++)
+		kvm_pmu_incr_counter(&pmu->gp_counters[i], evt);

Why do we need to accumulate a counter that is not enabled at all ?

+	for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
+		kvm_pmu_incr_counter(&pmu->fixed_counters[i], evt);

How about this:

	for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) {
		pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, i);

		if (!pmc || !pmc_is_enabled(pmc) || !pmc_speculative_in_use(pmc))
			continue;

		// https://lore.kernel.org/kvm/20211116122030.4698-1-likexu@xxxxxxxxxxx/T/#t
		if (eventsel_match_perf_hw_id(pmc, perf_hw_id))
			kvm_pmu_incr_counter(pmc);
	}

+}
+EXPORT_SYMBOL_GPL(kvm_pmu_record_event);
+
  int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
  {
  	struct kvm_pmu_event_filter tmp, *filter;
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 59d6b76203d5..d1dd2294f8fb 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -159,6 +159,7 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu);
  void kvm_pmu_cleanup(struct kvm_vcpu *vcpu);
  void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
  int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp);
+void kvm_pmu_record_event(struct kvm_vcpu *vcpu, u64 evt);
  
  bool is_vmware_backdoor_pmc(u32 pmc_idx);
  
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d7def720227d..bd49e2a204d5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7854,6 +7854,8 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
  	if (unlikely(!r))
  		return 0;
  
+	kvm_pmu_record_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
+
  	/*
  	 * rflags is the old, "raw" value of the flags.  The new value has
  	 * not been saved yet.
@@ -8101,6 +8103,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
  		vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
  		if (!ctxt->have_exception ||
  		    exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
+			kvm_pmu_record_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
  			kvm_rip_write(vcpu, ctxt->eip);
  			if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
  				r = kvm_vcpu_do_singlestep(vcpu);