Re: [PATCH 2/9] KVM: Expose a version 2 architectural PMU to guests

Gleb Natapov <gleb@xxxxxxxxxx> · Tue, 1 Nov 2011 14:30:41 +0200



On Tue, Nov 01, 2011 at 12:47:18PM +0200, Avi Kivity wrote:
> On 10/30/2011 06:53 PM, Gleb Natapov wrote:
> > From: Avi Kivity <avi@xxxxxxxxxx>
> 
> This has changed significantly, so please update the authorship.  You
> can say 'based on original patch by ...' to provide due credit.
> 
> > Use perf_events to emulate an architectural PMU, version 2.
> 
> > +
> > +/* mapping between fixed pmc index and arch_events array */
> > +int fixed_pmc_events[] = {1, 0, 2};
> > +
> > +static bool pmc_is_gp(struct kvm_pmc *pmc)
> > +{
> > +	return pmc->type == KVM_PMC_GP;
> > +}
> > +
> > +static inline u64 pmc_bitmask(struct kvm_pmc *pmc)
> > +{
> > +	struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
> > +
> > +	return pmc_is_gp(pmc) ? pmu->gp_counter_bitmask :
> > +		pmu->fixed_counter_bitmask;
> > +}
> 
> Nicer to just push the bitmask (or bitwidth) into the counter itself.
> 
Hmm, is it really nicer to replicate the same information 35 times?

> > +
> > +static inline int pmc_to_global_idx(struct kvm_pmc *pmc)
> > +{
> > +	struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
> > +	struct kvm_pmc *counters;
> > +	int shift;
> > +
> > +	if (pmc_is_gp(pmc)) {
> > +		counters = pmu->gp_counters;
> > +		shift = X86_PMC_IDX_GENERIC;
> > +	} else {
> > +		counters = pmu->fixed_counters;
> > +		shift = X86_PMC_IDX_FIXED;
> > +	}
> > +
> > +	return pmc - counters + shift;
> > +}
> 
> Again, push the global index into struct kvm_pmc.
OK.

> 
> > +
> > +static void kvm_perf_overflow(struct perf_event *perf_event,
> > +			      struct perf_sample_data *data,
> > +			      struct pt_regs *regs)
> > +{
> > +	struct kvm_pmc *pmc = perf_event->overflow_handler_context;
> > +	struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
> > +	__set_bit(pmc_to_global_idx(pmc),
> > +			(unsigned long *)&pmu->global_status);
> > +}
> > +
> > +static void kvm_perf_overflow_intr(struct perf_event *perf_event,
> > +		struct perf_sample_data *data, struct pt_regs *regs)
> > +{
> > +	struct kvm_pmc *pmc = perf_event->overflow_handler_context;
> > +	struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
> > +	if (!__test_and_set_bit(pmc_to_global_idx(pmc),
> > +				(unsigned long *)&pmu->reprogram_pmi)) {
> > +		kvm_perf_overflow(perf_event, data, regs);
> > +		kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
> > +	}
> > +}
> 
> Is it safe to use the __ versions here?
>
It is supposed to run in an NMI context on the same CPU that just ran
the vcpu, so simultaneous access to the same variable from different
CPUs shouldn't be possible. But if your scenario below can happen, then
that assumption may not hold. The question is whether PMI delivery can be
so skewed as to be delivered long after vmexit (which switches the perf MSR
values, btw).

> Do we need to follow kvm_make_request() with kvm_vcpu_kick()?  If there
> is a skew between the overflow and the host PMI, the guest might have
> executed a HLT.
Is kvm_vcpu_kick() safe for NMI context?

> 
> > +
> > +static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx)
> > +{
> > +	unsigned en = en_pmi & 0x3;
> > +	bool pmi = en_pmi & 0x8;
> > +
> > +	stop_counter(pmc);
> > +
> > +	if (!en || !pmc_enabled(pmc))
> > +		return;
> > +
> > +	reprogram_counter(pmc, PERF_TYPE_HARDWARE,
> > +			arch_events[fixed_pmc_events[idx]].event_type,
> > +			!(en & 0x2), /* exclude user */
> > +			!(en & 0x1), /* exclude kernel */
> > +			pmi);
> 
> Are there no #defines for those constants?
> 
Nope. perf_event_intel.c open codes them too.

> > +}
> > +
> > +#define FIXED_EN_PMI(R, I) (((R) >> ((I) * 4)) & 0xf)
> 
> function
> 
> > +	default:
> > +		if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) ||
> > +				(pmc = get_fixed_pmc(pmu, index))) {
> > +			data = (s64)(s32)data;
> > +			pmc->counter += data - read_pmc(pmc);
> > +			return 0;
> > +		} else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
> > +			if (data == pmc->eventsel)
> > +				return 0;
> > +			if (!(data & 0xffffffff00200000ull)) {
> > +				reprogram_gp_counter(pmc, data);
> > +				return 0;
> > +			}
> > +		}
> > +	}
> > +	return 1;
> > +}
> > +
> > +
> > +void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
> > +{
> > +	struct kvm_pmu *pmu = &vcpu->arch.pmu;
> > +	struct kvm_cpuid_entry2 *entry;
> > +	unsigned bitmap_len;
> > +
> > +	pmu->nr_arch_gp_counters = 0;
> > +	pmu->nr_arch_fixed_counters = 0;
> > +	pmu->fixed_counter_bitmask = 0;
> > +	pmu->version = 0;
> > +
> > +	entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
> > +	if (!entry)
> > +		return;
> > +
> > +	pmu->version = entry->eax & 0xff;
> > +	if (!pmu->version)
> > +		return;
> > +
> > +	pmu->nr_arch_gp_counters = min((int)(entry->eax >> 8) & 0xff,
> > +			X86_PMC_MAX_GENERIC);
> > +	pmu->gp_counter_bitmask = ((u64)1 << ((entry->eax >> 16) & 0xff)) - 1;
> > +	bitmap_len = (entry->eax >> 24) & 0xff;
> > +	pmu->available_event_types = ~entry->ebx & ((1ull << bitmap_len) - 1);
> > +
> > +	if (pmu->version > 1) {
> > +		pmu->nr_arch_fixed_counters = min((int)(entry->edx) & 0x1f,
> > +				X86_PMC_MAX_FIXED);
> 
> Misplaced parentheses (though no effect on generated code).
> 
> > +		pmu->fixed_counter_bitmask =
> > +			((u64)1 << ((entry->edx >> 5) & 0xff)) - 1;
> 
> The user can cause this to be very small (even zero).  Can this cause an
> NMI storm?
> 
If the user sets it to zero, then attr.sample_period will always be 0, and
perf will think that the event is non-sampling and will use max_period
instead. For a small value greater than zero, how is it different from
userspace creating an event with a sample_period of 1?

> > +		pmu->global_ctrl_mask = ~(((1 << pmu->nr_arch_gp_counters) - 1)
> > +				| (((1ull << pmu->nr_arch_fixed_counters) - 1)
> > +					<< X86_PMC_IDX_FIXED));
> > +	} else
> > +		pmu->global_ctrl = (1 << pmu->nr_arch_gp_counters) - 1;
> > +}
> 
> Nicer to just return early if version < 2; less indentation and easier
> to prepare for version 3.
> 
> -- 
> I have a truly marvellous patch that fixes the bug which this
> signature is too narrow to contain.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html