On Tue, Feb 14, 2023, Like Xu wrote:
> +	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
> +		if (!msr_info->host_initiated)
> +			return 0; /* Writes are ignored */

Where is the "writes ignored" behavior documented?  I can't find anything in
the APM that defines write behavior.

>  		pmu->global_status = data;
>  		return 0;
>  	case MSR_CORE_PERF_GLOBAL_CTRL:
>  		if (!kvm_valid_perf_global_ctrl(pmu, data))
>  			return 1;
> -
> +		fallthrough;

This _definitely_ needs a comment.  Hmm, and I would prefer to reverse these,
i.e.

	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
		data &= ~pmu->global_ctrl_mask;
		fallthrough;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (!kvm_valid_perf_global_ctrl(pmu, data))
			return 1;

It's a bit arbitrary, but either Intel or AMD is going to end up with extra
code, and IMO skipping a validity check is more alarming than skipping the
clearing of reserved bits, i.e. will look like a bug to future readers.

> +	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
> +		data &= ~pmu->global_ctrl_mask;
>  		if (pmu->global_ctrl != data) {
>  			diff = pmu->global_ctrl ^ data;
>  			pmu->global_ctrl = data;
> @@ -616,7 +625,8 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>  	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
>  		if (data & pmu->global_ovf_ctrl_mask)
>  			return 1;
> -
> +		fallthrough;

Here too.  Argh, the APM doesn't actually define what happens on reserved
bits, it just says "WO".  I vote to be conservative and ignore writes to
reserved bits.  And then we can have one comment for the whole block, e.g.

	/*
	 * Note, AMD ignores writes to read-only PMU MSRs/bits, whereas Intel
	 * generates #GP on attempts to write reserved bits or RO MSRs.
	 */
	switch (msr) {
	case MSR_CORE_PERF_GLOBAL_STATUS:
		if (!msr_info->host_initiated)
			return 1; /* RO MSR */
		fallthrough;
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
		if (!msr_info->host_initiated)
			break;

		pmu->global_status = data;
		break;
	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
		data &= ~pmu->global_ctrl_mask;
		fallthrough;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (!kvm_valid_perf_global_ctrl(pmu, data))
			return 1;

		if (pmu->global_ctrl != data) {
			diff = pmu->global_ctrl ^ data;
			pmu->global_ctrl = data;
			reprogram_counters(pmu, diff);
		}
		break;
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
		fallthrough;
	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
		if (data & pmu->global_ovf_ctrl_mask)
			return 1;

		if (!msr_info->host_initiated)
			pmu->global_status &= ~data;
		break;
	default:
		kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
		return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info);
	}

	return 0;

> @@ -164,20 +181,34 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>  static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
>  {
>  	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> +	struct kvm_cpuid_entry2 *entry;
> +	union cpuid_0x80000022_ebx ebx;
>
> -	if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE))
> +	pmu->version = 1;
> +	if (guest_cpuid_has(vcpu, X86_FEATURE_PERFMON_V2)) {
> +		pmu->version = 2;
> +		entry = kvm_find_cpuid_entry_index(vcpu, 0x80000022, 0);

No need for the intermediate "entry".

> +		ebx.full = entry->ebx;

Oof, at first glance this looks like a potential null-pointer deref bug.  I
believe we can do

		/*
		 * Note, PERFMON_V2 is also in 0x80000022.0x0, i.e. the guest
		 * CPUID entry is guaranteed to be non-NULL.
		 */
		BUILD_BUG_ON(x86_feature_cpuid(X86_FEATURE_PERFMON_V2).function != 0x80000022 ||
			     x86_feature_cpuid(X86_FEATURE_PERFMON_V2).index);

		ebx.full = kvm_find_cpuid_entry_index(vcpu, 0x80000022, 0)->ebx;
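As an aside, for anyone that hasn't stared at the reverse-CPUID code: the
reason the BUILD_BUG_ON makes the deref safe is that guest_cpuid_has()
consults exactly the (function, index) leaf that the feature bit is defined
in.  A standalone sketch of that invariant (toy types and names, not KVM's
actual definitions):

	#include <stdbool.h>
	#include <stddef.h>
	#include <stdint.h>

	struct cpuid_entry {
		uint32_t function, index;
		uint32_t eax, ebx, ecx, edx;
	};

	/* PerfMonV2 is enumerated by CPUID.0x80000022.0x0:EAX[0]. */
	#define PERFMON_V2_FUNCTION	0x80000022u
	#define PERFMON_V2_INDEX	0u
	#define PERFMON_V2_BIT		(1u << 0)

	const struct cpuid_entry *find_entry(const struct cpuid_entry *tbl,
					     size_t nr, uint32_t fn, uint32_t idx)
	{
		for (size_t i = 0; i < nr; i++) {
			if (tbl[i].function == fn && tbl[i].index == idx)
				return &tbl[i];
		}
		return NULL;
	}

	bool guest_has_perfmon_v2(const struct cpuid_entry *tbl, size_t nr)
	{
		const struct cpuid_entry *e;

		e = find_entry(tbl, nr, PERFMON_V2_FUNCTION, PERFMON_V2_INDEX);
		return e && (e->eax & PERFMON_V2_BIT);
	}

Because guest_has_perfmon_v2() and the later ->ebx read consult the same
(function, index), a 'true' return proves the second lookup can't return
NULL; the BUILD_BUG_ON just pins down at compile time that the hardcoded
0x80000022/0 literals match where the feature actually lives.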
> +		pmu->nr_arch_gp_counters = min_t(unsigned int,
> +						 ebx.split.num_core_pmc,
> +						 kvm_pmu_cap.num_counters_gp);
> +	} else if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) {
>  		pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS_CORE;

This needs to be sanitized, no?  E.g. if KVM only has access to 4 counters,
but userspace sets X86_FEATURE_PERFCTR_CORE anyways.

Hrm, unless I'm missing something, that's a pre-existing bug.  If I'm right,
can you add a patch to cap nr_arch_gp_counters at kvm_pmu_cap.num_counters_gp
in the common flow, i.e. after this if-else block?  Then there is no change
needed in this patch, e.g. we'll naturally end up with:

	union cpuid_0x80000022_ebx ebx;

	pmu->version = 1;
	if (guest_cpuid_has(vcpu, X86_FEATURE_PERFMON_V2)) {
		pmu->version = 2;
		/*
		 * Note, PERFMON_V2 is also in 0x80000022.0x0, i.e. the guest
		 * CPUID entry is guaranteed to be non-NULL.
		 */
		BUILD_BUG_ON(x86_feature_cpuid(X86_FEATURE_PERFMON_V2).function != 0x80000022 ||
			     x86_feature_cpuid(X86_FEATURE_PERFMON_V2).index);
		ebx.full = kvm_find_cpuid_entry_index(vcpu, 0x80000022, 0)->ebx;
		pmu->nr_arch_gp_counters = ebx.split.num_core_pmc;
	} else if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) {
		pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS_CORE;
	} else {
		pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS;
	}

	pmu->nr_arch_gp_counters = min_t(unsigned int,
					 pmu->nr_arch_gp_counters,
					 kvm_pmu_cap.num_counters_gp);
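To make the sanitization concern concrete, here's a toy userspace model of
the clamped flow (only AMD64_NUM_COUNTERS=4 and AMD64_NUM_COUNTERS_CORE=6
are the real perf_event.h values; everything else is made up for
illustration):

	#include <stdio.h>

	#define AMD64_NUM_COUNTERS	4
	#define AMD64_NUM_COUNTERS_CORE	6

	unsigned int refresh_nr_gp_counters(int has_perfctr_core,
					    unsigned int host_nr_counters)
	{
		unsigned int nr = has_perfctr_core ? AMD64_NUM_COUNTERS_CORE :
						     AMD64_NUM_COUNTERS;

		/* The common clamp: never exceed what the host PMU provides. */
		if (nr > host_nr_counters)
			nr = host_nr_counters;
		return nr;
	}

	int main(void)
	{
		/* Userspace sets PERFCTR_CORE, but the host only has 4 GP counters. */
		printf("%u\n", refresh_nr_gp_counters(1, 4));	/* prints 4, not 6 */
		return 0;
	}

With the clamp in the common path, the PERFMON_V2 case doesn't need its own
min_t() and the PERFCTR_CORE case stops over-promising counters that KVM
can't actually back.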