On some boots the read of MSR_PP1_ENERGY_STATUS msr returns 0, causing perf_msr_probe() to make the power/events/energy-gpu event non-visible. When that happens, the msr always read 0 until the graphics module (i915 for Meteor Lake, xe for Lunar Lake) is loaded. Then it starts returning something different and re-loading the rapl module "fixes" it. This is tested on the following platforms with the fail rates before this patch: Alder Lake S 0/20 Arrow Lake H 0/20 Lunar Lake M 8/20 Meteor Lake U 6/20 Raptor Lake P 4/20 Raptor Lake S 0/20 For those platforms failing, use a separate msr list with .no_check set so it doesn't check the runtime value to create the event - it will just return 0 until the i915/xe module initializes the GPU. The issue https://github.com/ulissesf/qmassa/issues/4 is workarounded by reading the MSR directly since it works after xe is loaded, but the issue with not having the perf event is still there. Closes: https://github.com/ulissesf/qmassa/issues/4 Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/4241 Signed-off-by: Lucas De Marchi <lucas.demarchi@xxxxxxxxx --- Maybe a clearer alternative is to just move all the platforms after RAPTORLAKE with a gpu to use the new msr list. arch/x86/events/rapl.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 4952faf03e82d..18e324b8fa82c 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -588,6 +588,14 @@ static struct perf_msr intel_rapl_spr_msrs[] = { [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr, true, RAPL_MSR_MASK }, }; +static struct perf_msr intel_rapl_mtl_msrs[] = { + [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_PP1] = { MSR_PP1_ENERGY_STATUS, &rapl_events_gpu_group, test_msr, true, RAPL_MSR_MASK }, + [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr, false, RAPL_MSR_MASK }, +}; + /* * Force to PERF_RAPL_PKG_EVENTS_MAX size due to: * - perf_msr_probe(PERF_RAPL_PKG_EVENTS_MAX) @@ -826,6 +834,16 @@ static struct rapl_model model_spr = { .rapl_pkg_msrs = intel_rapl_spr_msrs, }; +static struct rapl_model model_rpl = { + .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM) | + BIT(PERF_RAPL_PP1) | + BIT(PERF_RAPL_PSYS), + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_pkg_msrs = intel_rapl_mtl_msrs, +}; + static struct rapl_model model_amd_hygon = { .pkg_events = BIT(PERF_RAPL_PKG), .core_events = BIT(PERF_RAPL_CORE), @@ -873,13 +891,13 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = { X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &model_spr), X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &model_spr), X86_MATCH_VFM(INTEL_RAPTORLAKE, &model_skl), - X86_MATCH_VFM(INTEL_RAPTORLAKE_P, &model_skl), + X86_MATCH_VFM(INTEL_RAPTORLAKE_P, &model_rpl), X86_MATCH_VFM(INTEL_RAPTORLAKE_S, &model_skl), - X86_MATCH_VFM(INTEL_METEORLAKE, &model_skl), - X86_MATCH_VFM(INTEL_METEORLAKE_L, &model_skl), + X86_MATCH_VFM(INTEL_METEORLAKE, &model_rpl), + X86_MATCH_VFM(INTEL_METEORLAKE_L, &model_rpl), X86_MATCH_VFM(INTEL_ARROWLAKE_H, &model_skl), X86_MATCH_VFM(INTEL_ARROWLAKE, &model_skl), - X86_MATCH_VFM(INTEL_LUNARLAKE_M, &model_skl), + X86_MATCH_VFM(INTEL_LUNARLAKE_M, &model_rpl), {}, }; MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); -- 2.48.1