Use a per-CPU pointer to track perf's guest callbacks so that KVM can set
the callbacks more precisely and avoid a lurking NULL pointer dereference.

On x86, KVM supports being built as a module and thus can be unloaded.
And because the shared callbacks are referenced from IRQ/NMI context,
unloading KVM can run concurrently with perf, and thus all of perf's
checks for a NULL perf_guest_cbs are flawed as perf_guest_cbs could be
nullified between the check and dereference.

In practice, this has not been problematic because the callbacks are
always guarded with a "perf_guest_cbs && perf_guest_cbs->is_in_guest()"
pattern, and it's extremely unlikely the compiler will choose to reload
perf_guest_cbs in that particular sequence.  Because is_in_guest() is
obviously true only when KVM is running a guest, perf always wins the
race to the guarded code (which does often reload perf_guest_cbs) as
KVM has to stop running all guests and do a heavy teardown before
unloading.

Cc: Zhu Lingshan <lingshan.zhu@xxxxxxxxx>
Signed-off-by: Sean Christopherson <seanjc@xxxxxxxxxx>
---
 arch/arm64/kernel/perf_callchain.c | 18 ++++++++++++------
 arch/x86/events/core.c             | 17 +++++++++++------
 arch/x86/events/intel/core.c       |  8 +++++---
 include/linux/perf_event.h         |  2 +-
 kernel/events/core.c               | 12 +++++++++---
 5 files changed, 38 insertions(+), 19 deletions(-)

diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c
index 4a72c2727309..38555275c6a2 100644
--- a/arch/arm64/kernel/perf_callchain.c
+++ b/arch/arm64/kernel/perf_callchain.c
@@ -102,7 +102,9 @@ compat_user_backtrace(struct compat_frame_tail __user *tail,
 void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
			 struct pt_regs *regs)
 {
-	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+	struct perf_guest_info_callbacks *guest_cbs = this_cpu_read(perf_guest_cbs);
+
+	if (guest_cbs && guest_cbs->is_in_guest()) {
 		/* We don't support guest os callchain now */
 		return;
 	}
@@ -147,9 +149,10 @@ static bool callchain_trace(void *data, unsigned long pc)
 void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
			   struct pt_regs *regs)
 {
+	struct perf_guest_info_callbacks *guest_cbs = this_cpu_read(perf_guest_cbs);
 	struct stackframe frame;
 
-	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+	if (guest_cbs && guest_cbs->is_in_guest()) {
 		/* We don't support guest os callchain now */
 		return;
 	}
@@ -160,18 +163,21 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
 
 unsigned long perf_instruction_pointer(struct pt_regs *regs)
 {
-	if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
-		return perf_guest_cbs->get_guest_ip();
+	struct perf_guest_info_callbacks *guest_cbs = this_cpu_read(perf_guest_cbs);
+
+	if (guest_cbs && guest_cbs->is_in_guest())
+		return guest_cbs->get_guest_ip();
 
 	return instruction_pointer(regs);
 }
 
 unsigned long perf_misc_flags(struct pt_regs *regs)
 {
+	struct perf_guest_info_callbacks *guest_cbs = this_cpu_read(perf_guest_cbs);
 	int misc = 0;
 
-	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
-		if (perf_guest_cbs->is_user_mode())
+	if (guest_cbs && guest_cbs->is_in_guest()) {
+		if (guest_cbs->is_user_mode())
 			misc |= PERF_RECORD_MISC_GUEST_USER;
 		else
 			misc |= PERF_RECORD_MISC_GUEST_KERNEL;
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 1eb45139fcc6..34155a52e498 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2761,10 +2761,11 @@ static bool perf_hw_regs(struct pt_regs *regs)
 void
 perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
+	struct perf_guest_info_callbacks *guest_cbs = this_cpu_read(perf_guest_cbs);
 	struct unwind_state state;
 	unsigned long addr;
 
-	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+	if (guest_cbs && guest_cbs->is_in_guest()) {
 		/* TODO: We don't support guest os callchain now */
 		return;
 	}
@@ -2864,10 +2865,11 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *ent
 void
 perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
+	struct perf_guest_info_callbacks *guest_cbs = this_cpu_read(perf_guest_cbs);
 	struct stack_frame frame;
 	const struct stack_frame __user *fp;
 
-	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+	if (guest_cbs && guest_cbs->is_in_guest()) {
 		/* TODO: We don't support guest os callchain now */
 		return;
 	}
@@ -2944,18 +2946,21 @@ static unsigned long code_segment_base(struct pt_regs *regs)
 
 unsigned long perf_instruction_pointer(struct pt_regs *regs)
 {
-	if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
-		return perf_guest_cbs->get_guest_ip();
+	struct perf_guest_info_callbacks *guest_cbs = this_cpu_read(perf_guest_cbs);
+
+	if (guest_cbs && guest_cbs->is_in_guest())
+		return guest_cbs->get_guest_ip();
 
 	return regs->ip + code_segment_base(regs);
 }
 
 unsigned long perf_misc_flags(struct pt_regs *regs)
 {
+	struct perf_guest_info_callbacks *guest_cbs = this_cpu_read(perf_guest_cbs);
 	int misc = 0;
 
-	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
-		if (perf_guest_cbs->is_user_mode())
+	if (guest_cbs && guest_cbs->is_in_guest()) {
+		if (guest_cbs->is_user_mode())
 			misc |= PERF_RECORD_MISC_GUEST_USER;
 		else
 			misc |= PERF_RECORD_MISC_GUEST_KERNEL;
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index fca7a6e2242f..96001962c24d 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2784,6 +2784,7 @@ static void intel_pmu_reset(void)
 
 static int handle_pmi_common(struct pt_regs *regs, u64 status)
 {
+	struct perf_guest_info_callbacks *guest_cbs;
 	struct perf_sample_data data;
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	int bit;
@@ -2852,9 +2853,10 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
 	 */
 	if (__test_and_clear_bit(GLOBAL_STATUS_TRACE_TOPAPMI_BIT, (unsigned long *)&status)) {
 		handled++;
-		if (unlikely(perf_guest_cbs && perf_guest_cbs->is_in_guest() &&
-			     perf_guest_cbs->handle_intel_pt_intr))
-			perf_guest_cbs->handle_intel_pt_intr();
+		guest_cbs = this_cpu_read(perf_guest_cbs);
+		if (unlikely(guest_cbs && guest_cbs->is_in_guest() &&
+			     guest_cbs->handle_intel_pt_intr))
+			guest_cbs->handle_intel_pt_intr();
 		else
 			intel_pt_interrupt();
 	}
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 5eab690622ca..c98253dae037 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1237,7 +1237,7 @@ extern void perf_event_bpf_event(struct bpf_prog *prog, u16 flags);
 
 #ifdef CONFIG_HAVE_GUEST_PERF_EVENTS
-extern struct perf_guest_info_callbacks *perf_guest_cbs;
+DECLARE_PER_CPU(struct perf_guest_info_callbacks *, perf_guest_cbs);
 extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
 extern void perf_unregister_guest_info_callbacks(void);
 #endif /* CONFIG_HAVE_GUEST_PERF_EVENTS */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9820df7ee455..9bc1375d6ed9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6483,17 +6483,23 @@ static void perf_pending_event(struct irq_work *entry)
 }
 
 #ifdef CONFIG_HAVE_GUEST_PERF_EVENTS
-struct perf_guest_info_callbacks *perf_guest_cbs;
+DEFINE_PER_CPU(struct perf_guest_info_callbacks *, perf_guest_cbs);
 
 void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
 {
-	perf_guest_cbs = cbs;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		per_cpu(perf_guest_cbs, cpu) = cbs;
 }
 EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
 
 void perf_unregister_guest_info_callbacks(void)
 {
-	perf_guest_cbs = NULL;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		per_cpu(perf_guest_cbs, cpu) = NULL;
 }
 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
 #endif
-- 
2.33.0.259.gc128427fd7-goog
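
For readers unfamiliar with the hazard the changelog describes, below is a
stand-alone user-space sketch of the "check the shared pointer, then call
through it" pattern versus the single-snapshot pattern the patch moves to.
It is not kernel code and every name in it (demo_cbs, get_ip_racy,
get_ip_snapshot, registered_cbs) is illustrative, not taken from the kernel.

/*
 * Minimal user-space sketch, not kernel code.  With a bare shared pointer,
 * nothing stops the compiler from reloading the global between the NULL
 * check and a dereference, which is the window a concurrent unregister
 * could hit.  Copying the pointer into a local once, as the patch does via
 * this_cpu_read(perf_guest_cbs), makes the check and every call operate on
 * the same value.
 */
#include <stddef.h>
#include <stdio.h>

struct guest_cbs {
	int (*is_in_guest)(void);
	unsigned long (*get_guest_ip)(void);
};

static int demo_in_guest(void)           { return 1; }
static unsigned long demo_guest_ip(void) { return 0xffffffff81000000UL; }

static struct guest_cbs demo_cbs = {
	.is_in_guest  = demo_in_guest,
	.get_guest_ip = demo_guest_ip,
};

/* Shared registration slot, the stand-in for perf_guest_cbs. */
static struct guest_cbs *registered_cbs = &demo_cbs;

/*
 * Racy shape: each mention of registered_cbs is a separate access, so a
 * concurrent "registered_cbs = NULL" can land between the check and either
 * dereference.
 */
static unsigned long get_ip_racy(unsigned long host_ip)
{
	if (registered_cbs && registered_cbs->is_in_guest())
		return registered_cbs->get_guest_ip();
	return host_ip;
}

/*
 * Safe shape: take one snapshot and use it for the check and every call,
 * mirroring "guest_cbs = this_cpu_read(perf_guest_cbs)" in the patch.
 */
static unsigned long get_ip_snapshot(unsigned long host_ip)
{
	struct guest_cbs *cbs = registered_cbs;

	if (cbs && cbs->is_in_guest())
		return cbs->get_guest_ip();
	return host_ip;
}

int main(void)
{
	printf("racy    : %#lx\n", get_ip_racy(0x400000UL));
	printf("snapshot: %#lx\n", get_ip_snapshot(0x400000UL));

	/* "Unregister" the callbacks, as a KVM unload would. */
	registered_cbs = NULL;
	printf("snapshot after unregister: %#lx\n", get_ip_snapshot(0x400000UL));
	return 0;
}

Note that the snapshot only closes the reload window; the unregister path is
still only safe because, as the changelog says, KVM stops running all guests
(so is_in_guest() goes false) and does its teardown before the callbacks are
cleared and the module goes away.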