On Thu, Sep 02, 2021 at 09:57:04AM -0700, Song Liu wrote:

> +static int
> +intel_pmu_snapshot_branch_stack(struct perf_branch_entry *entries, unsigned int cnt)
> +{
> +	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
> +
> +	intel_pmu_disable_all();
> +	intel_pmu_lbr_read();
> +	cnt = min_t(unsigned int, cnt, x86_pmu.lbr_nr);
> +
> +	memcpy(entries, cpuc->lbr_entries, sizeof(struct perf_branch_entry) * cnt);
> +	intel_pmu_enable_all(0);
> +	return cnt;
> +}

Would something like the below help get rid of that memcpy()?

(compile tested only)

---
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 7011e87be6d0..c508ba6099d2 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2938,7 +2938,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
 
 	loops = 0;
 again:
-	intel_pmu_lbr_read();
+	intel_pmu_lbr_read(&cpuc->lbr_stack);
 	intel_pmu_ack_status(status);
 	if (++loops > 100) {
 		static bool warned;
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 9e6d6eaeb4cb..0a5bacba86c2 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -170,7 +170,7 @@ enum {
 
 #define ARCH_LBR_CTL_MASK		0x7f000e
 
-static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
+static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc, struct perf_branch_stack *bs);
 
 static __always_inline bool is_lbr_call_stack_bit_set(u64 config)
 {
@@ -783,7 +783,7 @@ void intel_pmu_lbr_disable_all(void)
 		__intel_pmu_lbr_disable();
 }
 
-void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
+void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc, struct perf_branch_stack *bs)
 {
 	unsigned long mask = x86_pmu.lbr_nr - 1;
 	u64 tos = intel_pmu_lbr_tos();
@@ -801,18 +801,18 @@ void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
 
 		rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
 
-		cpuc->lbr_entries[i].from	= msr_lastbranch.from;
-		cpuc->lbr_entries[i].to		= msr_lastbranch.to;
-		cpuc->lbr_entries[i].mispred	= 0;
-		cpuc->lbr_entries[i].predicted	= 0;
-		cpuc->lbr_entries[i].in_tx	= 0;
-		cpuc->lbr_entries[i].abort	= 0;
-		cpuc->lbr_entries[i].cycles	= 0;
-		cpuc->lbr_entries[i].type	= 0;
-		cpuc->lbr_entries[i].reserved	= 0;
+		bs->entries[i].from	= msr_lastbranch.from;
+		bs->entries[i].to	= msr_lastbranch.to;
+		bs->entries[i].mispred	= 0;
+		bs->entries[i].predicted = 0;
+		bs->entries[i].in_tx	= 0;
+		bs->entries[i].abort	= 0;
+		bs->entries[i].cycles	= 0;
+		bs->entries[i].type	= 0;
+		bs->entries[i].reserved	= 0;
 	}
-	cpuc->lbr_stack.nr = i;
-	cpuc->lbr_stack.hw_idx = tos;
+	bs->nr = i;
+	bs->hw_idx = tos;
 }
 
 /*
@@ -820,7 +820,7 @@ void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
  * is the same as the linear address, allowing us to merge the LIP and EIP
  * LBR formats.
  */
-void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
+void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc, struct perf_branch_stack *bs)
 {
 	bool need_info = false, call_stack = false;
 	unsigned long mask = x86_pmu.lbr_nr - 1;
@@ -896,19 +896,19 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 		if (abort && x86_pmu.lbr_double_abort && out > 0)
 			out--;
 
-		cpuc->lbr_entries[out].from	= from;
-		cpuc->lbr_entries[out].to	= to;
-		cpuc->lbr_entries[out].mispred	= mis;
-		cpuc->lbr_entries[out].predicted = pred;
-		cpuc->lbr_entries[out].in_tx	= in_tx;
-		cpuc->lbr_entries[out].abort	= abort;
-		cpuc->lbr_entries[out].cycles	= cycles;
-		cpuc->lbr_entries[out].type	= 0;
-		cpuc->lbr_entries[out].reserved	= 0;
+		bs->entries[out].from		= from;
+		bs->entries[out].to		= to;
+		bs->entries[out].mispred	= mis;
+		bs->entries[out].predicted	= pred;
+		bs->entries[out].in_tx		= in_tx;
+		bs->entries[out].abort		= abort;
+		bs->entries[out].cycles		= cycles;
+		bs->entries[out].type		= 0;
+		bs->entries[out].reserved	= 0;
 		out++;
 	}
-	cpuc->lbr_stack.nr = out;
-	cpuc->lbr_stack.hw_idx = tos;
+	bs->nr = out;
+	bs->hw_idx = tos;
 }
 
 static __always_inline int get_lbr_br_type(u64 info)
@@ -945,7 +945,8 @@ static __always_inline u16 get_lbr_cycles(u64 info)
 }
 
 static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc,
-				struct lbr_entry *entries)
+				struct lbr_entry *entries,
+				struct perf_branch_stack *bs)
 {
 	struct perf_branch_entry *e;
 	struct lbr_entry *lbr;
@@ -954,7 +955,7 @@ static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc,
 
 	for (i = 0; i < x86_pmu.lbr_nr; i++) {
 		lbr = entries ? &entries[i] : NULL;
-		e = &cpuc->lbr_entries[i];
+		e = &bs->entries[i];
 
 		from = rdlbr_from(i, lbr);
 		/*
@@ -977,28 +978,28 @@ static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc,
 		e->reserved	= 0;
 	}
 
-	cpuc->lbr_stack.nr = i;
+	bs->nr = i;
 }
 
-static void intel_pmu_arch_lbr_read(struct cpu_hw_events *cpuc)
+static void intel_pmu_arch_lbr_read(struct cpu_hw_events *cpuc, struct perf_branch_stack *bs)
 {
-	intel_pmu_store_lbr(cpuc, NULL);
+	intel_pmu_store_lbr(cpuc, NULL, bs);
 }
 
-static void intel_pmu_arch_lbr_read_xsave(struct cpu_hw_events *cpuc)
+static void intel_pmu_arch_lbr_read_xsave(struct cpu_hw_events *cpuc, struct perf_branch_stack *bs)
 {
 	struct x86_perf_task_context_arch_lbr_xsave *xsave = cpuc->lbr_xsave;
 
 	if (!xsave) {
-		intel_pmu_store_lbr(cpuc, NULL);
+		intel_pmu_store_lbr(cpuc, NULL, bs);
 		return;
 	}
 	xsaves(&xsave->xsave, XFEATURE_MASK_LBR);
 
-	intel_pmu_store_lbr(cpuc, xsave->lbr.entries);
+	intel_pmu_store_lbr(cpuc, xsave->lbr.entries, bs);
 }
 
-void intel_pmu_lbr_read(void)
+void intel_pmu_lbr_read(struct perf_branch_stack *bs)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 
@@ -1012,9 +1013,8 @@ void intel_pmu_lbr_read(void)
 	    cpuc->lbr_users == cpuc->lbr_pebs_users)
 		return;
 
-	x86_pmu.lbr_read(cpuc);
-
-	intel_pmu_lbr_filter(cpuc);
+	x86_pmu.lbr_read(cpuc, bs);
+	intel_pmu_lbr_filter(cpuc, bs);
 }
 
 /*
@@ -1402,7 +1402,7 @@ static const int arch_lbr_br_type_map[ARCH_LBR_BR_TYPE_MAP_MAX] = {
  * in PERF_SAMPLE_BRANCH_STACK sample may vary.
  */
 static void
-intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
+intel_pmu_lbr_filter(struct cpu_hw_events *cpuc, struct perf_branch_stack *bs)
 {
 	u64 from, to;
 	int br_sel = cpuc->br_sel;
@@ -1414,11 +1414,11 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
 	    ((br_sel & X86_BR_TYPE_SAVE) != X86_BR_TYPE_SAVE))
 		return;
 
-	for (i = 0; i < cpuc->lbr_stack.nr; i++) {
+	for (i = 0; i < bs->nr; i++) {
 
-		from = cpuc->lbr_entries[i].from;
-		to = cpuc->lbr_entries[i].to;
-		type = cpuc->lbr_entries[i].type;
+		from = bs->entries[i].from;
+		to = bs->entries[i].to;
+		type = bs->entries[i].type;
 
 		/*
 		 * Parse the branch type recorded in LBR_x_INFO MSR.
@@ -1430,9 +1430,9 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
 			to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER;
 			type = arch_lbr_br_type_map[type] | to_plm;
 		} else
-			type = branch_type(from, to, cpuc->lbr_entries[i].abort);
+			type = branch_type(from, to, bs->entries[i].abort);
 		if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) {
-			if (cpuc->lbr_entries[i].in_tx)
+			if (bs->entries[i].in_tx)
 				type |= X86_BR_IN_TX;
 			else
 				type |= X86_BR_NO_TX;
@@ -1440,25 +1440,25 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
 
 		/* if type does not correspond, then discard */
 		if (type == X86_BR_NONE || (br_sel & type) != type) {
-			cpuc->lbr_entries[i].from = 0;
+			bs->entries[i].from = 0;
 			compress = true;
 		}
 
 		if ((br_sel & X86_BR_TYPE_SAVE) == X86_BR_TYPE_SAVE)
-			cpuc->lbr_entries[i].type = common_branch_type(type);
+			bs->entries[i].type = common_branch_type(type);
 	}
 
 	if (!compress)
 		return;
 
 	/* remove all entries with from=0 */
-	for (i = 0; i < cpuc->lbr_stack.nr; ) {
-		if (!cpuc->lbr_entries[i].from) {
+	for (i = 0; i < bs->nr; ) {
+		if (!bs->entries[i].from) {
 			j = i;
-			while (++j < cpuc->lbr_stack.nr)
-				cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j];
-			cpuc->lbr_stack.nr--;
-			if (!cpuc->lbr_entries[i].from)
+			while (++j < bs->nr)
+				bs->entries[j-1] = bs->entries[j];
+			bs->nr--;
+			if (!bs->entries[i].from)
 				continue;
 		}
 		i++;
@@ -1476,8 +1476,8 @@ void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr)
 	else
 		cpuc->lbr_stack.hw_idx = intel_pmu_lbr_tos();
 
-	intel_pmu_store_lbr(cpuc, lbr);
-	intel_pmu_lbr_filter(cpuc);
+	intel_pmu_store_lbr(cpuc, lbr, &cpuc->lbr_stack);
+	intel_pmu_lbr_filter(cpuc, &cpuc->lbr_stack);
 }
 
 /*
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index e3ac05c97b5e..9700fb1f47c3 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -852,7 +852,7 @@ struct x86_pmu {
 	unsigned int	lbr_br_type:1;
 
 	void		(*lbr_reset)(void);
-	void		(*lbr_read)(struct cpu_hw_events *cpuc);
+	void		(*lbr_read)(struct cpu_hw_events *cpuc, struct perf_branch_stack *bs);
 	void		(*lbr_save)(void *ctx);
 	void		(*lbr_restore)(void *ctx);
 
@@ -1345,11 +1345,11 @@ void intel_pmu_lbr_enable_all(bool pmi);
 
 void intel_pmu_lbr_disable_all(void);
 
-void intel_pmu_lbr_read(void);
+void intel_pmu_lbr_read(struct perf_branch_stack *bs);
 
-void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc);
+void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc, struct perf_branch_stack *bs);
 
-void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc);
+void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc, struct perf_branch_stack *bs);
 
 void intel_pmu_lbr_save(void *ctx);
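
FWIW, with the refactor above, the snapshot callback from the quoted patch
could then read straight into the caller's buffer instead of bouncing
through cpuc->lbr_entries. A minimal sketch, assuming the snapshot
interface were changed to hand in a struct perf_branch_stack instead of a
bare entries array -- that signature change is my assumption, not part of
the diff above:

static int
intel_pmu_snapshot_branch_stack(struct perf_branch_stack *br, unsigned int cnt)
{
	/*
	 * intel_pmu_lbr_read() stores up to x86_pmu.lbr_nr entries and
	 * may return early without touching @br at all, so initialize
	 * the count and insist on enough room up front; clamping the
	 * read would reintroduce a copy.
	 */
	br->nr = 0;
	if (cnt < x86_pmu.lbr_nr)
		return 0;

	intel_pmu_disable_all();
	intel_pmu_lbr_read(br);		/* reads and filters into br->entries[] */
	intel_pmu_enable_all(0);

	return br->nr;
}

With the original (struct perf_branch_entry *entries, unsigned int cnt)
signature there is nowhere to put br->nr / br->hw_idx next to the caller's
buffer, so either the memcpy() stays or the interface changes along the
lines of this (hypothetical) one.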